biblicus-1.0.0-py3-none-any.whl → biblicus-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
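Taken together, the file list shows this release is mostly a vocabulary change rather than new behaviour: the backends package becomes retrievers, retrieval and extraction runs become snapshots, and recipes become configurations, alongside a new retrievers/hybrid.py. Below is a minimal before/after sketch of that rename at the Python level, inferred only from the import and call-site changes in the cli.py diff that follows; the corpus location, the "scan" retriever choice, and the empty configuration are illustrative assumptions, not values taken from this diff.

# Hypothetical before/after sketch of the 1.0.0 -> 1.1.0 rename, inferred from
# the imports and call sites changed in cli.py below; paths and values are placeholders.
from pathlib import Path

from biblicus.corpus import Corpus
from biblicus.retrievers import get_retriever  # 1.0.0: from biblicus.backends import get_backend

corpus = Corpus.find(Path.cwd())
retriever = get_retriever("scan")  # 1.0.0: get_backend("scan")

# 1.0.0: run = backend.build_run(corpus, recipe_name="default", config={})
snapshot = retriever.build_snapshot(corpus, configuration_name="default", configuration={})
print(snapshot.model_dump_json(indent=2))

# Querying follows the same pattern (budget construction omitted here):
# 1.0.0: backend.query(corpus, run=run, query_text="...", budget=budget)
# 1.1.0: retriever.query(corpus, snapshot=snapshot, query_text="...", budget=budget)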
biblicus/cli.py CHANGED
@@ -13,7 +13,6 @@ from typing import Dict, Iterable, List, Optional
 from pydantic import ValidationError

 from .analysis import get_analysis_backend
-from .backends import get_backend
 from .context import (
     CharacterBudget,
     ContextPackPolicy,
@@ -24,16 +23,17 @@ from .context import (
 )
 from .corpus import Corpus
 from .crawl import CrawlRequest, crawl_into_corpus
-from .errors import ExtractionRunFatalError, IngestCollisionError
-from .evaluation import evaluate_run, load_dataset
+from .errors import ExtractionSnapshotFatalError, IngestCollisionError
+from .evaluation import evaluate_snapshot, load_dataset
 from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
-from .extraction import build_extraction_run
+from .extraction import build_extraction_snapshot
 from .extraction_evaluation import (
-    evaluate_extraction_run,
+    evaluate_extraction_snapshot,
     load_extraction_dataset,
     write_extraction_evaluation_result,
 )
-from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
+from .models import QueryBudget, RetrievalResult, parse_extraction_snapshot_reference
+from .retrievers import get_retriever
 from .uris import corpus_ref_to_path


@@ -391,48 +391,56 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:

 def cmd_build(arguments: argparse.Namespace) -> int:
     """
-    Build a retrieval run for a backend.
+    Build a retrieval snapshot for a retriever.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    backend = get_backend(arguments.backend)
+    retriever = get_retriever(arguments.retriever)

     base_config: Dict[str, object] = {}
-    if getattr(arguments, "recipe", None):
-        base_config = load_recipe_view(
-            arguments.recipe,
-            recipe_label="Recipe file",
-            mapping_error_message="Retrieval build recipe must be a mapping/object",
+    if getattr(arguments, "configuration", None):
+        base_config = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Retrieval snapshot configuration must be a mapping/object",
         )

-    overrides = parse_dotted_overrides(arguments.config)
-    config = apply_dotted_overrides(base_config, overrides)
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration = apply_dotted_overrides(base_config, overrides)

-    run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
-    print(run.model_dump_json(indent=2))
+    snapshot = retriever.build_snapshot(
+        corpus,
+        configuration_name=arguments.configuration_name,
+        configuration=configuration,
+    )
+    print(snapshot.model_dump_json(indent=2))
     return 0


 def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
-    Build a text extraction run for the corpus using a pipeline of extractors.
+    Build a text extraction snapshot for the corpus using a pipeline of extractors.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-    from .recipes import load_recipe_view
+    from .configuration import load_configuration_view

     corpus = (
         Corpus.open(arguments.corpus)
@@ -440,17 +448,17 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )

-    # Load recipe from file if --recipe is provided
-    if getattr(arguments, "recipe", None):
-        recipe_data = load_recipe_view(
-            arguments.recipe,
-            recipe_label="Recipe file",
-            mapping_error_message="Extraction recipe must be a mapping/object",
+    # Load configuration from file if --configuration is provided
+    if getattr(arguments, "configuration", None):
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Extraction configuration must be a mapping/object",
         )
-        loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
-        loaded_config = recipe_data.get("config", {})
+        loaded_extractor_id = configuration_data.get("extractor_id", "pipeline")
+        loaded_config = configuration_data.get("configuration", {})

-        # If the recipe specifies a non-pipeline extractor, wrap it in a pipeline
+        # If the configuration specifies a non-pipeline extractor, wrap it in a pipeline
         if loaded_extractor_id != "pipeline":
             extractor_id = "pipeline"
             config = {
@@ -476,11 +484,11 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         config = {"steps": steps}
         extractor_id = "pipeline"

-    manifest = build_extraction_run(
+    manifest = build_extraction_snapshot(
         corpus,
         extractor_id=extractor_id,
-        recipe_name=arguments.recipe_name,
-        config=config,
+        configuration_name=arguments.configuration_name,
+        configuration=config,
     )
     print(manifest.model_dump_json(indent=2))
     return 0
@@ -488,7 +496,7 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:

 def cmd_extract_list(arguments: argparse.Namespace) -> int:
     """
-    List extraction runs stored under the corpus.
+    List extraction snapshots stored under the corpus.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -500,14 +508,14 @@ def cmd_extract_list(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
-    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    snapshots = corpus.list_extraction_snapshots(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in snapshots], indent=2))
     return 0


 def cmd_extract_show(arguments: argparse.Namespace) -> int:
     """
-    Show an extraction run manifest.
+    Show an extraction snapshot manifest.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -519,9 +527,9 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    reference = parse_extraction_run_reference(arguments.run)
-    manifest = corpus.load_extraction_run_manifest(
-        extractor_id=reference.extractor_id, run_id=reference.run_id
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    manifest = corpus.load_extraction_snapshot_manifest(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
     )
     print(manifest.model_dump_json(indent=2))
     return 0
@@ -529,7 +537,7 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:

 def cmd_extract_delete(arguments: argparse.Namespace) -> int:
     """
-    Delete an extraction run directory and its derived artifacts.
+    Delete an extraction snapshot directory and its derived artifacts.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -541,17 +549,19 @@ def cmd_extract_delete(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.confirm != arguments.run:
-        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
-    reference = parse_extraction_run_reference(arguments.run)
-    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
-    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    if arguments.confirm != arguments.snapshot:
+        raise ValueError("Refusing to delete extraction snapshot without an exact --confirm match.")
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    corpus.delete_extraction_snapshot(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
+    )
+    print(json.dumps({"deleted": True, "snapshot": arguments.snapshot}, indent=2))
     return 0


 def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     """
-    Evaluate an extraction run against a dataset.
+    Evaluate an extraction snapshot against a dataset.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -563,14 +573,14 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.run:
-        run_ref = parse_extraction_run_reference(arguments.run)
+    if arguments.snapshot:
+        snapshot_ref = parse_extraction_snapshot_reference(arguments.snapshot)
     else:
-        run_ref = corpus.latest_extraction_run_reference()
-        if run_ref is None:
-            raise ValueError("Extraction evaluation requires an extraction run")
+        snapshot_ref = corpus.latest_extraction_snapshot_reference()
+        if snapshot_ref is None:
+            raise ValueError("Extraction evaluation requires an extraction snapshot")
         print(
-            "Warning: using latest extraction run; pass --run for reproducibility.",
+            "Warning: using latest extraction snapshot; pass --snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -582,17 +592,19 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     except ValidationError as exc:
         raise ValueError(f"Invalid extraction dataset: {exc}") from exc

-    run = corpus.load_extraction_run_manifest(
-        extractor_id=run_ref.extractor_id,
-        run_id=run_ref.run_id,
+    snapshot = corpus.load_extraction_snapshot_manifest(
+        extractor_id=snapshot_ref.extractor_id,
+        snapshot_id=snapshot_ref.snapshot_id,
     )
-    result = evaluate_extraction_run(
+    result = evaluate_extraction_snapshot(
         corpus=corpus,
-        run=run,
-        extractor_id=run_ref.extractor_id,
+        snapshot=snapshot,
+        extractor_id=snapshot_ref.extractor_id,
         dataset=dataset,
     )
-    write_extraction_evaluation_result(corpus=corpus, run_id=run.run_id, result=result)
+    write_extraction_evaluation_result(
+        corpus=corpus, snapshot_id=snapshot.snapshot_id, result=result
+    )
     print(result.model_dump_json(indent=2))
     return 0

@@ -611,18 +623,21 @@ def cmd_query(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    run_id = arguments.run or corpus.latest_run_id
-    if not run_id:
-        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
-    run = corpus.load_run(run_id)
-    if arguments.backend and arguments.backend != run.recipe.backend_id:
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
         raise ValueError(
-            f"Backend mismatch: run uses {run.recipe.backend_id!r} but {arguments.backend!r} was requested"
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
         )
-    backend = get_backend(run.recipe.backend_id)
+    snapshot = corpus.load_snapshot(snapshot_id)
+    if arguments.retriever and arguments.retriever != snapshot.configuration.retriever_id:
+        raise ValueError(
+            "Retriever mismatch: snapshot uses "
+            f"{snapshot.configuration.retriever_id!r} but {arguments.retriever!r} was requested"
+        )
+    retriever = get_retriever(snapshot.configuration.retriever_id)
     query_text = arguments.query if arguments.query is not None else sys.stdin.read()
     budget = _budget_from_args(arguments)
-    result = backend.query(corpus, run=run, query_text=query_text, budget=budget)
+    result = retriever.query(corpus, snapshot=snapshot, query_text=query_text, budget=budget)
     processed_evidence = result.evidence
     if getattr(arguments, "reranker_id", None):
         processed_evidence = apply_evidence_reranker(
@@ -693,7 +708,7 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:

 def cmd_eval(arguments: argparse.Namespace) -> int:
     """
-    Evaluate a retrieval run against a dataset.
+    Evaluate a retrieval snapshot against a dataset.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -705,13 +720,15 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    run_id = arguments.run or corpus.latest_run_id
-    if not run_id:
-        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
-    run = corpus.load_run(run_id)
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
+        raise ValueError(
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
+        )
+    snapshot = corpus.load_snapshot(snapshot_id)
     dataset = load_dataset(Path(arguments.dataset))
     budget = _budget_from_args(arguments)
-    result = evaluate_run(corpus=corpus, run=run, dataset=dataset, budget=budget)
+    result = evaluate_snapshot(corpus=corpus, snapshot=snapshot, dataset=dataset, budget=budget)
     print(result.model_dump_json(indent=2))
     return 0

@@ -751,29 +768,33 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    recipe_data = load_recipe_view(
-        arguments.recipe,
-        recipe_label="Recipe file",
-        mapping_error_message="Topic modeling recipe must be a mapping/object",
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Topic modeling configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.config)
-    recipe_data = apply_dotted_overrides(recipe_data, overrides)
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.extraction_run:
-        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-        extraction_run = corpus.latest_extraction_run_reference()
-        if extraction_run is None:
-            raise ValueError("Topic analysis requires an extraction run to supply text inputs")
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError("Topic analysis requires an extraction snapshot to supply text inputs")
         print(
-            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -781,12 +802,12 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-            recipe_name=arguments.recipe_name,
-            config=recipe_data,
-            extraction_run=extraction_run,
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid topic modeling recipe: {exc}") from exc
+        raise ValueError(f"Invalid topic modeling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -800,7 +821,11 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
@@ -808,28 +833,30 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )

-    recipe_data: dict[str, object] = {}
-    if arguments.recipe is not None:
-        recipe_data = load_recipe_view(
-            arguments.recipe,
-            recipe_label="Recipe file",
-            mapping_error_message="Profiling recipe must be a mapping/object",
+    configuration_data: dict[str, object] = {}
+    if arguments.configuration is not None:
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Profiling configuration must be a mapping/object",
         )
-        overrides = parse_dotted_overrides(arguments.config)
-        recipe_data = apply_dotted_overrides(recipe_data, overrides)
+        overrides = parse_dotted_overrides(arguments.override)
+        configuration_data = apply_dotted_overrides(configuration_data, overrides)
     else:
-        overrides = parse_dotted_overrides(arguments.config)
+        overrides = parse_dotted_overrides(arguments.override)
         if overrides:
-            recipe_data = apply_dotted_overrides(recipe_data, overrides)
+            configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.extraction_run:
-        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-        extraction_run = corpus.latest_extraction_run_reference()
-        if extraction_run is None:
-            raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Profiling analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -837,12 +864,12 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-            recipe_name=arguments.recipe_name,
-            config=recipe_data,
-            extraction_run=extraction_run,
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid profiling recipe: {exc}") from exc
+        raise ValueError(f"Invalid profiling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -856,29 +883,35 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    recipe_data = load_recipe_view(
-        arguments.recipe,
-        recipe_label="Recipe file",
-        mapping_error_message="Markov analysis recipe must be a mapping/object",
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Markov analysis configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.config)
-    recipe_data = apply_dotted_overrides(recipe_data, overrides)
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.extraction_run:
-        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-        extraction_run = corpus.latest_extraction_run_reference()
-        if extraction_run is None:
-            raise ValueError("Markov analysis requires an extraction run to supply text inputs")
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Markov analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -886,12 +919,12 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-            recipe_name=arguments.recipe_name,
-            config=recipe_data,
-            extraction_run=extraction_run,
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
        )
     except ValidationError as exc:
-        raise ValueError(f"Invalid Markov analysis recipe: {exc}") from exc
+        raise ValueError(f"Invalid Markov analysis configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -977,41 +1010,46 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_purge.set_defaults(func=cmd_purge)

-    p_build = sub.add_parser("build", help="Build a retrieval backend run for the corpus.")
+    p_build = sub.add_parser("build", help="Build a retrieval snapshot for the corpus.")
     _add_common_corpus_arg(p_build)
     p_build.add_argument(
-        "--backend",
+        "--retriever",
         required=True,
-        help="Backend identifier (for example, scan, sqlite-full-text-search).",
+        help="Retriever identifier (for example, scan, sqlite-full-text-search).",
     )
-    p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
     p_build.add_argument(
-        "--recipe",
+        "--configuration-name", default="default", help="Human-readable configuration name."
+    )
+    p_build.add_argument(
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
+        help="Path to YAML configuration file (repeatable). If provided, files are composed in precedence order.",
     )
     p_build.add_argument(
+        "--override",
         "--config",
         action="append",
         default=None,
-        help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
+        help="Configuration override as key=value (repeatable). Dotted keys create nested config mappings.",
     )
     p_build.set_defaults(func=cmd_build)

-    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
+    p_extract = sub.add_parser(
+        "extract", help="Work with text extraction snapshots for the corpus."
+    )
     extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)

-    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction snapshot.")
     _add_common_corpus_arg(p_extract_build)
     p_extract_build.add_argument(
-        "--recipe-name", default="default", help="Human-readable recipe name."
+        "--configuration-name", default="default", help="Human-readable configuration name."
     )
     p_extract_build.add_argument(
-        "--recipe",
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML recipe file. If provided, --step arguments are ignored.",
+        help="Path to YAML configuration file. If provided, --step arguments are ignored.",
     )
     p_extract_build.add_argument(
         "--step",
@@ -1021,7 +1059,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_build.set_defaults(func=cmd_extract_build)

-    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
+    p_extract_list = extract_sub.add_parser("list", help="List extraction snapshots.")
     _add_common_corpus_arg(p_extract_list)
     p_extract_list.add_argument(
         "--extractor-id",
@@ -1030,37 +1068,39 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_list.set_defaults(func=cmd_extract_list)

-    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction snapshot manifest.")
     _add_common_corpus_arg(p_extract_show)
     p_extract_show.add_argument(
-        "--run",
+        "--snapshot",
         required=True,
-        help="Extraction run reference in the form extractor_id:run_id.",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_show.set_defaults(func=cmd_extract_show)

-    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
+    p_extract_delete = extract_sub.add_parser(
+        "delete", help="Delete an extraction snapshot directory."
+    )
     _add_common_corpus_arg(p_extract_delete)
     p_extract_delete.add_argument(
-        "--run",
+        "--snapshot",
         required=True,
-        help="Extraction run reference in the form extractor_id:run_id.",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_delete.add_argument(
         "--confirm",
         required=True,
-        help="Type the exact extractor_id:run_id to confirm deletion.",
+        help="Type the exact extractor_id:snapshot_id to confirm deletion.",
     )
     p_extract_delete.set_defaults(func=cmd_extract_delete)

     p_extract_evaluate = extract_sub.add_parser(
-        "evaluate", help="Evaluate an extraction run against a dataset."
+        "evaluate", help="Evaluate an extraction snapshot against a dataset."
     )
     _add_common_corpus_arg(p_extract_evaluate)
     p_extract_evaluate.add_argument(
-        "--run",
+        "--snapshot",
         default=None,
-        help="Extraction run reference in the form extractor_id:run_id (defaults to latest run).",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id (defaults to latest snapshot).",
     )
     p_extract_evaluate.add_argument(
         "--dataset",
@@ -1071,8 +1111,10 @@ def build_parser() -> argparse.ArgumentParser:

     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
-    p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
-    p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
+    p_query.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
+    p_query.add_argument("--retriever", default=None, help="Validate retriever identifier.")
     p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
     p_query.add_argument(
         "--offset",
@@ -1132,9 +1174,11 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_context_pack_build.set_defaults(func=cmd_context_pack_build)

-    p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
+    p_eval = sub.add_parser("eval", help="Evaluate a snapshot against a dataset.")
     _add_common_corpus_arg(p_eval)
-    p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
+    p_eval.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
     p_eval.add_argument(
         "--dataset",
         required=True,
@@ -1170,78 +1214,81 @@ def build_parser() -> argparse.ArgumentParser:
     p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
     _add_common_corpus_arg(p_analyze_topics)
     p_analyze_topics.add_argument(
-        "--recipe",
+        "--configuration",
        required=True,
         action="append",
-        help="Path to topic modeling recipe YAML. Repeatable; later recipes override earlier recipes.",
+        help="Path to topic modeling configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_topics.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_topics.add_argument(
-        "--recipe-name",
+        "--configuration-name",
         default="default",
-        help="Human-readable recipe name.",
+        help="Human-readable configuration name.",
     )
     p_analyze_topics.add_argument(
-        "--extraction-run",
+        "--extraction-snapshot",
         default=None,
-        help="Extraction run reference in the form extractor_id:run_id.",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)

     p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
     _add_common_corpus_arg(p_analyze_profile)
     p_analyze_profile.add_argument(
-        "--recipe",
+        "--configuration",
         default=None,
         action="append",
-        help="Optional profiling recipe YAML file. Repeatable; later recipes override earlier recipes.",
+        help="Optional profiling configuration YAML file. Repeatable; later files override earlier ones.",
     )
     p_analyze_profile.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_profile.add_argument(
-        "--recipe-name",
+        "--configuration-name",
         default="default",
-        help="Human-readable recipe name.",
+        help="Human-readable configuration name.",
     )
     p_analyze_profile.add_argument(
-        "--extraction-run",
+        "--extraction-snapshot",
         default=None,
-        help="Extraction run reference in the form extractor_id:run_id.",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_profile.set_defaults(func=cmd_analyze_profile)

     p_analyze_markov = analyze_sub.add_parser("markov", help="Run Markov analysis.")
     _add_common_corpus_arg(p_analyze_markov)
     p_analyze_markov.add_argument(
-        "--recipe",
+        "--configuration",
         required=True,
         action="append",
-        help="Path to Markov analysis recipe YAML. Repeatable; later recipes override earlier recipes.",
+        help="Path to Markov analysis configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_markov.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_markov.add_argument(
-        "--recipe-name",
+        "--configuration-name",
         default="default",
-        help="Human-readable recipe name.",
+        help="Human-readable configuration name.",
     )
     p_analyze_markov.add_argument(
-        "--extraction-run",
+        "--extraction-snapshot",
         default=None,
-        help="Extraction run reference in the form extractor_id:run_id.",
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_markov.set_defaults(func=cmd_analyze_markov)

@@ -1266,7 +1313,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
         FileExistsError,
         KeyError,
         ValueError,
-        ExtractionRunFatalError,
+        ExtractionSnapshotFatalError,
         NotImplementedError,
         ValidationError,
     ) as exception:
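The same rename carries through to the command-line flags defined in build_parser above: --backend becomes --retriever, --recipe and --recipe-name become --configuration and --configuration-name, --config becomes --override (with --config kept as an alias), and --run/--extraction-run become --snapshot/--extraction-snapshot. A hypothetical invocation sketch follows; the flag names come from the parser changes shown here, while the corpus, identifiers, dataset path, and query text are placeholders.

# Hypothetical CLI invocations; flag names are taken from the parser changes above,
# everything else (snapshot identifiers, dataset path, query text) is a placeholder.
from biblicus.cli import main

# 1.0.0: main(["build", "--backend", "scan", "--recipe-name", "default"])
main(["build", "--retriever", "scan", "--configuration-name", "default"])

# 1.0.0: main(["query", "--run", "<run_id>", "--query", "example query"])
main(["query", "--snapshot", "<snapshot_id>", "--query", "example query"])

# 1.0.0: main(["eval", "--run", "<run_id>", "--dataset", "dataset.yaml"])
main(["eval", "--snapshot", "<snapshot_id>", "--dataset", "dataset.yaml"])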