biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/cli.py CHANGED
@@ -13,7 +13,6 @@ from typing import Dict, Iterable, List, Optional
13
13
  from pydantic import ValidationError
14
14
 
15
15
  from .analysis import get_analysis_backend
16
- from .backends import get_backend
17
16
  from .context import (
18
17
  CharacterBudget,
19
18
  ContextPackPolicy,
@@ -24,16 +23,17 @@ from .context import (
24
23
  )
25
24
  from .corpus import Corpus
26
25
  from .crawl import CrawlRequest, crawl_into_corpus
27
- from .errors import ExtractionRunFatalError
28
- from .evaluation import evaluate_run, load_dataset
26
+ from .errors import ExtractionSnapshotFatalError, IngestCollisionError
27
+ from .evaluation import evaluate_snapshot, load_dataset
29
28
  from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
30
- from .extraction import build_extraction_run
29
+ from .extraction import build_extraction_snapshot
31
30
  from .extraction_evaluation import (
32
- evaluate_extraction_run,
31
+ evaluate_extraction_snapshot,
33
32
  load_extraction_dataset,
34
33
  write_extraction_evaluation_result,
35
34
  )
36
- from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
35
+ from .models import QueryBudget, RetrievalResult, parse_extraction_snapshot_reference
36
+ from .retrievers import get_retriever
37
37
  from .uris import corpus_ref_to_path
38
38
 
39
39
 
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
117
117
 
118
118
  results = []
119
119
 
120
- if arguments.note is not None or arguments.stdin:
121
- text = arguments.note if arguments.note is not None else sys.stdin.read()
122
- ingest_result = corpus.ingest_note(
123
- text,
124
- title=arguments.title,
125
- tags=tags,
126
- source_uri="stdin" if arguments.stdin else "text",
120
+ try:
121
+ if arguments.note is not None or arguments.stdin:
122
+ text = arguments.note if arguments.note is not None else sys.stdin.read()
123
+ ingest_result = corpus.ingest_note(
124
+ text,
125
+ title=arguments.title,
126
+ tags=tags,
127
+ source_uri=None if arguments.stdin else None,
128
+ )
129
+ results.append(ingest_result)
130
+
131
+ for source_path in arguments.files or []:
132
+ results.append(corpus.ingest_source(source_path, tags=tags))
133
+ except IngestCollisionError as error:
134
+ print(
135
+ "Ingest failed: source already ingested\n"
136
+ f"source_uri: {error.source_uri}\n"
137
+ f"existing_item_id: {error.existing_item_id}\n"
138
+ f"existing_relpath: {error.existing_relpath}",
139
+ file=sys.stderr,
127
140
  )
128
- results.append(ingest_result)
129
-
130
- for source_path in arguments.files or []:
131
- results.append(corpus.ingest_source(source_path, tags=tags))
141
+ return 3
132
142
 
133
143
  if not results:
134
144
  print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
@@ -374,55 +384,63 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
374
384
  return QueryBudget(
375
385
  max_total_items=arguments.max_total_items,
376
386
  offset=getattr(arguments, "offset", 0),
377
- max_total_characters=arguments.max_total_characters,
387
+ maximum_total_characters=arguments.maximum_total_characters,
378
388
  max_items_per_source=arguments.max_items_per_source,
379
389
  )
380
390
 
381
391
 
382
392
  def cmd_build(arguments: argparse.Namespace) -> int:
383
393
  """
384
- Build a retrieval run for a backend.
394
+ Build a retrieval snapshot for a retriever.
385
395
 
386
396
  :param arguments: Parsed command-line interface arguments.
387
397
  :type arguments: argparse.Namespace
388
398
  :return: Exit code.
389
399
  :rtype: int
390
400
  """
391
- from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
401
+ from .configuration import (
402
+ apply_dotted_overrides,
403
+ load_configuration_view,
404
+ parse_dotted_overrides,
405
+ )
392
406
 
393
407
  corpus = (
394
408
  Corpus.open(arguments.corpus)
395
409
  if getattr(arguments, "corpus", None)
396
410
  else Corpus.find(Path.cwd())
397
411
  )
398
- backend = get_backend(arguments.backend)
412
+ retriever = get_retriever(arguments.retriever)
399
413
 
400
414
  base_config: Dict[str, object] = {}
401
- if getattr(arguments, "recipe", None):
402
- base_config = load_recipe_view(
403
- arguments.recipe,
404
- recipe_label="Recipe file",
405
- mapping_error_message="Retrieval build recipe must be a mapping/object",
415
+ if getattr(arguments, "configuration", None):
416
+ base_config = load_configuration_view(
417
+ arguments.configuration,
418
+ configuration_label="Configuration file",
419
+ mapping_error_message="Retrieval snapshot configuration must be a mapping/object",
406
420
  )
407
421
 
408
- overrides = parse_dotted_overrides(arguments.config)
409
- config = apply_dotted_overrides(base_config, overrides)
422
+ overrides = parse_dotted_overrides(arguments.override)
423
+ configuration = apply_dotted_overrides(base_config, overrides)
410
424
 
411
- run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
412
- print(run.model_dump_json(indent=2))
425
+ snapshot = retriever.build_snapshot(
426
+ corpus,
427
+ configuration_name=arguments.configuration_name,
428
+ configuration=configuration,
429
+ )
430
+ print(snapshot.model_dump_json(indent=2))
413
431
  return 0
414
432
 
415
433
 
416
434
  def cmd_extract_build(arguments: argparse.Namespace) -> int:
417
435
  """
418
- Build a text extraction run for the corpus using a pipeline of extractors.
436
+ Build a text extraction snapshot for the corpus using a pipeline of extractors.
419
437
 
420
438
  :param arguments: Parsed command-line interface arguments.
421
439
  :type arguments: argparse.Namespace
422
440
  :return: Exit code.
423
441
  :rtype: int
424
442
  """
425
- from .recipes import load_recipe_view
443
+ from .configuration import load_configuration_view
426
444
 
427
445
  corpus = (
428
446
  Corpus.open(arguments.corpus)
@@ -430,17 +448,17 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
430
448
  else Corpus.find(Path.cwd())
431
449
  )
432
450
 
433
- # Load recipe from file if --recipe is provided
434
- if getattr(arguments, "recipe", None):
435
- recipe_data = load_recipe_view(
436
- arguments.recipe,
437
- recipe_label="Recipe file",
438
- mapping_error_message="Extraction recipe must be a mapping/object",
451
+ # Load configuration from file if --configuration is provided
452
+ if getattr(arguments, "configuration", None):
453
+ configuration_data = load_configuration_view(
454
+ arguments.configuration,
455
+ configuration_label="Configuration file",
456
+ mapping_error_message="Extraction configuration must be a mapping/object",
439
457
  )
440
- loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
441
- loaded_config = recipe_data.get("config", {})
458
+ loaded_extractor_id = configuration_data.get("extractor_id", "pipeline")
459
+ loaded_config = configuration_data.get("configuration", {})
442
460
 
443
- # If the recipe specifies a non-pipeline extractor, wrap it in a pipeline
461
+ # If the configuration specifies a non-pipeline extractor, wrap it in a pipeline
444
462
  if loaded_extractor_id != "pipeline":
445
463
  extractor_id = "pipeline"
446
464
  config = {
@@ -466,11 +484,11 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
466
484
  config = {"steps": steps}
467
485
  extractor_id = "pipeline"
468
486
 
469
- manifest = build_extraction_run(
487
+ manifest = build_extraction_snapshot(
470
488
  corpus,
471
489
  extractor_id=extractor_id,
472
- recipe_name=arguments.recipe_name,
473
- config=config,
490
+ configuration_name=arguments.configuration_name,
491
+ configuration=config,
474
492
  )
475
493
  print(manifest.model_dump_json(indent=2))
476
494
  return 0
@@ -478,7 +496,7 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
478
496
 
479
497
  def cmd_extract_list(arguments: argparse.Namespace) -> int:
480
498
  """
481
- List extraction runs stored under the corpus.
499
+ List extraction snapshots stored under the corpus.
482
500
 
483
501
  :param arguments: Parsed command-line interface arguments.
484
502
  :type arguments: argparse.Namespace
@@ -490,14 +508,14 @@ def cmd_extract_list(arguments: argparse.Namespace) -> int:
490
508
  if getattr(arguments, "corpus", None)
491
509
  else Corpus.find(Path.cwd())
492
510
  )
493
- runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
494
- print(json.dumps([entry.model_dump() for entry in runs], indent=2))
511
+ snapshots = corpus.list_extraction_snapshots(extractor_id=arguments.extractor_id)
512
+ print(json.dumps([entry.model_dump() for entry in snapshots], indent=2))
495
513
  return 0
496
514
 
497
515
 
498
516
  def cmd_extract_show(arguments: argparse.Namespace) -> int:
499
517
  """
500
- Show an extraction run manifest.
518
+ Show an extraction snapshot manifest.
501
519
 
502
520
  :param arguments: Parsed command-line interface arguments.
503
521
  :type arguments: argparse.Namespace
@@ -509,9 +527,9 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
509
527
  if getattr(arguments, "corpus", None)
510
528
  else Corpus.find(Path.cwd())
511
529
  )
512
- reference = parse_extraction_run_reference(arguments.run)
513
- manifest = corpus.load_extraction_run_manifest(
514
- extractor_id=reference.extractor_id, run_id=reference.run_id
530
+ reference = parse_extraction_snapshot_reference(arguments.snapshot)
531
+ manifest = corpus.load_extraction_snapshot_manifest(
532
+ extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
515
533
  )
516
534
  print(manifest.model_dump_json(indent=2))
517
535
  return 0
@@ -519,7 +537,7 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
519
537
 
520
538
  def cmd_extract_delete(arguments: argparse.Namespace) -> int:
521
539
  """
522
- Delete an extraction run directory and its derived artifacts.
540
+ Delete an extraction snapshot directory and its derived artifacts.
523
541
 
524
542
  :param arguments: Parsed command-line interface arguments.
525
543
  :type arguments: argparse.Namespace
@@ -531,17 +549,19 @@ def cmd_extract_delete(arguments: argparse.Namespace) -> int:
531
549
  if getattr(arguments, "corpus", None)
532
550
  else Corpus.find(Path.cwd())
533
551
  )
534
- if arguments.confirm != arguments.run:
535
- raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
536
- reference = parse_extraction_run_reference(arguments.run)
537
- corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
538
- print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
552
+ if arguments.confirm != arguments.snapshot:
553
+ raise ValueError("Refusing to delete extraction snapshot without an exact --confirm match.")
554
+ reference = parse_extraction_snapshot_reference(arguments.snapshot)
555
+ corpus.delete_extraction_snapshot(
556
+ extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
557
+ )
558
+ print(json.dumps({"deleted": True, "snapshot": arguments.snapshot}, indent=2))
539
559
  return 0
540
560
 
541
561
 
542
562
  def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
543
563
  """
544
- Evaluate an extraction run against a dataset.
564
+ Evaluate an extraction snapshot against a dataset.
545
565
 
546
566
  :param arguments: Parsed command-line interface arguments.
547
567
  :type arguments: argparse.Namespace
@@ -553,14 +573,14 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
553
573
  if getattr(arguments, "corpus", None)
554
574
  else Corpus.find(Path.cwd())
555
575
  )
556
- if arguments.run:
557
- run_ref = parse_extraction_run_reference(arguments.run)
576
+ if arguments.snapshot:
577
+ snapshot_ref = parse_extraction_snapshot_reference(arguments.snapshot)
558
578
  else:
559
- run_ref = corpus.latest_extraction_run_reference()
560
- if run_ref is None:
561
- raise ValueError("Extraction evaluation requires an extraction run")
579
+ snapshot_ref = corpus.latest_extraction_snapshot_reference()
580
+ if snapshot_ref is None:
581
+ raise ValueError("Extraction evaluation requires an extraction snapshot")
562
582
  print(
563
- "Warning: using latest extraction run; pass --run for reproducibility.",
583
+ "Warning: using latest extraction snapshot; pass --snapshot for reproducibility.",
564
584
  file=sys.stderr,
565
585
  )
566
586
 
@@ -572,17 +592,19 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
572
592
  except ValidationError as exc:
573
593
  raise ValueError(f"Invalid extraction dataset: {exc}") from exc
574
594
 
575
- run = corpus.load_extraction_run_manifest(
576
- extractor_id=run_ref.extractor_id,
577
- run_id=run_ref.run_id,
595
+ snapshot = corpus.load_extraction_snapshot_manifest(
596
+ extractor_id=snapshot_ref.extractor_id,
597
+ snapshot_id=snapshot_ref.snapshot_id,
578
598
  )
579
- result = evaluate_extraction_run(
599
+ result = evaluate_extraction_snapshot(
580
600
  corpus=corpus,
581
- run=run,
582
- extractor_id=run_ref.extractor_id,
601
+ snapshot=snapshot,
602
+ extractor_id=snapshot_ref.extractor_id,
583
603
  dataset=dataset,
584
604
  )
585
- write_extraction_evaluation_result(corpus=corpus, run_id=run.run_id, result=result)
605
+ write_extraction_evaluation_result(
606
+ corpus=corpus, snapshot_id=snapshot.snapshot_id, result=result
607
+ )
586
608
  print(result.model_dump_json(indent=2))
587
609
  return 0
588
610
 
@@ -601,18 +623,21 @@ def cmd_query(arguments: argparse.Namespace) -> int:
601
623
  if getattr(arguments, "corpus", None)
602
624
  else Corpus.find(Path.cwd())
603
625
  )
604
- run_id = arguments.run or corpus.latest_run_id
605
- if not run_id:
606
- raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
607
- run = corpus.load_run(run_id)
608
- if arguments.backend and arguments.backend != run.recipe.backend_id:
626
+ snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
627
+ if not snapshot_id:
609
628
  raise ValueError(
610
- f"Backend mismatch: run uses {run.recipe.backend_id!r} but {arguments.backend!r} was requested"
629
+ "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
611
630
  )
612
- backend = get_backend(run.recipe.backend_id)
631
+ snapshot = corpus.load_snapshot(snapshot_id)
632
+ if arguments.retriever and arguments.retriever != snapshot.configuration.retriever_id:
633
+ raise ValueError(
634
+ "Retriever mismatch: snapshot uses "
635
+ f"{snapshot.configuration.retriever_id!r} but {arguments.retriever!r} was requested"
636
+ )
637
+ retriever = get_retriever(snapshot.configuration.retriever_id)
613
638
  query_text = arguments.query if arguments.query is not None else sys.stdin.read()
614
639
  budget = _budget_from_args(arguments)
615
- result = backend.query(corpus, run=run, query_text=query_text, budget=budget)
640
+ result = retriever.query(corpus, snapshot=snapshot, query_text=query_text, budget=budget)
616
641
  processed_evidence = result.evidence
617
642
  if getattr(arguments, "reranker_id", None):
618
643
  processed_evidence = apply_evidence_reranker(
@@ -683,7 +708,7 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
683
708
 
684
709
  def cmd_eval(arguments: argparse.Namespace) -> int:
685
710
  """
686
- Evaluate a retrieval run against a dataset.
711
+ Evaluate a retrieval snapshot against a dataset.
687
712
 
688
713
  :param arguments: Parsed command-line interface arguments.
689
714
  :type arguments: argparse.Namespace
@@ -695,13 +720,15 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
695
720
  if getattr(arguments, "corpus", None)
696
721
  else Corpus.find(Path.cwd())
697
722
  )
698
- run_id = arguments.run or corpus.latest_run_id
699
- if not run_id:
700
- raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
701
- run = corpus.load_run(run_id)
723
+ snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
724
+ if not snapshot_id:
725
+ raise ValueError(
726
+ "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
727
+ )
728
+ snapshot = corpus.load_snapshot(snapshot_id)
702
729
  dataset = load_dataset(Path(arguments.dataset))
703
730
  budget = _budget_from_args(arguments)
704
- result = evaluate_run(corpus=corpus, run=run, dataset=dataset, budget=budget)
731
+ result = evaluate_snapshot(corpus=corpus, snapshot=snapshot, dataset=dataset, budget=budget)
705
732
  print(result.model_dump_json(indent=2))
706
733
  return 0
707
734
 
@@ -741,29 +768,33 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
741
768
  :return: Exit code.
742
769
  :rtype: int
743
770
  """
744
- from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
771
+ from .configuration import (
772
+ apply_dotted_overrides,
773
+ load_configuration_view,
774
+ parse_dotted_overrides,
775
+ )
745
776
 
746
777
  corpus = (
747
778
  Corpus.open(arguments.corpus)
748
779
  if getattr(arguments, "corpus", None)
749
780
  else Corpus.find(Path.cwd())
750
781
  )
751
- recipe_data = load_recipe_view(
752
- arguments.recipe,
753
- recipe_label="Recipe file",
754
- mapping_error_message="Topic modeling recipe must be a mapping/object",
782
+ configuration_data = load_configuration_view(
783
+ arguments.configuration,
784
+ configuration_label="Configuration file",
785
+ mapping_error_message="Topic modeling configuration must be a mapping/object",
755
786
  )
756
- overrides = parse_dotted_overrides(arguments.config)
757
- recipe_data = apply_dotted_overrides(recipe_data, overrides)
787
+ overrides = parse_dotted_overrides(arguments.override)
788
+ configuration_data = apply_dotted_overrides(configuration_data, overrides)
758
789
 
759
- if arguments.extraction_run:
760
- extraction_run = parse_extraction_run_reference(arguments.extraction_run)
790
+ if arguments.extraction_snapshot:
791
+ extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
761
792
  else:
762
- extraction_run = corpus.latest_extraction_run_reference()
763
- if extraction_run is None:
764
- raise ValueError("Topic analysis requires an extraction run to supply text inputs")
793
+ extraction_snapshot = corpus.latest_extraction_snapshot_reference()
794
+ if extraction_snapshot is None:
795
+ raise ValueError("Topic analysis requires an extraction snapshot to supply text inputs")
765
796
  print(
766
- "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
797
+ "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
767
798
  file=sys.stderr,
768
799
  )
769
800
 
@@ -771,12 +802,12 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
771
802
  try:
772
803
  output = backend.run_analysis(
773
804
  corpus,
774
- recipe_name=arguments.recipe_name,
775
- config=recipe_data,
776
- extraction_run=extraction_run,
805
+ configuration_name=arguments.configuration_name,
806
+ configuration=configuration_data,
807
+ extraction_snapshot=extraction_snapshot,
777
808
  )
778
809
  except ValidationError as exc:
779
- raise ValueError(f"Invalid topic modeling recipe: {exc}") from exc
810
+ raise ValueError(f"Invalid topic modeling configuration: {exc}") from exc
780
811
  print(output.model_dump_json(indent=2))
781
812
  return 0
782
813
 
@@ -790,7 +821,11 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
790
821
  :return: Exit code.
791
822
  :rtype: int
792
823
  """
793
- from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
824
+ from .configuration import (
825
+ apply_dotted_overrides,
826
+ load_configuration_view,
827
+ parse_dotted_overrides,
828
+ )
794
829
 
795
830
  corpus = (
796
831
  Corpus.open(arguments.corpus)
@@ -798,28 +833,30 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
798
833
  else Corpus.find(Path.cwd())
799
834
  )
800
835
 
801
- recipe_data: dict[str, object] = {}
802
- if arguments.recipe is not None:
803
- recipe_data = load_recipe_view(
804
- arguments.recipe,
805
- recipe_label="Recipe file",
806
- mapping_error_message="Profiling recipe must be a mapping/object",
836
+ configuration_data: dict[str, object] = {}
837
+ if arguments.configuration is not None:
838
+ configuration_data = load_configuration_view(
839
+ arguments.configuration,
840
+ configuration_label="Configuration file",
841
+ mapping_error_message="Profiling configuration must be a mapping/object",
807
842
  )
808
- overrides = parse_dotted_overrides(arguments.config)
809
- recipe_data = apply_dotted_overrides(recipe_data, overrides)
843
+ overrides = parse_dotted_overrides(arguments.override)
844
+ configuration_data = apply_dotted_overrides(configuration_data, overrides)
810
845
  else:
811
- overrides = parse_dotted_overrides(arguments.config)
846
+ overrides = parse_dotted_overrides(arguments.override)
812
847
  if overrides:
813
- recipe_data = apply_dotted_overrides(recipe_data, overrides)
848
+ configuration_data = apply_dotted_overrides(configuration_data, overrides)
814
849
 
815
- if arguments.extraction_run:
816
- extraction_run = parse_extraction_run_reference(arguments.extraction_run)
850
+ if arguments.extraction_snapshot:
851
+ extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
817
852
  else:
818
- extraction_run = corpus.latest_extraction_run_reference()
819
- if extraction_run is None:
820
- raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
853
+ extraction_snapshot = corpus.latest_extraction_snapshot_reference()
854
+ if extraction_snapshot is None:
855
+ raise ValueError(
856
+ "Profiling analysis requires an extraction snapshot to supply text inputs"
857
+ )
821
858
  print(
822
- "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
859
+ "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
823
860
  file=sys.stderr,
824
861
  )
825
862
 
@@ -827,12 +864,12 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
827
864
  try:
828
865
  output = backend.run_analysis(
829
866
  corpus,
830
- recipe_name=arguments.recipe_name,
831
- config=recipe_data,
832
- extraction_run=extraction_run,
867
+ configuration_name=arguments.configuration_name,
868
+ configuration=configuration_data,
869
+ extraction_snapshot=extraction_snapshot,
833
870
  )
834
871
  except ValidationError as exc:
835
- raise ValueError(f"Invalid profiling recipe: {exc}") from exc
872
+ raise ValueError(f"Invalid profiling configuration: {exc}") from exc
836
873
  print(output.model_dump_json(indent=2))
837
874
  return 0
838
875
 
@@ -846,29 +883,35 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
846
883
  :return: Exit code.
847
884
  :rtype: int
848
885
  """
849
- from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
886
+ from .configuration import (
887
+ apply_dotted_overrides,
888
+ load_configuration_view,
889
+ parse_dotted_overrides,
890
+ )
850
891
 
851
892
  corpus = (
852
893
  Corpus.open(arguments.corpus)
853
894
  if getattr(arguments, "corpus", None)
854
895
  else Corpus.find(Path.cwd())
855
896
  )
856
- recipe_data = load_recipe_view(
857
- arguments.recipe,
858
- recipe_label="Recipe file",
859
- mapping_error_message="Markov analysis recipe must be a mapping/object",
897
+ configuration_data = load_configuration_view(
898
+ arguments.configuration,
899
+ configuration_label="Configuration file",
900
+ mapping_error_message="Markov analysis configuration must be a mapping/object",
860
901
  )
861
- overrides = parse_dotted_overrides(arguments.config)
862
- recipe_data = apply_dotted_overrides(recipe_data, overrides)
902
+ overrides = parse_dotted_overrides(arguments.override)
903
+ configuration_data = apply_dotted_overrides(configuration_data, overrides)
863
904
 
864
- if arguments.extraction_run:
865
- extraction_run = parse_extraction_run_reference(arguments.extraction_run)
905
+ if arguments.extraction_snapshot:
906
+ extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
866
907
  else:
867
- extraction_run = corpus.latest_extraction_run_reference()
868
- if extraction_run is None:
869
- raise ValueError("Markov analysis requires an extraction run to supply text inputs")
908
+ extraction_snapshot = corpus.latest_extraction_snapshot_reference()
909
+ if extraction_snapshot is None:
910
+ raise ValueError(
911
+ "Markov analysis requires an extraction snapshot to supply text inputs"
912
+ )
870
913
  print(
871
- "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
914
+ "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
872
915
  file=sys.stderr,
873
916
  )
874
917
 
@@ -876,12 +919,12 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
876
919
  try:
877
920
  output = backend.run_analysis(
878
921
  corpus,
879
- recipe_name=arguments.recipe_name,
880
- config=recipe_data,
881
- extraction_run=extraction_run,
922
+ configuration_name=arguments.configuration_name,
923
+ configuration=configuration_data,
924
+ extraction_snapshot=extraction_snapshot,
882
925
  )
883
926
  except ValidationError as exc:
884
- raise ValueError(f"Invalid Markov analysis recipe: {exc}") from exc
927
+ raise ValueError(f"Invalid Markov analysis configuration: {exc}") from exc
885
928
  print(output.model_dump_json(indent=2))
886
929
  return 0
887
930
 
@@ -967,41 +1010,46 @@ def build_parser() -> argparse.ArgumentParser:
967
1010
  )
968
1011
  p_purge.set_defaults(func=cmd_purge)
969
1012
 
970
- p_build = sub.add_parser("build", help="Build a retrieval backend run for the corpus.")
1013
+ p_build = sub.add_parser("build", help="Build a retrieval snapshot for the corpus.")
971
1014
  _add_common_corpus_arg(p_build)
972
1015
  p_build.add_argument(
973
- "--backend",
1016
+ "--retriever",
974
1017
  required=True,
975
- help="Backend identifier (for example, scan, sqlite-full-text-search).",
1018
+ help="Retriever identifier (for example, scan, sqlite-full-text-search).",
1019
+ )
1020
+ p_build.add_argument(
1021
+ "--configuration-name", default="default", help="Human-readable configuration name."
976
1022
  )
977
- p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
978
1023
  p_build.add_argument(
979
- "--recipe",
1024
+ "--configuration",
980
1025
  default=None,
981
1026
  action="append",
982
- help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
1027
+ help="Path to YAML configuration file (repeatable). If provided, files are composed in precedence order.",
983
1028
  )
984
1029
  p_build.add_argument(
1030
+ "--override",
985
1031
  "--config",
986
1032
  action="append",
987
1033
  default=None,
988
- help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
1034
+ help="Configuration override as key=value (repeatable). Dotted keys create nested config mappings.",
989
1035
  )
990
1036
  p_build.set_defaults(func=cmd_build)
991
1037
 
992
- p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
1038
+ p_extract = sub.add_parser(
1039
+ "extract", help="Work with text extraction snapshots for the corpus."
1040
+ )
993
1041
  extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
994
1042
 
995
- p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
1043
+ p_extract_build = extract_sub.add_parser("build", help="Build a text extraction snapshot.")
996
1044
  _add_common_corpus_arg(p_extract_build)
997
1045
  p_extract_build.add_argument(
998
- "--recipe-name", default="default", help="Human-readable recipe name."
1046
+ "--configuration-name", default="default", help="Human-readable configuration name."
999
1047
  )
1000
1048
  p_extract_build.add_argument(
1001
- "--recipe",
1049
+ "--configuration",
1002
1050
  default=None,
1003
1051
  action="append",
1004
- help="Path to YAML recipe file. If provided, --step arguments are ignored.",
1052
+ help="Path to YAML configuration file. If provided, --step arguments are ignored.",
1005
1053
  )
1006
1054
  p_extract_build.add_argument(
1007
1055
  "--step",
@@ -1011,7 +1059,7 @@ def build_parser() -> argparse.ArgumentParser:
1011
1059
  )
1012
1060
  p_extract_build.set_defaults(func=cmd_extract_build)
1013
1061
 
1014
- p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
1062
+ p_extract_list = extract_sub.add_parser("list", help="List extraction snapshots.")
1015
1063
  _add_common_corpus_arg(p_extract_list)
1016
1064
  p_extract_list.add_argument(
1017
1065
  "--extractor-id",
@@ -1020,37 +1068,39 @@ def build_parser() -> argparse.ArgumentParser:
1020
1068
  )
1021
1069
  p_extract_list.set_defaults(func=cmd_extract_list)
1022
1070
 
1023
- p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
1071
+ p_extract_show = extract_sub.add_parser("show", help="Show an extraction snapshot manifest.")
1024
1072
  _add_common_corpus_arg(p_extract_show)
1025
1073
  p_extract_show.add_argument(
1026
- "--run",
1074
+ "--snapshot",
1027
1075
  required=True,
1028
- help="Extraction run reference in the form extractor_id:run_id.",
1076
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
1029
1077
  )
1030
1078
  p_extract_show.set_defaults(func=cmd_extract_show)
1031
1079
 
1032
- p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
1080
+ p_extract_delete = extract_sub.add_parser(
1081
+ "delete", help="Delete an extraction snapshot directory."
1082
+ )
1033
1083
  _add_common_corpus_arg(p_extract_delete)
1034
1084
  p_extract_delete.add_argument(
1035
- "--run",
1085
+ "--snapshot",
1036
1086
  required=True,
1037
- help="Extraction run reference in the form extractor_id:run_id.",
1087
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
1038
1088
  )
1039
1089
  p_extract_delete.add_argument(
1040
1090
  "--confirm",
1041
1091
  required=True,
1042
- help="Type the exact extractor_id:run_id to confirm deletion.",
1092
+ help="Type the exact extractor_id:snapshot_id to confirm deletion.",
1043
1093
  )
1044
1094
  p_extract_delete.set_defaults(func=cmd_extract_delete)
1045
1095
 
1046
1096
  p_extract_evaluate = extract_sub.add_parser(
1047
- "evaluate", help="Evaluate an extraction run against a dataset."
1097
+ "evaluate", help="Evaluate an extraction snapshot against a dataset."
1048
1098
  )
1049
1099
  _add_common_corpus_arg(p_extract_evaluate)
1050
1100
  p_extract_evaluate.add_argument(
1051
- "--run",
1101
+ "--snapshot",
1052
1102
  default=None,
1053
- help="Extraction run reference in the form extractor_id:run_id (defaults to latest run).",
1103
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id (defaults to latest snapshot).",
1054
1104
  )
1055
1105
  p_extract_evaluate.add_argument(
1056
1106
  "--dataset",
@@ -1061,8 +1111,10 @@ def build_parser() -> argparse.ArgumentParser:
1061
1111
 
1062
1112
  p_query = sub.add_parser("query", help="Run a retrieval query.")
1063
1113
  _add_common_corpus_arg(p_query)
1064
- p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
1065
- p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
1114
+ p_query.add_argument(
1115
+ "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
1116
+ )
1117
+ p_query.add_argument("--retriever", default=None, help="Validate retriever identifier.")
1066
1118
  p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
1067
1119
  p_query.add_argument(
1068
1120
  "--offset",
@@ -1071,7 +1123,7 @@ def build_parser() -> argparse.ArgumentParser:
1071
1123
  help="Skip this many ranked candidates before selecting evidence (pagination).",
1072
1124
  )
1073
1125
  p_query.add_argument("--max-total-items", type=int, default=5)
1074
- p_query.add_argument("--max-total-characters", type=int, default=2000)
1126
+ p_query.add_argument("--maximum-total-characters", type=int, default=2000)
1075
1127
  p_query.add_argument("--max-items-per-source", type=int, default=5)
1076
1128
  p_query.add_argument(
1077
1129
  "--reranker-id",
@@ -1122,16 +1174,18 @@ def build_parser() -> argparse.ArgumentParser:
1122
1174
  )
1123
1175
  p_context_pack_build.set_defaults(func=cmd_context_pack_build)
1124
1176
 
1125
- p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
1177
+ p_eval = sub.add_parser("eval", help="Evaluate a snapshot against a dataset.")
1126
1178
  _add_common_corpus_arg(p_eval)
1127
- p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
1179
+ p_eval.add_argument(
1180
+ "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
1181
+ )
1128
1182
  p_eval.add_argument(
1129
1183
  "--dataset",
1130
1184
  required=True,
1131
1185
  help="Path to dataset JavaScript Object Notation file.",
1132
1186
  )
1133
1187
  p_eval.add_argument("--max-total-items", type=int, default=5)
1134
- p_eval.add_argument("--max-total-characters", type=int, default=2000)
1188
+ p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
1135
1189
  p_eval.add_argument("--max-items-per-source", type=int, default=5)
1136
1190
  p_eval.set_defaults(func=cmd_eval)
1137
1191
 
@@ -1160,78 +1214,81 @@ def build_parser() -> argparse.ArgumentParser:
1160
1214
  p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
1161
1215
  _add_common_corpus_arg(p_analyze_topics)
1162
1216
  p_analyze_topics.add_argument(
1163
- "--recipe",
1217
+ "--configuration",
1164
1218
  required=True,
1165
1219
  action="append",
1166
- help="Path to topic modeling recipe YAML. Repeatable; later recipes override earlier recipes.",
1220
+ help="Path to topic modeling configuration YAML. Repeatable; later files override earlier ones.",
1167
1221
  )
1168
1222
  p_analyze_topics.add_argument(
1223
+ "--override",
1169
1224
  "--config",
1170
1225
  action="append",
1171
1226
  default=[],
1172
- help="Override key=value pairs applied after composing recipes (supports dotted keys).",
1227
+ help="Override key=value pairs applied after composing configurations (supports dotted keys).",
1173
1228
  )
1174
1229
  p_analyze_topics.add_argument(
1175
- "--recipe-name",
1230
+ "--configuration-name",
1176
1231
  default="default",
1177
- help="Human-readable recipe name.",
1232
+ help="Human-readable configuration name.",
1178
1233
  )
1179
1234
  p_analyze_topics.add_argument(
1180
- "--extraction-run",
1235
+ "--extraction-snapshot",
1181
1236
  default=None,
1182
- help="Extraction run reference in the form extractor_id:run_id.",
1237
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
1183
1238
  )
1184
1239
  p_analyze_topics.set_defaults(func=cmd_analyze_topics)
1185
1240
 
1186
1241
  p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
1187
1242
  _add_common_corpus_arg(p_analyze_profile)
1188
1243
  p_analyze_profile.add_argument(
1189
- "--recipe",
1244
+ "--configuration",
1190
1245
  default=None,
1191
1246
  action="append",
1192
- help="Optional profiling recipe YAML file. Repeatable; later recipes override earlier recipes.",
1247
+ help="Optional profiling configuration YAML file. Repeatable; later files override earlier ones.",
1193
1248
  )
1194
1249
  p_analyze_profile.add_argument(
1250
+ "--override",
1195
1251
  "--config",
1196
1252
  action="append",
1197
1253
  default=[],
1198
- help="Override key=value pairs applied after composing recipes (supports dotted keys).",
1254
+ help="Override key=value pairs applied after composing configurations (supports dotted keys).",
1199
1255
  )
1200
1256
  p_analyze_profile.add_argument(
1201
- "--recipe-name",
1257
+ "--configuration-name",
1202
1258
  default="default",
1203
- help="Human-readable recipe name.",
1259
+ help="Human-readable configuration name.",
1204
1260
  )
1205
1261
  p_analyze_profile.add_argument(
1206
- "--extraction-run",
1262
+ "--extraction-snapshot",
1207
1263
  default=None,
1208
- help="Extraction run reference in the form extractor_id:run_id.",
1264
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
1209
1265
  )
1210
1266
  p_analyze_profile.set_defaults(func=cmd_analyze_profile)
1211
1267
 
1212
1268
  p_analyze_markov = analyze_sub.add_parser("markov", help="Run Markov analysis.")
1213
1269
  _add_common_corpus_arg(p_analyze_markov)
1214
1270
  p_analyze_markov.add_argument(
1215
- "--recipe",
1271
+ "--configuration",
1216
1272
  required=True,
1217
1273
  action="append",
1218
- help="Path to Markov analysis recipe YAML. Repeatable; later recipes override earlier recipes.",
1274
+ help="Path to Markov analysis configuration YAML. Repeatable; later files override earlier ones.",
1219
1275
  )
1220
1276
  p_analyze_markov.add_argument(
1277
+ "--override",
1221
1278
  "--config",
1222
1279
  action="append",
1223
1280
  default=[],
1224
- help="Override key=value pairs applied after composing recipes (supports dotted keys).",
1281
+ help="Override key=value pairs applied after composing configurations (supports dotted keys).",
1225
1282
  )
1226
1283
  p_analyze_markov.add_argument(
1227
- "--recipe-name",
1284
+ "--configuration-name",
1228
1285
  default="default",
1229
- help="Human-readable recipe name.",
1286
+ help="Human-readable configuration name.",
1230
1287
  )
1231
1288
  p_analyze_markov.add_argument(
1232
- "--extraction-run",
1289
+ "--extraction-snapshot",
1233
1290
  default=None,
1234
- help="Extraction run reference in the form extractor_id:run_id.",
1291
+ help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
1235
1292
  )
1236
1293
  p_analyze_markov.set_defaults(func=cmd_analyze_markov)
1237
1294
 
@@ -1256,7 +1313,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
1256
1313
  FileExistsError,
1257
1314
  KeyError,
1258
1315
  ValueError,
1259
- ExtractionRunFatalError,
1316
+ ExtractionSnapshotFatalError,
1260
1317
  NotImplementedError,
1261
1318
  ValidationError,
1262
1319
  ) as exception: