biblicus 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
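The bulk of the release is a vocabulary change: the retrieval layer moves from biblicus.backends to biblicus.retrievers, and the configuration helpers move from biblicus.recipes to biblicus.configuration. A minimal sketch of how downstream imports would shift, assuming the names used in the cli.py diff below (get_retriever, load_configuration_view, and the dotted-override helpers) are importable from the renamed modules:

    # Sketch only: import paths inferred from the rename list above and the
    # cli.py diff below; get_backend is the 1.0.0 name shown being removed.

    # biblicus 1.0.0
    # from biblicus.backends import get_backend

    # biblicus 1.1.0
    from biblicus.retrievers import get_retriever
    from biblicus.configuration import (
        apply_dotted_overrides,
        load_configuration_view,
        parse_dotted_overrides,
    )

    # "scan" is one retriever identifier named in the new --retriever help text.
    retriever = get_retriever("scan")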
biblicus/cli.py
CHANGED
@@ -13,7 +13,6 @@ from typing import Dict, Iterable, List, Optional
 from pydantic import ValidationError

 from .analysis import get_analysis_backend
-from .backends import get_backend
 from .context import (
     CharacterBudget,
     ContextPackPolicy,
@@ -24,16 +23,17 @@ from .context import (
 )
 from .corpus import Corpus
 from .crawl import CrawlRequest, crawl_into_corpus
-from .errors import
-from .evaluation import
+from .errors import ExtractionSnapshotFatalError, IngestCollisionError
+from .evaluation import evaluate_snapshot, load_dataset
 from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
-from .extraction import
+from .extraction import build_extraction_snapshot
 from .extraction_evaluation import (
-
+    evaluate_extraction_snapshot,
     load_extraction_dataset,
     write_extraction_evaluation_result,
 )
-from .models import QueryBudget, RetrievalResult,
+from .models import QueryBudget, RetrievalResult, parse_extraction_snapshot_reference
+from .retrievers import get_retriever
 from .uris import corpus_ref_to_path


@@ -391,48 +391,56 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:

 def cmd_build(arguments: argparse.Namespace) -> int:
     """
-    Build a retrieval
+    Build a retrieval snapshot for a retriever.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
+    retriever = get_retriever(arguments.retriever)

     base_config: Dict[str, object] = {}
-    if getattr(arguments, "
-        base_config =
-            arguments.
-
-            mapping_error_message="Retrieval
+    if getattr(arguments, "configuration", None):
+        base_config = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Retrieval snapshot configuration must be a mapping/object",
         )

-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration = apply_dotted_overrides(base_config, overrides)

-
-
+    snapshot = retriever.build_snapshot(
+        corpus,
+        configuration_name=arguments.configuration_name,
+        configuration=configuration,
+    )
+    print(snapshot.model_dump_json(indent=2))
     return 0


 def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
-    Build a text extraction
+    Build a text extraction snapshot for the corpus using a pipeline of extractors.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import load_configuration_view

     corpus = (
         Corpus.open(arguments.corpus)
@@ -440,17 +448,17 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )

-    # Load
-    if getattr(arguments, "
-
-            arguments.
-
-            mapping_error_message="Extraction
+    # Load configuration from file if --configuration is provided
+    if getattr(arguments, "configuration", None):
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Extraction configuration must be a mapping/object",
         )
-        loaded_extractor_id =
-        loaded_config =
+        loaded_extractor_id = configuration_data.get("extractor_id", "pipeline")
+        loaded_config = configuration_data.get("configuration", {})

-        # If the
+        # If the configuration specifies a non-pipeline extractor, wrap it in a pipeline
         if loaded_extractor_id != "pipeline":
             extractor_id = "pipeline"
             config = {
@@ -476,11 +484,11 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         config = {"steps": steps}
         extractor_id = "pipeline"

-    manifest =
+    manifest = build_extraction_snapshot(
         corpus,
         extractor_id=extractor_id,
-
-
+        configuration_name=arguments.configuration_name,
+        configuration=config,
     )
     print(manifest.model_dump_json(indent=2))
     return 0
@@ -488,7 +496,7 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:

 def cmd_extract_list(arguments: argparse.Namespace) -> int:
     """
-    List extraction
+    List extraction snapshots stored under the corpus.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -500,14 +508,14 @@ def cmd_extract_list(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    print(json.dumps([entry.model_dump() for entry in
+    snapshots = corpus.list_extraction_snapshots(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in snapshots], indent=2))
     return 0


 def cmd_extract_show(arguments: argparse.Namespace) -> int:
     """
-    Show an extraction
+    Show an extraction snapshot manifest.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -519,9 +527,9 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    reference =
-    manifest = corpus.
-        extractor_id=reference.extractor_id,
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    manifest = corpus.load_extraction_snapshot_manifest(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
     )
     print(manifest.model_dump_json(indent=2))
     return 0
@@ -529,7 +537,7 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:

 def cmd_extract_delete(arguments: argparse.Namespace) -> int:
     """
-    Delete an extraction
+    Delete an extraction snapshot directory and its derived artifacts.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -541,17 +549,19 @@ def cmd_extract_delete(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.confirm != arguments.
-        raise ValueError("Refusing to delete extraction
-    reference =
-    corpus.
-
+    if arguments.confirm != arguments.snapshot:
+        raise ValueError("Refusing to delete extraction snapshot without an exact --confirm match.")
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    corpus.delete_extraction_snapshot(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
+    )
+    print(json.dumps({"deleted": True, "snapshot": arguments.snapshot}, indent=2))
     return 0


 def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     """
-    Evaluate an extraction
+    Evaluate an extraction snapshot against a dataset.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -563,14 +573,14 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.
-
+    if arguments.snapshot:
+        snapshot_ref = parse_extraction_snapshot_reference(arguments.snapshot)
     else:
-
-        if
-            raise ValueError("Extraction evaluation requires an extraction
+        snapshot_ref = corpus.latest_extraction_snapshot_reference()
+        if snapshot_ref is None:
+            raise ValueError("Extraction evaluation requires an extraction snapshot")
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -582,17 +592,19 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     except ValidationError as exc:
         raise ValueError(f"Invalid extraction dataset: {exc}") from exc

-
-        extractor_id=
-
+    snapshot = corpus.load_extraction_snapshot_manifest(
+        extractor_id=snapshot_ref.extractor_id,
+        snapshot_id=snapshot_ref.snapshot_id,
     )
-    result =
+    result = evaluate_extraction_snapshot(
         corpus=corpus,
-
-        extractor_id=
+        snapshot=snapshot,
+        extractor_id=snapshot_ref.extractor_id,
         dataset=dataset,
     )
-    write_extraction_evaluation_result(
+    write_extraction_evaluation_result(
+        corpus=corpus, snapshot_id=snapshot.snapshot_id, result=result
+    )
     print(result.model_dump_json(indent=2))
     return 0

@@ -611,18 +623,21 @@ def cmd_query(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    if not
-        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
-    run = corpus.load_run(run_id)
-    if arguments.backend and arguments.backend != run.recipe.backend_id:
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
         raise ValueError(
-
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
         )
-
+    snapshot = corpus.load_snapshot(snapshot_id)
+    if arguments.retriever and arguments.retriever != snapshot.configuration.retriever_id:
+        raise ValueError(
+            "Retriever mismatch: snapshot uses "
+            f"{snapshot.configuration.retriever_id!r} but {arguments.retriever!r} was requested"
+        )
+    retriever = get_retriever(snapshot.configuration.retriever_id)
     query_text = arguments.query if arguments.query is not None else sys.stdin.read()
     budget = _budget_from_args(arguments)
-    result =
+    result = retriever.query(corpus, snapshot=snapshot, query_text=query_text, budget=budget)
     processed_evidence = result.evidence
     if getattr(arguments, "reranker_id", None):
         processed_evidence = apply_evidence_reranker(
@@ -693,7 +708,7 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:

 def cmd_eval(arguments: argparse.Namespace) -> int:
     """
-    Evaluate a retrieval
+    Evaluate a retrieval snapshot against a dataset.

     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -705,13 +720,15 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    if not
-        raise ValueError(
-
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
+        raise ValueError(
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
+        )
+    snapshot = corpus.load_snapshot(snapshot_id)
     dataset = load_dataset(Path(arguments.dataset))
     budget = _budget_from_args(arguments)
-    result =
+    result = evaluate_snapshot(corpus=corpus, snapshot=snapshot, dataset=dataset, budget=budget)
     print(result.model_dump_json(indent=2))
     return 0

@@ -751,29 +768,33 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-        arguments.
-
-        mapping_error_message="Topic modeling
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Topic modeling configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError("Topic analysis requires an extraction
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError("Topic analysis requires an extraction snapshot to supply text inputs")
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -781,12 +802,12 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid topic modeling
+        raise ValueError(f"Invalid topic modeling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -800,7 +821,11 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
@@ -808,28 +833,30 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )

-
-    if arguments.
-
-            arguments.
-
-            mapping_error_message="Profiling
+    configuration_data: dict[str, object] = {}
+    if arguments.configuration is not None:
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Profiling configuration must be a mapping/object",
         )
-        overrides = parse_dotted_overrides(arguments.
-
+        overrides = parse_dotted_overrides(arguments.override)
+        configuration_data = apply_dotted_overrides(configuration_data, overrides)
     else:
-        overrides = parse_dotted_overrides(arguments.
+        overrides = parse_dotted_overrides(arguments.override)
         if overrides:
-
+            configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError(
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Profiling analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -837,12 +864,12 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid profiling
+        raise ValueError(f"Invalid profiling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -856,29 +883,35 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )

     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-        arguments.
-
-        mapping_error_message="Markov analysis
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Markov analysis configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)

-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError(
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Markov analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )

@@ -886,12 +919,12 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid Markov analysis
+        raise ValueError(f"Invalid Markov analysis configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0

@@ -977,41 +1010,46 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_purge.set_defaults(func=cmd_purge)

-    p_build = sub.add_parser("build", help="Build a retrieval
+    p_build = sub.add_parser("build", help="Build a retrieval snapshot for the corpus.")
     _add_common_corpus_arg(p_build)
     p_build.add_argument(
-        "--
+        "--retriever",
         required=True,
-        help="
+        help="Retriever identifier (for example, scan, sqlite-full-text-search).",
     )
-    p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
     p_build.add_argument(
-        "--
+        "--configuration-name", default="default", help="Human-readable configuration name."
+    )
+    p_build.add_argument(
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML
+        help="Path to YAML configuration file (repeatable). If provided, files are composed in precedence order.",
    )
     p_build.add_argument(
+        "--override",
         "--config",
         action="append",
         default=None,
-        help="
+        help="Configuration override as key=value (repeatable). Dotted keys create nested config mappings.",
     )
     p_build.set_defaults(func=cmd_build)

-    p_extract = sub.add_parser(
+    p_extract = sub.add_parser(
+        "extract", help="Work with text extraction snapshots for the corpus."
+    )
     extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)

-    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction snapshot.")
     _add_common_corpus_arg(p_extract_build)
     p_extract_build.add_argument(
-        "--
+        "--configuration-name", default="default", help="Human-readable configuration name."
     )
     p_extract_build.add_argument(
-        "--
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML
+        help="Path to YAML configuration file. If provided, --step arguments are ignored.",
     )
     p_extract_build.add_argument(
         "--step",
@@ -1021,7 +1059,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_build.set_defaults(func=cmd_extract_build)

-    p_extract_list = extract_sub.add_parser("list", help="List extraction
+    p_extract_list = extract_sub.add_parser("list", help="List extraction snapshots.")
     _add_common_corpus_arg(p_extract_list)
     p_extract_list.add_argument(
         "--extractor-id",
@@ -1030,37 +1068,39 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_list.set_defaults(func=cmd_extract_list)

-    p_extract_show = extract_sub.add_parser("show", help="Show an extraction
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction snapshot manifest.")
     _add_common_corpus_arg(p_extract_show)
     p_extract_show.add_argument(
-        "--
+        "--snapshot",
         required=True,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_show.set_defaults(func=cmd_extract_show)

-    p_extract_delete = extract_sub.add_parser(
+    p_extract_delete = extract_sub.add_parser(
+        "delete", help="Delete an extraction snapshot directory."
+    )
     _add_common_corpus_arg(p_extract_delete)
     p_extract_delete.add_argument(
-        "--
+        "--snapshot",
         required=True,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_delete.add_argument(
         "--confirm",
         required=True,
-        help="Type the exact extractor_id:
+        help="Type the exact extractor_id:snapshot_id to confirm deletion.",
     )
     p_extract_delete.set_defaults(func=cmd_extract_delete)

     p_extract_evaluate = extract_sub.add_parser(
-        "evaluate", help="Evaluate an extraction
+        "evaluate", help="Evaluate an extraction snapshot against a dataset."
     )
     _add_common_corpus_arg(p_extract_evaluate)
     p_extract_evaluate.add_argument(
-        "--
+        "--snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id (defaults to latest snapshot).",
     )
     p_extract_evaluate.add_argument(
         "--dataset",
@@ -1071,8 +1111,10 @@ def build_parser() -> argparse.ArgumentParser:

     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
-    p_query.add_argument(
-
+    p_query.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
+    p_query.add_argument("--retriever", default=None, help="Validate retriever identifier.")
     p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
     p_query.add_argument(
         "--offset",
@@ -1132,9 +1174,11 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_context_pack_build.set_defaults(func=cmd_context_pack_build)

-    p_eval = sub.add_parser("eval", help="Evaluate a
+    p_eval = sub.add_parser("eval", help="Evaluate a snapshot against a dataset.")
     _add_common_corpus_arg(p_eval)
-    p_eval.add_argument(
+    p_eval.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
     p_eval.add_argument(
         "--dataset",
         required=True,
@@ -1170,78 +1214,81 @@ def build_parser() -> argparse.ArgumentParser:
     p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
     _add_common_corpus_arg(p_analyze_topics)
     p_analyze_topics.add_argument(
-        "--
+        "--configuration",
         required=True,
         action="append",
-        help="Path to topic modeling
+        help="Path to topic modeling configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_topics.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_topics.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_topics.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)

     p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
     _add_common_corpus_arg(p_analyze_profile)
     p_analyze_profile.add_argument(
-        "--
+        "--configuration",
         default=None,
         action="append",
-        help="Optional profiling
+        help="Optional profiling configuration YAML file. Repeatable; later files override earlier ones.",
     )
     p_analyze_profile.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_profile.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_profile.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_profile.set_defaults(func=cmd_analyze_profile)

     p_analyze_markov = analyze_sub.add_parser("markov", help="Run Markov analysis.")
     _add_common_corpus_arg(p_analyze_markov)
     p_analyze_markov.add_argument(
-        "--
+        "--configuration",
         required=True,
         action="append",
-        help="Path to Markov analysis
+        help="Path to Markov analysis configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_markov.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_markov.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_markov.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_markov.set_defaults(func=cmd_analyze_markov)

@@ -1266,7 +1313,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
         FileExistsError,
         KeyError,
         ValueError,
-
+        ExtractionSnapshotFatalError,
         NotImplementedError,
         ValidationError,
     ) as exception:
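Taken together, the build_parser() changes move the user-facing flags to the snapshot/configuration/retriever vocabulary: build now takes --retriever, --recipe-name is replaced by --configuration-name, --override is added alongside --config for dotted key=value overrides, and query and eval accept --snapshot. A minimal sketch of driving the new flags through the main() entry point shown in the last hunk, assuming a corpus already exists in the current working directory and using a placeholder override key:

    # Sketch only: flag names taken from the build_parser() hunks above; the
    # override key and query text are placeholders, and a corpus is assumed
    # to exist in the current working directory.
    from biblicus.cli import main

    # Build a retrieval snapshot with the renamed flags.
    exit_code = main([
        "build",
        "--retriever", "scan",
        "--configuration-name", "default",
        "--override", "example.key=value",  # placeholder dotted override
    ])

    # Query falls back to the latest recorded snapshot when --snapshot is omitted.
    exit_code = main(["query", "--query", "example search terms"])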