biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/cli.py
CHANGED
@@ -13,7 +13,6 @@ from typing import Dict, Iterable, List, Optional
 from pydantic import ValidationError
 
 from .analysis import get_analysis_backend
-from .backends import get_backend
 from .context import (
     CharacterBudget,
     ContextPackPolicy,
@@ -24,16 +23,17 @@ from .context import (
 )
 from .corpus import Corpus
 from .crawl import CrawlRequest, crawl_into_corpus
-from .errors import
-from .evaluation import
+from .errors import ExtractionSnapshotFatalError, IngestCollisionError
+from .evaluation import evaluate_snapshot, load_dataset
 from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
-from .extraction import
+from .extraction import build_extraction_snapshot
 from .extraction_evaluation import (
-
+    evaluate_extraction_snapshot,
     load_extraction_dataset,
     write_extraction_evaluation_result,
 )
-from .models import QueryBudget, RetrievalResult,
+from .models import QueryBudget, RetrievalResult, parse_extraction_snapshot_reference
+from .retrievers import get_retriever
 from .uris import corpus_ref_to_path
 
 
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
 
     results = []
 
-
-
-
-
-
-
-
+    try:
+        if arguments.note is not None or arguments.stdin:
+            text = arguments.note if arguments.note is not None else sys.stdin.read()
+            ingest_result = corpus.ingest_note(
+                text,
+                title=arguments.title,
+                tags=tags,
+                source_uri=None if arguments.stdin else None,
+            )
+            results.append(ingest_result)
+
+        for source_path in arguments.files or []:
+            results.append(corpus.ingest_source(source_path, tags=tags))
+    except IngestCollisionError as error:
+        print(
+            "Ingest failed: source already ingested\n"
+            f"source_uri: {error.source_uri}\n"
+            f"existing_item_id: {error.existing_item_id}\n"
+            f"existing_relpath: {error.existing_relpath}",
+            file=sys.stderr,
         )
-
-
-    for source_path in arguments.files or []:
-        results.append(corpus.ingest_source(source_path, tags=tags))
+        return 3
 
     if not results:
         print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
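Reviewer note: the new ingest path wraps note and file ingestion in a single try block and maps IngestCollisionError to exit code 3. A minimal programmatic sketch of the same handling, using the calls shown in the hunk above (the corpus path, file name, and tag are illustrative):

```python
import sys

from biblicus.corpus import Corpus
from biblicus.errors import IngestCollisionError

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location
try:
    # ingest_source and its tags keyword appear in the diff above; the arguments are placeholders.
    result = corpus.ingest_source("notes/meeting.md", tags=["meeting"])
except IngestCollisionError as error:
    # Mirror the CLI: report the existing item instead of ingesting a duplicate.
    print(f"already ingested: {error.source_uri} -> {error.existing_item_id}", file=sys.stderr)
else:
    print(result)
```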
@@ -374,55 +384,63 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     return QueryBudget(
         max_total_items=arguments.max_total_items,
         offset=getattr(arguments, "offset", 0),
-
+        maximum_total_characters=arguments.maximum_total_characters,
         max_items_per_source=arguments.max_items_per_source,
     )
 
 
 def cmd_build(arguments: argparse.Namespace) -> int:
     """
-    Build a retrieval
+    Build a retrieval snapshot for a retriever.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
    :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )
 
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
+    retriever = get_retriever(arguments.retriever)
 
     base_config: Dict[str, object] = {}
-    if getattr(arguments, "
-        base_config =
-            arguments.
-
-            mapping_error_message="Retrieval
+    if getattr(arguments, "configuration", None):
+        base_config = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Retrieval snapshot configuration must be a mapping/object",
         )
 
-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration = apply_dotted_overrides(base_config, overrides)
 
-
-
+    snapshot = retriever.build_snapshot(
+        corpus,
+        configuration_name=arguments.configuration_name,
+        configuration=configuration,
+    )
+    print(snapshot.model_dump_json(indent=2))
     return 0
 
 
 def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
-    Build a text extraction
+    Build a text extraction snapshot for the corpus using a pipeline of extractors.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import load_configuration_view
 
     corpus = (
         Corpus.open(arguments.corpus)
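Reviewer note: cmd_build now resolves a retriever via get_retriever and delegates to retriever.build_snapshot instead of the removed backends module. A sketch of the equivalent library calls; the retriever identifier comes from the --retriever help text and the configuration values are placeholders:

```python
from biblicus.corpus import Corpus
from biblicus.retrievers import get_retriever

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location
retriever = get_retriever("scan")  # "scan" is one of the identifiers named in the help text

# Configuration mapping as it would arrive from --configuration files plus --override pairs.
snapshot = retriever.build_snapshot(
    corpus,
    configuration_name="default",
    configuration={},  # empty mapping; real keys depend on the retriever
)
print(snapshot.model_dump_json(indent=2))
```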
@@ -430,17 +448,17 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )
 
-    # Load
-    if getattr(arguments, "
-
-        arguments.
-
-            mapping_error_message="Extraction
+    # Load configuration from file if --configuration is provided
+    if getattr(arguments, "configuration", None):
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Extraction configuration must be a mapping/object",
         )
-        loaded_extractor_id =
-        loaded_config =
+        loaded_extractor_id = configuration_data.get("extractor_id", "pipeline")
+        loaded_config = configuration_data.get("configuration", {})
 
-        # If the
+        # If the configuration specifies a non-pipeline extractor, wrap it in a pipeline
         if loaded_extractor_id != "pipeline":
             extractor_id = "pipeline"
             config = {
@@ -466,11 +484,11 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
         config = {"steps": steps}
         extractor_id = "pipeline"
 
-    manifest =
+    manifest = build_extraction_snapshot(
         corpus,
         extractor_id=extractor_id,
-
-
+        configuration_name=arguments.configuration_name,
+        configuration=config,
     )
     print(manifest.model_dump_json(indent=2))
     return 0
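Reviewer note: extraction builds always go through the "pipeline" extractor, with non-pipeline extractors wrapped in a single-step pipeline before build_extraction_snapshot is called. A hedged sketch of the direct call; the shape of the step entry below is an assumption, since only the {"steps": ...} envelope appears in this diff:

```python
from biblicus.corpus import Corpus
from biblicus.extraction import build_extraction_snapshot

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location

# Single-step pipeline configuration; "some-extractor-id" and the step keys are placeholders.
config = {"steps": [{"extractor_id": "some-extractor-id", "configuration": {}}]}

manifest = build_extraction_snapshot(
    corpus,
    extractor_id="pipeline",
    configuration_name="default",
    configuration=config,
)
print(manifest.model_dump_json(indent=2))
```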
@@ -478,7 +496,7 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
 
 def cmd_extract_list(arguments: argparse.Namespace) -> int:
     """
-    List extraction
+    List extraction snapshots stored under the corpus.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -490,14 +508,14 @@ def cmd_extract_list(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    print(json.dumps([entry.model_dump() for entry in
+    snapshots = corpus.list_extraction_snapshots(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in snapshots], indent=2))
     return 0
 
 
 def cmd_extract_show(arguments: argparse.Namespace) -> int:
     """
-    Show an extraction
+    Show an extraction snapshot manifest.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -509,9 +527,9 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    reference =
-    manifest = corpus.
-        extractor_id=reference.extractor_id,
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    manifest = corpus.load_extraction_snapshot_manifest(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
     )
     print(manifest.model_dump_json(indent=2))
     return 0
@@ -519,7 +537,7 @@ def cmd_extract_show(arguments: argparse.Namespace) -> int:
 
 def cmd_extract_delete(arguments: argparse.Namespace) -> int:
     """
-    Delete an extraction
+    Delete an extraction snapshot directory and its derived artifacts.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -531,17 +549,19 @@ def cmd_extract_delete(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.confirm != arguments.
-        raise ValueError("Refusing to delete extraction
-    reference =
-    corpus.
-
+    if arguments.confirm != arguments.snapshot:
+        raise ValueError("Refusing to delete extraction snapshot without an exact --confirm match.")
+    reference = parse_extraction_snapshot_reference(arguments.snapshot)
+    corpus.delete_extraction_snapshot(
+        extractor_id=reference.extractor_id, snapshot_id=reference.snapshot_id
+    )
+    print(json.dumps({"deleted": True, "snapshot": arguments.snapshot}, indent=2))
     return 0
 
 
 def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     """
-    Evaluate an extraction
+    Evaluate an extraction snapshot against a dataset.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -553,14 +573,14 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    if arguments.
-
+    if arguments.snapshot:
+        snapshot_ref = parse_extraction_snapshot_reference(arguments.snapshot)
     else:
-
-        if
-            raise ValueError("Extraction evaluation requires an extraction
+        snapshot_ref = corpus.latest_extraction_snapshot_reference()
+        if snapshot_ref is None:
+            raise ValueError("Extraction evaluation requires an extraction snapshot")
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --snapshot for reproducibility.",
             file=sys.stderr,
         )
 
@@ -572,17 +592,19 @@ def cmd_extract_evaluate(arguments: argparse.Namespace) -> int:
     except ValidationError as exc:
         raise ValueError(f"Invalid extraction dataset: {exc}") from exc
 
-
-        extractor_id=
-
+    snapshot = corpus.load_extraction_snapshot_manifest(
+        extractor_id=snapshot_ref.extractor_id,
+        snapshot_id=snapshot_ref.snapshot_id,
     )
-    result =
+    result = evaluate_extraction_snapshot(
         corpus=corpus,
-
-        extractor_id=
+        snapshot=snapshot,
+        extractor_id=snapshot_ref.extractor_id,
         dataset=dataset,
     )
-    write_extraction_evaluation_result(
+    write_extraction_evaluation_result(
+        corpus=corpus, snapshot_id=snapshot.snapshot_id, result=result
+    )
     print(result.model_dump_json(indent=2))
     return 0
 
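Reviewer note: extraction snapshots are now addressed throughout by extractor_id:snapshot_id references. A sketch of resolving such a reference and loading its manifest, using the calls shown above (the reference string is illustrative, not a real snapshot):

```python
from biblicus.corpus import Corpus
from biblicus.models import parse_extraction_snapshot_reference

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location

# "pipeline:20240101T000000" is an illustrative extractor_id:snapshot_id reference.
reference = parse_extraction_snapshot_reference("pipeline:20240101T000000")
manifest = corpus.load_extraction_snapshot_manifest(
    extractor_id=reference.extractor_id,
    snapshot_id=reference.snapshot_id,
)
print(manifest.model_dump_json(indent=2))
```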
@@ -601,18 +623,21 @@ def cmd_query(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    if not
-        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
-    run = corpus.load_run(run_id)
-    if arguments.backend and arguments.backend != run.recipe.backend_id:
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
         raise ValueError(
-
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
         )
-
+    snapshot = corpus.load_snapshot(snapshot_id)
+    if arguments.retriever and arguments.retriever != snapshot.configuration.retriever_id:
+        raise ValueError(
+            "Retriever mismatch: snapshot uses "
+            f"{snapshot.configuration.retriever_id!r} but {arguments.retriever!r} was requested"
+        )
+    retriever = get_retriever(snapshot.configuration.retriever_id)
     query_text = arguments.query if arguments.query is not None else sys.stdin.read()
     budget = _budget_from_args(arguments)
-    result =
+    result = retriever.query(corpus, snapshot=snapshot, query_text=query_text, budget=budget)
     processed_evidence = result.evidence
     if getattr(arguments, "reranker_id", None):
         processed_evidence = apply_evidence_reranker(
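Reviewer note: queries now resolve the snapshot first and take the retriever from the snapshot's configuration, replacing the old run/recipe/backend lookup. A sketch of the same flow outside the CLI, assuming a snapshot has already been built; the budget values are the CLI defaults:

```python
from biblicus.corpus import Corpus
from biblicus.models import QueryBudget
from biblicus.retrievers import get_retriever

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location
snapshot = corpus.load_snapshot(corpus.latest_snapshot_id)  # assumes a latest snapshot exists
retriever = get_retriever(snapshot.configuration.retriever_id)

budget = QueryBudget(
    max_total_items=5,
    offset=0,
    maximum_total_characters=2000,
    max_items_per_source=5,
)
result = retriever.query(corpus, snapshot=snapshot, query_text="example query", budget=budget)
for evidence in result.evidence:
    print(evidence)
```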
@@ -683,7 +708,7 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
 
 def cmd_eval(arguments: argparse.Namespace) -> int:
     """
-    Evaluate a retrieval
+    Evaluate a retrieval snapshot against a dataset.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
@@ -695,13 +720,15 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-    if not
-        raise ValueError(
-
+    snapshot_id = arguments.snapshot or corpus.latest_snapshot_id
+    if not snapshot_id:
+        raise ValueError(
+            "No snapshot identifier provided and no latest snapshot is recorded for this corpus"
+        )
+    snapshot = corpus.load_snapshot(snapshot_id)
     dataset = load_dataset(Path(arguments.dataset))
     budget = _budget_from_args(arguments)
-    result =
+    result = evaluate_snapshot(corpus=corpus, snapshot=snapshot, dataset=dataset, budget=budget)
     print(result.model_dump_json(indent=2))
     return 0
 
@@ -741,29 +768,33 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )
 
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-        arguments.
-
-        mapping_error_message="Topic modeling
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Topic modeling configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)
 
-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError("Topic analysis requires an extraction
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError("Topic analysis requires an extraction snapshot to supply text inputs")
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )
 
@@ -771,12 +802,12 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid topic modeling
+        raise ValueError(f"Invalid topic modeling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0
 
@@ -790,7 +821,11 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )
 
     corpus = (
         Corpus.open(arguments.corpus)
@@ -798,28 +833,30 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
         else Corpus.find(Path.cwd())
     )
 
-
-    if arguments.
-
-        arguments.
-
-            mapping_error_message="Profiling
+    configuration_data: dict[str, object] = {}
+    if arguments.configuration is not None:
+        configuration_data = load_configuration_view(
+            arguments.configuration,
+            configuration_label="Configuration file",
+            mapping_error_message="Profiling configuration must be a mapping/object",
        )
-        overrides = parse_dotted_overrides(arguments.
-
+        overrides = parse_dotted_overrides(arguments.override)
+        configuration_data = apply_dotted_overrides(configuration_data, overrides)
     else:
-        overrides = parse_dotted_overrides(arguments.
+        overrides = parse_dotted_overrides(arguments.override)
         if overrides:
-
+            configuration_data = apply_dotted_overrides(configuration_data, overrides)
 
-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError(
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Profiling analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
             file=sys.stderr,
         )
 
@@ -827,12 +864,12 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
        )
     except ValidationError as exc:
-        raise ValueError(f"Invalid profiling
+        raise ValueError(f"Invalid profiling configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0
 
@@ -846,29 +883,35 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    from .
+    from .configuration import (
+        apply_dotted_overrides,
+        load_configuration_view,
+        parse_dotted_overrides,
+    )
 
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-        arguments.
-
-        mapping_error_message="Markov analysis
+    configuration_data = load_configuration_view(
+        arguments.configuration,
+        configuration_label="Configuration file",
+        mapping_error_message="Markov analysis configuration must be a mapping/object",
     )
-    overrides = parse_dotted_overrides(arguments.
-
+    overrides = parse_dotted_overrides(arguments.override)
+    configuration_data = apply_dotted_overrides(configuration_data, overrides)
 
-    if arguments.
-
+    if arguments.extraction_snapshot:
+        extraction_snapshot = parse_extraction_snapshot_reference(arguments.extraction_snapshot)
     else:
-
-        if
-            raise ValueError(
+        extraction_snapshot = corpus.latest_extraction_snapshot_reference()
+        if extraction_snapshot is None:
+            raise ValueError(
+                "Markov analysis requires an extraction snapshot to supply text inputs"
+            )
         print(
-            "Warning: using latest extraction
+            "Warning: using latest extraction snapshot; pass --extraction-snapshot for reproducibility.",
            file=sys.stderr,
         )
 
@@ -876,12 +919,12 @@ def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
     try:
         output = backend.run_analysis(
             corpus,
-
-
-
+            configuration_name=arguments.configuration_name,
+            configuration=configuration_data,
+            extraction_snapshot=extraction_snapshot,
         )
     except ValidationError as exc:
-        raise ValueError(f"Invalid Markov analysis
+        raise ValueError(f"Invalid Markov analysis configuration: {exc}") from exc
     print(output.model_dump_json(indent=2))
     return 0
 
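Reviewer note: the three analysis commands share one pattern: compose --configuration files, apply dotted --override pairs, resolve an extraction snapshot, and pass everything to backend.run_analysis. A hedged sketch of that pattern; the configuration file name, override key, and backend identifier are assumptions for illustration only:

```python
from biblicus.analysis import get_analysis_backend
from biblicus.configuration import (
    apply_dotted_overrides,
    load_configuration_view,
    parse_dotted_overrides,
)
from biblicus.corpus import Corpus

corpus = Corpus.open("./my-corpus")  # hypothetical corpus location

configuration = load_configuration_view(
    ["topics.yaml"],  # illustrative list, as produced by repeated --configuration flags
    configuration_label="Configuration file",
    mapping_error_message="Topic modeling configuration must be a mapping/object",
)
overrides = parse_dotted_overrides(["model.num_topics=8"])  # illustrative dotted override
configuration = apply_dotted_overrides(configuration, overrides)

backend = get_analysis_backend("topic-modeling")  # backend identifier is an assumption
output = backend.run_analysis(
    corpus,
    configuration_name="default",
    configuration=configuration,
    extraction_snapshot=corpus.latest_extraction_snapshot_reference(),
)
print(output.model_dump_json(indent=2))
```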
@@ -967,41 +1010,46 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_purge.set_defaults(func=cmd_purge)
 
-    p_build = sub.add_parser("build", help="Build a retrieval
+    p_build = sub.add_parser("build", help="Build a retrieval snapshot for the corpus.")
     _add_common_corpus_arg(p_build)
     p_build.add_argument(
-        "--
+        "--retriever",
         required=True,
-        help="
+        help="Retriever identifier (for example, scan, sqlite-full-text-search).",
+    )
+    p_build.add_argument(
+        "--configuration-name", default="default", help="Human-readable configuration name."
     )
-    p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
     p_build.add_argument(
-        "--
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML
+        help="Path to YAML configuration file (repeatable). If provided, files are composed in precedence order.",
     )
     p_build.add_argument(
+        "--override",
         "--config",
         action="append",
         default=None,
-        help="
+        help="Configuration override as key=value (repeatable). Dotted keys create nested config mappings.",
     )
     p_build.set_defaults(func=cmd_build)
 
-    p_extract = sub.add_parser(
+    p_extract = sub.add_parser(
+        "extract", help="Work with text extraction snapshots for the corpus."
+    )
     extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
 
-    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction snapshot.")
     _add_common_corpus_arg(p_extract_build)
     p_extract_build.add_argument(
-        "--
+        "--configuration-name", default="default", help="Human-readable configuration name."
     )
     p_extract_build.add_argument(
-        "--
+        "--configuration",
         default=None,
         action="append",
-        help="Path to YAML
+        help="Path to YAML configuration file. If provided, --step arguments are ignored.",
     )
     p_extract_build.add_argument(
         "--step",
@@ -1011,7 +1059,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_build.set_defaults(func=cmd_extract_build)
 
-    p_extract_list = extract_sub.add_parser("list", help="List extraction
+    p_extract_list = extract_sub.add_parser("list", help="List extraction snapshots.")
     _add_common_corpus_arg(p_extract_list)
     p_extract_list.add_argument(
         "--extractor-id",
@@ -1020,37 +1068,39 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_extract_list.set_defaults(func=cmd_extract_list)
 
-    p_extract_show = extract_sub.add_parser("show", help="Show an extraction
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction snapshot manifest.")
     _add_common_corpus_arg(p_extract_show)
     p_extract_show.add_argument(
-        "--
+        "--snapshot",
         required=True,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_show.set_defaults(func=cmd_extract_show)
 
-    p_extract_delete = extract_sub.add_parser(
+    p_extract_delete = extract_sub.add_parser(
+        "delete", help="Delete an extraction snapshot directory."
+    )
     _add_common_corpus_arg(p_extract_delete)
     p_extract_delete.add_argument(
-        "--
+        "--snapshot",
         required=True,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_extract_delete.add_argument(
         "--confirm",
         required=True,
-        help="Type the exact extractor_id:
+        help="Type the exact extractor_id:snapshot_id to confirm deletion.",
     )
     p_extract_delete.set_defaults(func=cmd_extract_delete)
 
     p_extract_evaluate = extract_sub.add_parser(
-        "evaluate", help="Evaluate an extraction
+        "evaluate", help="Evaluate an extraction snapshot against a dataset."
     )
     _add_common_corpus_arg(p_extract_evaluate)
     p_extract_evaluate.add_argument(
-        "--
+        "--snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id (defaults to latest snapshot).",
     )
     p_extract_evaluate.add_argument(
         "--dataset",
@@ -1061,8 +1111,10 @@ def build_parser() -> argparse.ArgumentParser:
 
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
-    p_query.add_argument(
-
+    p_query.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
+    p_query.add_argument("--retriever", default=None, help="Validate retriever identifier.")
     p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
     p_query.add_argument(
         "--offset",
@@ -1071,7 +1123,7 @@ def build_parser() -> argparse.ArgumentParser:
         help="Skip this many ranked candidates before selecting evidence (pagination).",
     )
     p_query.add_argument("--max-total-items", type=int, default=5)
-    p_query.add_argument("--
+    p_query.add_argument("--maximum-total-characters", type=int, default=2000)
     p_query.add_argument("--max-items-per-source", type=int, default=5)
     p_query.add_argument(
         "--reranker-id",
@@ -1122,16 +1174,18 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_context_pack_build.set_defaults(func=cmd_context_pack_build)
 
-    p_eval = sub.add_parser("eval", help="Evaluate a
+    p_eval = sub.add_parser("eval", help="Evaluate a snapshot against a dataset.")
     _add_common_corpus_arg(p_eval)
-    p_eval.add_argument(
+    p_eval.add_argument(
+        "--snapshot", default=None, help="Snapshot identifier (defaults to latest snapshot)."
+    )
     p_eval.add_argument(
         "--dataset",
         required=True,
         help="Path to dataset JavaScript Object Notation file.",
     )
     p_eval.add_argument("--max-total-items", type=int, default=5)
-    p_eval.add_argument("--
+    p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)
 
@@ -1160,78 +1214,81 @@ def build_parser() -> argparse.ArgumentParser:
     p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
     _add_common_corpus_arg(p_analyze_topics)
     p_analyze_topics.add_argument(
-        "--
+        "--configuration",
         required=True,
         action="append",
-        help="Path to topic modeling
+        help="Path to topic modeling configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_topics.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
    )
     p_analyze_topics.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_topics.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)
 
     p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
     _add_common_corpus_arg(p_analyze_profile)
     p_analyze_profile.add_argument(
-        "--
+        "--configuration",
         default=None,
         action="append",
-        help="Optional profiling
+        help="Optional profiling configuration YAML file. Repeatable; later files override earlier ones.",
     )
     p_analyze_profile.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
     )
     p_analyze_profile.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_profile.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_profile.set_defaults(func=cmd_analyze_profile)
 
     p_analyze_markov = analyze_sub.add_parser("markov", help="Run Markov analysis.")
     _add_common_corpus_arg(p_analyze_markov)
     p_analyze_markov.add_argument(
-        "--
+        "--configuration",
         required=True,
         action="append",
-        help="Path to Markov analysis
+        help="Path to Markov analysis configuration YAML. Repeatable; later files override earlier ones.",
     )
     p_analyze_markov.add_argument(
+        "--override",
         "--config",
         action="append",
         default=[],
-        help="Override key=value pairs applied after composing
+        help="Override key=value pairs applied after composing configurations (supports dotted keys).",
    )
     p_analyze_markov.add_argument(
-        "--
+        "--configuration-name",
         default="default",
-        help="Human-readable
+        help="Human-readable configuration name.",
     )
     p_analyze_markov.add_argument(
-        "--extraction-
+        "--extraction-snapshot",
         default=None,
-        help="Extraction
+        help="Extraction snapshot reference in the form extractor_id:snapshot_id.",
     )
     p_analyze_markov.set_defaults(func=cmd_analyze_markov)
 
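Reviewer note: the parser now exposes --retriever, --configuration, --configuration-name, --override (with --config kept as an alias), and --snapshot options across the build, query, eval, extract, and analyze subcommands. A sketch of driving the new flags through main, which accepts an argument list; the corpus is assumed to be discoverable from the working directory and the override key is illustrative:

```python
from biblicus.cli import main

# Build a retrieval snapshot; "scan" matches the example in the --retriever help text.
exit_code = main(
    [
        "build",
        "--retriever", "scan",
        "--configuration-name", "default",
        "--override", "index.chunk_size=512",  # illustrative dotted key; creates a nested mapping
    ]
)
print("build exit code:", exit_code)

# Query the latest snapshot with the default budget limits.
exit_code = main(["query", "--query", "example query", "--max-total-items", "3"])
print("query exit code:", exit_code)
```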
@@ -1256,7 +1313,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
         FileExistsError,
         KeyError,
         ValueError,
-
+        ExtractionSnapshotFatalError,
         NotImplementedError,
         ValidationError,
     ) as exception: