biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/cli.py ADDED
@@ -0,0 +1,804 @@
1
+ """
2
+ Command-line interface for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
+ from pydantic import ValidationError
14
+
15
+ from .backends import get_backend
16
+ from .context import (
17
+ ContextPackPolicy,
18
+ TokenBudget,
19
+ build_context_pack,
20
+ fit_context_pack_to_token_budget,
21
+ )
22
+ from .corpus import Corpus
23
+ from .crawl import CrawlRequest, crawl_into_corpus
24
+ from .errors import ExtractionRunFatalError
25
+ from .evaluation import evaluate_run, load_dataset
26
+ from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
27
+ from .extraction import build_extraction_run
28
+ from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
29
+ from .uris import corpus_ref_to_path
30
+
31
+
32
def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
    """
    Register the shared ``--corpus`` option on a parser.

    The option default is ``argparse.SUPPRESS``, so the ``corpus`` attribute is only set by this parser
    when the option is actually supplied.

    :param parser: Argument parser to modify.
    :type parser: argparse.ArgumentParser
    :return: None.
    :rtype: None
    """
    corpus_help = (
        "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
        "upward)."
    )
    parser.add_argument(
        "--corpus",
        dest="corpus",
        type=str,
        default=argparse.SUPPRESS,
        help=corpus_help,
    )
51
+
52
+
53
def cmd_init(arguments: argparse.Namespace) -> int:
    """
    Create a new corpus at the location named by ``arguments.path``.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    target_path = corpus_ref_to_path(arguments.path)
    created = Corpus.init(target_path, force=arguments.force)
    print(f"Initialized corpus at {created.root}")
    return 0
66
+
67
+
68
def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
    """
    Combine comma-separated and repeated tag inputs into one ordered, duplicate-free list.

    Tags from ``raw`` come first, followed by tags from ``raw_list``. Each tag is stripped of surrounding
    whitespace, empty tags are dropped, and only the first occurrence of a tag is kept.

    :param raw: Comma-separated tag string.
    :type raw: str or None
    :param raw_list: Repeated tag list.
    :type raw_list: list[str] or None
    :return: Deduplicated tag list.
    :rtype: list[str]
    """
    candidates: List[str] = []
    if raw:
        candidates += raw.split(",")
    if raw_list:
        candidates += raw_list
    stripped = (candidate.strip() for candidate in candidates)
    # dict preserves insertion order, so this keeps the first occurrence of each tag.
    return list(dict.fromkeys(tag for tag in stripped if tag))
92
+
93
+
94
def cmd_ingest(arguments: argparse.Namespace) -> int:
    """
    Ingest items into a corpus from command-line interface arguments.

    A note (from ``--note`` or standard input) is ingested first, followed by each file path. One
    tab-separated line (item id, relative path, sha256) is printed per ingested item.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code (0 on success, 2 when nothing was supplied to ingest).
    :rtype: int
    """
    corpus = (
        Corpus.open(arguments.corpus)
        if getattr(arguments, "corpus", None)
        else Corpus.find(Path.cwd())
    )
    tags = _parse_tags(arguments.tags, arguments.tag)

    results = []

    if arguments.note is not None or arguments.stdin:
        # --note takes precedence over --stdin when both are supplied, so the recorded source must
        # describe the text that was actually used rather than merely whether --stdin was passed.
        if arguments.note is not None:
            text = arguments.note
            source_uri = "text"
        else:
            text = sys.stdin.read()
            source_uri = "stdin"
        ingest_result = corpus.ingest_note(
            text,
            title=arguments.title,
            tags=tags,
            source_uri=source_uri,
        )
        results.append(ingest_result)

    for source_path in arguments.files or []:
        results.append(corpus.ingest_source(source_path, tags=tags))

    if not results:
        print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
        return 2

    for ingest_result in results:
        print(f"{ingest_result.item_id}\t{ingest_result.relpath}\t{ingest_result.sha256}")
    return 0
132
+
133
+
134
def cmd_list(arguments: argparse.Namespace) -> int:
    """
    Print one tab-separated line per corpus item.

    Columns are: identifier, creation time, relative path, title (empty when missing), and the
    comma-joined tags.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    for entry in corpus.list_items(limit=arguments.limit):
        title_text = entry.title or ""
        tag_text = ",".join(entry.tags)
        print(f"{entry.id}\t{entry.created_at}\t{entry.relpath}\t{title_text}\t{tag_text}")
    return 0
153
+
154
+
155
def cmd_show(arguments: argparse.Namespace) -> int:
    """
    Print a single corpus item as indented JavaScript Object Notation.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    found = corpus.get_item(arguments.id)
    print(found.model_dump_json(indent=2))
    return 0
172
+
173
+
174
def cmd_reindex(arguments: argparse.Namespace) -> int:
    """
    Rebuild the corpus catalog and print the resulting statistics.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    reindex_stats = corpus.reindex()
    rendered = json.dumps(reindex_stats, indent=2, sort_keys=False)
    print(rendered)
    return 0
191
+
192
+
193
def cmd_import_tree(arguments: argparse.Namespace) -> int:
    """
    Import a folder tree into a corpus and print the import statistics.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    import_tags = _parse_tags(arguments.tags, arguments.tag)
    tree_root = Path(arguments.path)
    import_stats = corpus.import_tree(tree_root, tags=import_tags)
    print(json.dumps(import_stats, indent=2, sort_keys=False))
    return 0
211
+
212
+
213
def cmd_purge(arguments: argparse.Namespace) -> int:
    """
    Purge all items and derived artifacts from a corpus.

    The command refuses to run unless ``--confirm`` was supplied; the supplied value is handed to the
    corpus purge call.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no confirmation value was supplied.
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    confirmation = arguments.confirm
    if confirmation is None:
        raise ValueError(f"Purging is dangerous: pass --confirm {corpus.name!r} to proceed")
    corpus.purge(confirm=confirmation)
    print(f"Purged corpus {corpus.root}")
    return 0
232
+
233
+
234
def _coerce_config_value(raw: str) -> object:
    """
    Convert a raw config string to an int or float when it parses as one.

    :param raw: Raw value text taken from the right-hand side of a key=value pair.
    :type raw: str
    :return: Integer if the text parses as an integer, else float if it parses as a float, else the text.
    :rtype: object
    """
    # Trying int() directly (rather than gating on str.isdigit) keeps signed integers such as "-5" as
    # integers and avoids crashing on digit-like characters that int() cannot parse.
    for convert in (int, float):
        try:
            return convert(raw)
        except ValueError:
            continue
    return raw


def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
    """
    Parse repeated key=value config pairs.

    Each entry is split on the first ``=``. Keys are stripped of surrounding whitespace; values are
    converted to ``int`` or ``float`` when they parse as numbers and are otherwise kept as strings.
    When a key is repeated, the last value wins.

    :param pairs: Config pairs supplied via the command-line interface.
    :type pairs: list[str] or None
    :return: Parsed config mapping.
    :rtype: dict[str, object]
    :raises ValueError: If any entry is not key=value or has an empty key.
    """
    config: Dict[str, object] = {}
    for item in pairs or []:
        key, separator, raw = item.partition("=")
        if not separator:
            raise ValueError(f"Config values must be key=value (got {item!r})")
        key = key.strip()
        if not key:
            raise ValueError("Config keys must be non-empty")
        config[key] = _coerce_config_value(raw)
    return config
262
+
263
+
264
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
    """
    Split a pipeline step specification into its extractor identifier and config.

    The text before the first ``:`` is the extractor identifier. The remainder is a comma-separated list
    of ``key=value`` tokens; empty tokens are skipped, keys are stripped, and values are kept as the
    exact text following the first ``=``.

    :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
    :type raw_step: str
    :return: Tuple of extractor_id and config mapping.
    :rtype: tuple[str, dict[str, object]]
    :raises ValueError: If the step spec is invalid.
    """
    spec = raw_step.strip()
    if not spec:
        raise ValueError("Step spec must be non-empty")

    name_part, colon, pairs_part = spec.partition(":")
    if not colon:
        return spec, {}

    extractor_id = name_part.strip()
    if not extractor_id:
        raise ValueError("Step spec must start with an extractor identifier")

    config: Dict[str, object] = {}
    for piece in pairs_part.strip().split(","):
        piece = piece.strip()
        if not piece:
            continue
        key, equals, value = piece.partition("=")
        if not equals:
            raise ValueError(f"Step config values must be key=value (got {piece!r})")
        key = key.strip()
        if not key:
            raise ValueError("Step config keys must be non-empty")
        config[key] = value
    return extractor_id, config
299
+
300
+
301
def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
    """
    Translate the budget-related command-line options into a QueryBudget.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Query budget instance.
    :rtype: QueryBudget
    """
    budget_fields = {
        "max_total_items": arguments.max_total_items,
        "max_total_characters": arguments.max_total_characters,
        "max_items_per_source": arguments.max_items_per_source,
    }
    return QueryBudget(**budget_fields)
315
+
316
+
317
def cmd_build(arguments: argparse.Namespace) -> int:
    """
    Build a retrieval run with the requested backend and print it as JavaScript Object Notation.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    selected_backend = get_backend(arguments.backend)
    backend_config = _parse_config_pairs(arguments.config)
    built_run = selected_backend.build_run(
        corpus,
        recipe_name=arguments.recipe_name,
        config=backend_config,
    )
    print(built_run.model_dump_json(indent=2))
    return 0
336
+
337
+
338
def cmd_extract_build(arguments: argparse.Namespace) -> int:
    """
    Build a text extraction run for the corpus using a pipeline of extractors.

    Every ``--step`` value is parsed into an extractor identifier plus config, and the resulting list
    is passed as the ``steps`` config of the ``pipeline`` extractor.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no ``--step`` was supplied or a step spec is invalid.
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())

    step_specs = list(arguments.step or [])
    if not step_specs:
        raise ValueError("Pipeline extraction requires at least one --step")

    parsed_specs = [_parse_step_spec(step_spec) for step_spec in step_specs]
    steps: List[Dict[str, object]] = [
        {"extractor_id": step_extractor_id, "config": step_config}
        for step_extractor_id, step_config in parsed_specs
    ]
    manifest = build_extraction_run(
        corpus,
        extractor_id="pipeline",
        recipe_name=arguments.recipe_name,
        config={"steps": steps},
    )
    print(manifest.model_dump_json(indent=2))
    return 0
368
+
369
+
370
def cmd_extract_list(arguments: argparse.Namespace) -> int:
    """
    Print the extraction runs stored under the corpus as a JavaScript Object Notation array.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    listed_runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
    payload = [listed_run.model_dump() for listed_run in listed_runs]
    print(json.dumps(payload, indent=2))
    return 0
387
+
388
+
389
def cmd_extract_show(arguments: argparse.Namespace) -> int:
    """
    Print the manifest of the extraction run named by ``--run``.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    run_reference = parse_extraction_run_reference(arguments.run)
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=run_reference.extractor_id,
        run_id=run_reference.run_id,
    )
    print(manifest.model_dump_json(indent=2))
    return 0
409
+
410
+
411
def cmd_extract_delete(arguments: argparse.Namespace) -> int:
    """
    Delete an extraction run after checking the confirmation value.

    The deletion only proceeds when ``--confirm`` is exactly equal to ``--run``.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If ``--confirm`` does not match ``--run``.
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    run_text = arguments.run
    if arguments.confirm != run_text:
        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
    run_reference = parse_extraction_run_reference(run_text)
    corpus.delete_extraction_run(
        extractor_id=run_reference.extractor_id,
        run_id=run_reference.run_id,
    )
    report = {"deleted": True, "run": run_text}
    print(json.dumps(report, indent=2))
    return 0
431
+
432
+
433
def cmd_query(arguments: argparse.Namespace) -> int:
    """
    Run a retrieval query against a stored run and print the result.

    The run defaults to the corpus's latest run. The query text is read from standard input when
    ``--query`` is not given. An optional reranker and an optional minimum-score filter are applied,
    in that order, to the returned evidence before printing.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run can be determined or the requested backend does not match the run.
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
    run = corpus.load_run(run_id)

    requested_backend_id = arguments.backend
    if requested_backend_id and requested_backend_id != run.recipe.backend_id:
        raise ValueError(
            f"Backend mismatch: run uses {run.recipe.backend_id!r} but {requested_backend_id!r} was requested"
        )
    backend = get_backend(run.recipe.backend_id)

    if arguments.query is None:
        query_text = sys.stdin.read()
    else:
        query_text = arguments.query
    budget = _budget_from_args(arguments)
    result = backend.query(corpus, run=run, query_text=query_text, budget=budget)

    evidence = result.evidence
    reranker_id = getattr(arguments, "reranker_id", None)
    if reranker_id:
        evidence = apply_evidence_reranker(
            reranker_id=reranker_id,
            query_text=result.query_text,
            evidence=evidence,
        )
    minimum_score = getattr(arguments, "minimum_score", None)
    if minimum_score is not None:
        evidence = apply_evidence_filter(
            filter_id="filter-minimum-score",
            query_text=result.query_text,
            evidence=evidence,
            config={"minimum_score": float(minimum_score)},
        )

    # Only copy the result when post-processing actually produced a different evidence object.
    if evidence is not result.evidence:
        result = result.model_copy(update={"evidence": evidence})
    print(result.model_dump_json(indent=2))
    return 0
477
+
478
+
479
def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
    """
    Build a context pack from a retrieval result.

    The retrieval result is read from standard input as JavaScript Object Notation. Backslash escape
    sequences in ``--join-with`` (for example ``\\n``) are expanded before the separator is used. When
    ``--max-tokens`` is given, the pack is fitted to that token budget. The policy and the context pack
    are printed together as one JavaScript Object Notation object.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If standard input is empty or the separator contains an invalid escape sequence.
    """
    input_text = sys.stdin.read()
    if not input_text.strip():
        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
    retrieval_result = RetrievalResult.model_validate_json(input_text)
    # The unicode_escape codec reads its input bytes as Latin-1, so encoding as UTF-8 first would turn
    # any non-ASCII separator into mojibake. Encoding as Latin-1 with backslashreplace keeps characters
    # up to U+00FF as single bytes and rewrites everything else as \u/\U escapes that the decode step
    # restores, while still expanding user-typed escapes such as \n.
    join_with = arguments.join_with.encode("latin-1", "backslashreplace").decode("unicode_escape")
    policy = ContextPackPolicy(join_with=join_with)
    context_pack = build_context_pack(retrieval_result, policy=policy)
    if arguments.max_tokens is not None:
        context_pack = fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
        )
    print(
        json.dumps(
            {
                "policy": policy.model_dump(),
                "context_pack": context_pack.model_dump(),
            },
            indent=2,
        )
    )
    return 0
513
+
514
+
515
def cmd_eval(arguments: argparse.Namespace) -> int:
    """
    Evaluate a retrieval run against a dataset and print the evaluation result.

    The run defaults to the corpus's latest run when ``--run`` is not given.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run identifier is given and the corpus records no latest run.
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")

    run = corpus.load_run(run_id)
    dataset = load_dataset(Path(arguments.dataset))
    budget = _budget_from_args(arguments)
    evaluation = evaluate_run(corpus=corpus, run=run, dataset=dataset, budget=budget)
    print(evaluation.model_dump_json(indent=2))
    return 0
538
+
539
+
540
def cmd_crawl(arguments: argparse.Namespace) -> int:
    """
    Crawl a website prefix into a corpus and print the crawl result.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_ref = getattr(arguments, "corpus", None)
    corpus = Corpus.open(corpus_ref) if corpus_ref else Corpus.find(Path.cwd())
    crawl_tags = _parse_tags(arguments.tags, arguments.tag)
    crawl_request = CrawlRequest(
        root_url=arguments.root_url,
        allowed_prefix=arguments.allowed_prefix,
        max_items=arguments.max_items,
        tags=crawl_tags,
    )
    crawl_result = crawl_into_corpus(corpus=corpus, request=crawl_request)
    print(crawl_result.model_dump_json(indent=2))
    return 0
564
+
565
+
566
def build_parser() -> argparse.ArgumentParser:
    """
    Construct the argument parser for the whole command-line interface.

    The root parser accepts ``--corpus`` and requires a subcommand. Each subcommand parser binds its
    handler function as the ``func`` default so the caller can dispatch on the parsed namespace.

    :return: Argument parser instance.
    :rtype: argparse.ArgumentParser
    """

    def add_tag_options(target: argparse.ArgumentParser, tags_help: str, tag_help: str) -> None:
        # The comma-separated and repeatable tag options always appear together, in this order.
        target.add_argument("--tags", default=None, help=tags_help)
        target.add_argument("--tag", action="append", help=tag_help)

    def add_budget_options(target: argparse.ArgumentParser) -> None:
        # Query budget limits shared by the query and eval commands.
        target.add_argument("--max-total-items", type=int, default=5)
        target.add_argument("--max-total-characters", type=int, default=2000)
        target.add_argument("--max-items-per-source", type=int, default=5)

    root = argparse.ArgumentParser(
        prog="biblicus",
        description="Biblicus command-line interface (minimum viable product)",
    )
    root.add_argument(
        "--corpus",
        type=str,
        default=None,
        dest="corpus",
        help=(
            "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
            "upward). "
            "Can be provided before or after the subcommand."
        ),
    )
    commands = root.add_subparsers(dest="cmd", required=True)

    # init
    init_parser = commands.add_parser("init", help="Initialize a new corpus at PATH.")
    init_parser.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
    init_parser.add_argument(
        "--force", action="store_true", help="Overwrite existing config if present."
    )
    init_parser.set_defaults(func=cmd_init)

    # ingest
    ingest_parser = commands.add_parser(
        "ingest", help="Ingest file(s) and/or text into the corpus."
    )
    _add_common_corpus_arg(ingest_parser)
    ingest_parser.add_argument("files", nargs="*", help="File paths to ingest.")
    ingest_parser.add_argument(
        "--note", default=None, help="Ingest a literal note as Markdown text."
    )
    ingest_parser.add_argument(
        "--stdin", action="store_true", help="Read text to ingest from standard input."
    )
    ingest_parser.add_argument(
        "--title", default=None, help="Optional title (for --note/--stdin)."
    )
    add_tag_options(ingest_parser, "Comma-separated tags.", "Repeatable tag.")
    ingest_parser.set_defaults(func=cmd_ingest)

    # list
    list_parser = commands.add_parser("list", help="List recently ingested items.")
    _add_common_corpus_arg(list_parser)
    list_parser.add_argument("--limit", type=int, default=50)
    list_parser.set_defaults(func=cmd_list)

    # show
    show_parser = commands.add_parser("show", help="Show metadata for an item identifier.")
    _add_common_corpus_arg(show_parser)
    show_parser.add_argument("id", help="Item identifier (universally unique identifier).")
    show_parser.set_defaults(func=cmd_show)

    # reindex
    reindex_parser = commands.add_parser(
        "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
    )
    _add_common_corpus_arg(reindex_parser)
    reindex_parser.set_defaults(func=cmd_reindex)

    # import-tree
    import_tree_parser = commands.add_parser(
        "import-tree", help="Import a folder tree into the corpus."
    )
    _add_common_corpus_arg(import_tree_parser)
    import_tree_parser.add_argument("path", help="Folder tree root to import.")
    add_tag_options(
        import_tree_parser,
        "Comma-separated tags to apply to imported items.",
        "Repeatable tag to apply to imported items.",
    )
    import_tree_parser.set_defaults(func=cmd_import_tree)

    # purge
    purge_parser = commands.add_parser(
        "purge", help="Delete all items and derived files (requires confirmation)."
    )
    _add_common_corpus_arg(purge_parser)
    purge_parser.add_argument(
        "--confirm",
        default=None,
        help="Type the corpus name (directory basename) to confirm purging.",
    )
    purge_parser.set_defaults(func=cmd_purge)

    # build
    build_command_parser = commands.add_parser(
        "build", help="Build a retrieval backend run for the corpus."
    )
    _add_common_corpus_arg(build_command_parser)
    build_command_parser.add_argument(
        "--backend",
        required=True,
        help="Backend identifier (for example, scan, sqlite-full-text-search).",
    )
    build_command_parser.add_argument(
        "--recipe-name", default="default", help="Human-readable recipe name."
    )
    build_command_parser.add_argument(
        "--config",
        action="append",
        default=None,
        help="Backend config as key=value (repeatable).",
    )
    build_command_parser.set_defaults(func=cmd_build)

    # extract (build / list / show / delete)
    extract_parser = commands.add_parser(
        "extract", help="Work with text extraction runs for the corpus."
    )
    extract_commands = extract_parser.add_subparsers(dest="extract_command", required=True)

    extract_build_parser = extract_commands.add_parser(
        "build", help="Build a text extraction run."
    )
    _add_common_corpus_arg(extract_build_parser)
    extract_build_parser.add_argument(
        "--recipe-name", default="default", help="Human-readable recipe name."
    )
    extract_build_parser.add_argument(
        "--step",
        action="append",
        default=None,
        help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
    )
    extract_build_parser.set_defaults(func=cmd_extract_build)

    extract_list_parser = extract_commands.add_parser("list", help="List extraction runs.")
    _add_common_corpus_arg(extract_list_parser)
    extract_list_parser.add_argument(
        "--extractor-id",
        default=None,
        help="Optional extractor identifier filter (for example: pipeline).",
    )
    extract_list_parser.set_defaults(func=cmd_extract_list)

    extract_show_parser = extract_commands.add_parser(
        "show", help="Show an extraction run manifest."
    )
    _add_common_corpus_arg(extract_show_parser)
    extract_show_parser.add_argument(
        "--run",
        required=True,
        help="Extraction run reference in the form extractor_id:run_id.",
    )
    extract_show_parser.set_defaults(func=cmd_extract_show)

    extract_delete_parser = extract_commands.add_parser(
        "delete", help="Delete an extraction run directory."
    )
    _add_common_corpus_arg(extract_delete_parser)
    extract_delete_parser.add_argument(
        "--run",
        required=True,
        help="Extraction run reference in the form extractor_id:run_id.",
    )
    extract_delete_parser.add_argument(
        "--confirm",
        required=True,
        help="Type the exact extractor_id:run_id to confirm deletion.",
    )
    extract_delete_parser.set_defaults(func=cmd_extract_delete)

    # query
    query_parser = commands.add_parser("query", help="Run a retrieval query.")
    _add_common_corpus_arg(query_parser)
    query_parser.add_argument(
        "--run", default=None, help="Run identifier (defaults to latest run)."
    )
    query_parser.add_argument("--backend", default=None, help="Validate backend identifier.")
    query_parser.add_argument(
        "--query", default=None, help="Query text (defaults to standard input)."
    )
    add_budget_options(query_parser)
    query_parser.add_argument(
        "--reranker-id",
        default=None,
        help="Optional reranker identifier to apply after retrieval (for example: rerank-longest-text).",
    )
    query_parser.add_argument(
        "--minimum-score",
        type=float,
        default=None,
        help="Optional minimum score threshold to filter evidence after retrieval.",
    )
    query_parser.set_defaults(func=cmd_query)

    # context-pack (build)
    context_pack_parser = commands.add_parser(
        "context-pack", help="Build context pack text from evidence."
    )
    context_pack_commands = context_pack_parser.add_subparsers(
        dest="context_pack_command", required=True
    )

    context_pack_build_parser = context_pack_commands.add_parser(
        "build", help="Build a context pack from a retrieval result JavaScript Object Notation."
    )
    context_pack_build_parser.add_argument(
        "--join-with",
        default="\\n\\n",
        help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
    )
    context_pack_build_parser.add_argument(
        "--max-tokens",
        default=None,
        type=int,
        help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
    )
    context_pack_build_parser.set_defaults(func=cmd_context_pack_build)

    # eval
    eval_parser = commands.add_parser("eval", help="Evaluate a run against a dataset.")
    _add_common_corpus_arg(eval_parser)
    eval_parser.add_argument(
        "--run", default=None, help="Run identifier (defaults to latest run)."
    )
    eval_parser.add_argument(
        "--dataset",
        required=True,
        help="Path to dataset JavaScript Object Notation file.",
    )
    add_budget_options(eval_parser)
    eval_parser.set_defaults(func=cmd_eval)

    # crawl
    crawl_parser = commands.add_parser("crawl", help="Crawl a website prefix into the corpus.")
    _add_common_corpus_arg(crawl_parser)
    crawl_parser.add_argument(
        "--root-url", required=True, help="Root uniform resource locator to fetch."
    )
    crawl_parser.add_argument(
        "--allowed-prefix",
        required=True,
        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
    )
    crawl_parser.add_argument(
        "--max-items", type=int, default=50, help="Maximum number of items to store."
    )
    add_tag_options(
        crawl_parser,
        "Comma-separated tags to apply to stored items.",
        "Repeatable tag to apply to stored items.",
    )
    crawl_parser.set_defaults(func=cmd_crawl)

    return root
778
+
779
+
780
def main(argument_list: Optional[List[str]] = None) -> int:
    """
    Entry point for the Biblicus command-line interface.

    Parses the arguments, dispatches to the handler bound to the selected subcommand, and converts the
    expected error types into a message on standard error plus exit code 2.

    :param argument_list: Optional command-line interface arguments.
    :type argument_list: list[str] or None
    :return: Exit code.
    :rtype: int
    """
    parser = build_parser()
    arguments = parser.parse_args(argument_list)
    try:
        return int(arguments.func(arguments))
    except (
        FileNotFoundError,
        FileExistsError,
        KeyError,
        ValueError,
        ExtractionRunFatalError,
        NotImplementedError,
        ValidationError,
    ) as exception:
        if isinstance(exception, OSError) and exception.errno is not None:
            # Errors raised by the operating system carry (errno, strerror, ...) in args, so args[0]
            # would print only the bare error number. The string form includes the description and path.
            message = str(exception)
        elif exception.args:
            # Use the raw first argument so that KeyError messages are not wrapped in repr() quotes.
            message = exception.args[0]
        else:
            message = str(exception)
        print(str(message), file=sys.stderr)
        return 2