biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +177 -53
  11. biblicus/corpus.py +209 -59
  12. biblicus/crawl.py +186 -0
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +280 -79
  16. biblicus/extractors/__init__.py +14 -3
  17. biblicus/extractors/base.py +12 -5
  18. biblicus/extractors/metadata_text.py +13 -5
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +16 -6
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +0 -5
  29. biblicus/hook_manager.py +3 -5
  30. biblicus/hooks.py +3 -7
  31. biblicus/ignore.py +0 -3
  32. biblicus/models.py +118 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +44 -9
  35. biblicus/time.py +1 -2
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
  39. biblicus-0.4.0.dist-info/RECORD +45 -0
  40. biblicus/extractors/cascade.py +0 -101
  41. biblicus-0.2.0.dist-info/RECORD +0 -32
  42. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/backends/sqlite_full_text_search.py CHANGED
@@ -12,9 +12,15 @@ from pydantic import BaseModel, ConfigDict, Field
 
 from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
 from ..corpus import Corpus
-from ..extraction import ExtractionRunReference, parse_extraction_run_reference
 from ..frontmatter import parse_front_matter
-from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
+from ..models import (
+    Evidence,
+    ExtractionRunReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalRun,
+    parse_extraction_run_reference,
+)
 from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
 from ..time import utc_now_iso
 
@@ -51,7 +57,9 @@ class SqliteFullTextSearchBackend:
 
     backend_id = "sqlite-full-text-search"
 
-    def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
         """
         Build a full-text search version five index for the corpus.
 
@@ -64,7 +72,6 @@ class SqliteFullTextSearchBackend:
         :return: Run manifest describing the build.
         :rtype: RetrievalRun
         """
-
         recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
         catalog = corpus.load_catalog()
         recipe = create_recipe_manifest(
@@ -110,7 +117,6 @@ class SqliteFullTextSearchBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
-
         recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
         db_path = _resolve_run_db_path(corpus, run)
         candidates = _query_full_text_search_index(
@@ -156,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
     :return: Candidate limit for backend search.
     :rtype: int
     """
-
     return max_total_items * 5
 
 
@@ -172,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
     :rtype: Path
     :raises FileNotFoundError: If the run does not have artifact paths.
     """
-
     if not run.artifact_paths:
         raise FileNotFoundError("Run has no artifact paths to query")
     return corpus.root / run.artifact_paths[0]
@@ -188,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
     :rtype: None
     :raises RuntimeError: If full-text search version five support is unavailable.
     """
-
     try:
         cursor = conn.execute(
             "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
@@ -210,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
     :return: None.
     :rtype: None
     """
-
     conn.execute(
         """
         CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
@@ -249,7 +251,6 @@ def _build_full_text_search_index(
     :return: Index statistics.
     :rtype: dict[str, int]
     """
-
     if db_path.exists():
         db_path.unlink()
     connection = sqlite3.connect(str(db_path))
@@ -339,7 +340,6 @@ def _load_text_from_item(
     :return: Text payload or None if not text.
     :rtype: str or None
     """
-
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
@@ -375,7 +375,6 @@ def _resolve_extraction_reference(
     :rtype: ExtractionRunReference or None
     :raises FileNotFoundError: If an extraction run is referenced but not present.
     """
-
     if not recipe_config.extraction_run:
         return None
     extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
@@ -388,7 +387,9 @@ def _resolve_extraction_reference(
     return extraction_reference
 
 
-def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
+def _iter_chunks(
+    text: str, *, chunk_size: int, chunk_overlap: int
+) -> Iterable[Tuple[int, int, str]]:
     """
     Yield overlapping chunks of text for indexing.
 
@@ -402,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
     :rtype: Iterable[tuple[int, int, str]]
     :raises ValueError: If the overlap is greater than or equal to the chunk size.
     """
-
     if chunk_overlap >= chunk_size:
         raise ValueError("chunk_overlap must be smaller than chunk_size")
     start_offset = 0
@@ -435,7 +435,6 @@ def _query_full_text_search_index(
     :return: Evidence candidates.
     :rtype: list[Evidence]
     """
-
     connection = sqlite3.connect(str(db_path))
     try:
         rows = connection.execute(
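The _iter_chunks change above only rewraps the signature; the chunking behaviour itself is described by the docstring (overlapping windows, with a ValueError when the overlap is not smaller than the chunk size). A minimal sketch of that behaviour, assuming the window simply advances by chunk_size minus chunk_overlap (the shipped implementation may differ in boundary handling):

from typing import Iterable, Tuple

def iter_overlapping_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
    # Guard mirrored from the diff: the overlap must leave room for forward progress.
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    start_offset = 0
    step = chunk_size - chunk_overlap
    while start_offset < len(text):
        end_offset = min(start_offset + chunk_size, len(text))
        yield start_offset, end_offset, text[start_offset:end_offset]
        start_offset += step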
biblicus/cli.py CHANGED
@@ -14,9 +14,11 @@ from pydantic import ValidationError
 
 from .backends import get_backend
 from .corpus import Corpus
-from .extraction import build_extraction_run
+from .crawl import CrawlRequest, crawl_into_corpus
+from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
-from .models import QueryBudget
+from .extraction import build_extraction_run
+from .models import QueryBudget, parse_extraction_run_reference
 from .uris import corpus_ref_to_path
 
 
@@ -29,7 +31,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
     :return: None.
     :rtype: None
     """
-
     parser.add_argument(
         "--corpus",
         type=str,
@@ -51,7 +52,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus_path = corpus_ref_to_path(arguments.path)
     corpus = Corpus.init(corpus_path, force=arguments.force)
     print(f"Initialized corpus at {corpus.root}")
@@ -69,7 +69,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
     :return: Deduplicated tag list.
     :rtype: list[str]
     """
-
     parsed_tags: List[str] = []
     if raw:
         parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
@@ -94,7 +93,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -135,7 +133,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -157,7 +154,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -177,7 +173,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -197,7 +192,6 @@ def cmd_import_tree(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -218,7 +212,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -241,7 +234,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
     :rtype: dict[str, object]
     :raises ValueError: If any entry is not key=value.
     """
-
     config: Dict[str, object] = {}
     for item in pairs or []:
         if "=" not in item:
@@ -264,7 +256,7 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
 
 def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
     """
-    Parse a cascade step specification.
+    Parse a pipeline step specification.
 
     :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
     :type raw_step: str
@@ -272,7 +264,6 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
     :rtype: tuple[str, dict[str, object]]
     :raises ValueError: If the step spec is invalid.
     """
-
     raw_step = raw_step.strip()
     if not raw_step:
         raise ValueError("Step spec must be non-empty")
@@ -309,7 +300,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     :return: Query budget instance.
     :rtype: QueryBudget
     """
-
     return QueryBudget(
         max_total_items=arguments.max_total_items,
         max_total_characters=arguments.max_total_characters,
@@ -326,7 +316,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -339,33 +328,31 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
 
 
-def cmd_extract(arguments: argparse.Namespace) -> int:
+def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
-    Build a text extraction run for the corpus.
+    Build a text extraction run for the corpus using a pipeline of extractors.
 
     :param arguments: Parsed command-line interface arguments.
     :type arguments: argparse.Namespace
     :return: Exit code.
     :rtype: int
     """
-
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    config = _parse_config_pairs(arguments.config)
-    if getattr(arguments, "step", None):
-        if arguments.extractor != "cascade":
-            raise ValueError("--step is only supported for the cascade extractor")
-        steps: List[Dict[str, object]] = []
-        for raw_step in arguments.step:
-            extractor_id, step_config = _parse_step_spec(raw_step)
-            steps.append({"extractor_id": extractor_id, "config": step_config})
-        config = {"steps": steps}
+    raw_steps = list(arguments.step or [])
+    if not raw_steps:
+        raise ValueError("Pipeline extraction requires at least one --step")
+    steps: List[Dict[str, object]] = []
+    for raw_step in raw_steps:
+        extractor_id, step_config = _parse_step_spec(raw_step)
+        steps.append({"extractor_id": extractor_id, "config": step_config})
+    config = {"steps": steps}
     manifest = build_extraction_run(
         corpus,
-        extractor_id=arguments.extractor,
+        extractor_id="pipeline",
         recipe_name=arguments.recipe_name,
         config=config,
     )
@@ -373,6 +360,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_extract_list(arguments: argparse.Namespace) -> int:
+    """
+    List extraction runs stored under the corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    return 0
+
+
+def cmd_extract_show(arguments: argparse.Namespace) -> int:
+    """
+    Show an extraction run manifest.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    reference = parse_extraction_run_reference(arguments.run)
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=reference.extractor_id, run_id=reference.run_id
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
+
+
+def cmd_extract_delete(arguments: argparse.Namespace) -> int:
+    """
+    Delete an extraction run directory and its derived artifacts.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.confirm != arguments.run:
+        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
+    reference = parse_extraction_run_reference(arguments.run)
+    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
+    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
382
432
  :return: Exit code.
383
433
  :rtype: int
384
434
  """
385
-
386
435
  corpus = (
387
436
  Corpus.open(arguments.corpus)
388
437
  if getattr(arguments, "corpus", None)
@@ -413,7 +462,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
413
462
  :return: Exit code.
414
463
  :rtype: int
415
464
  """
416
-
417
465
  corpus = (
418
466
  Corpus.open(arguments.corpus)
419
467
  if getattr(arguments, "corpus", None)
@@ -430,6 +478,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_crawl(arguments: argparse.Namespace) -> int:
+    """
+    Crawl a website prefix into a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    request = CrawlRequest(
+        root_url=arguments.root_url,
+        allowed_prefix=arguments.allowed_prefix,
+        max_items=arguments.max_items,
+        tags=tags,
+    )
+    result = crawl_into_corpus(corpus=corpus, request=request)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
437
511
  :return: Argument parser instance.
438
512
  :rtype: argparse.ArgumentParser
439
513
  """
440
-
441
514
  parser = argparse.ArgumentParser(
442
515
  prog="biblicus",
443
516
  description="Biblicus command-line interface (minimum viable product)",
@@ -457,14 +530,18 @@ def build_parser() -> argparse.ArgumentParser:
457
530
 
458
531
  p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
459
532
  p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
460
- p_init.add_argument("--force", action="store_true", help="Overwrite existing config if present.")
533
+ p_init.add_argument(
534
+ "--force", action="store_true", help="Overwrite existing config if present."
535
+ )
461
536
  p_init.set_defaults(func=cmd_init)
462
537
 
463
538
  p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
464
539
  _add_common_corpus_arg(p_ingest)
465
540
  p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
466
541
  p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
467
- p_ingest.add_argument("--stdin", action="store_true", help="Read text to ingest from standard input.")
542
+ p_ingest.add_argument(
543
+ "--stdin", action="store_true", help="Read text to ingest from standard input."
544
+ )
468
545
  p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
469
546
  p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
470
547
  p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
@@ -480,18 +557,26 @@ def build_parser() -> argparse.ArgumentParser:
480
557
  p_show.add_argument("id", help="Item identifier (universally unique identifier).")
481
558
  p_show.set_defaults(func=cmd_show)
482
559
 
483
- p_reindex = sub.add_parser("reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus.")
560
+ p_reindex = sub.add_parser(
561
+ "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
562
+ )
484
563
  _add_common_corpus_arg(p_reindex)
485
564
  p_reindex.set_defaults(func=cmd_reindex)
486
565
 
487
566
  p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
488
567
  _add_common_corpus_arg(p_import_tree)
489
568
  p_import_tree.add_argument("path", help="Folder tree root to import.")
490
- p_import_tree.add_argument("--tags", default=None, help="Comma-separated tags to apply to imported items.")
491
- p_import_tree.add_argument("--tag", action="append", help="Repeatable tag to apply to imported items.")
569
+ p_import_tree.add_argument(
570
+ "--tags", default=None, help="Comma-separated tags to apply to imported items."
571
+ )
572
+ p_import_tree.add_argument(
573
+ "--tag", action="append", help="Repeatable tag to apply to imported items."
574
+ )
492
575
  p_import_tree.set_defaults(func=cmd_import_tree)
493
576
 
494
- p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
577
+ p_purge = sub.add_parser(
578
+ "purge", help="Delete all items and derived files (requires confirmation)."
579
+ )
495
580
  _add_common_corpus_arg(p_purge)
496
581
  p_purge.add_argument(
497
582
  "--confirm",
@@ -516,27 +601,53 @@ def build_parser() -> argparse.ArgumentParser:
516
601
  )
517
602
  p_build.set_defaults(func=cmd_build)
518
603
 
519
- p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
520
- _add_common_corpus_arg(p_extract)
521
- p_extract.add_argument(
522
- "--extractor",
523
- required=True,
524
- help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
604
+ p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
605
+ extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
606
+
607
+ p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
608
+ _add_common_corpus_arg(p_extract_build)
609
+ p_extract_build.add_argument(
610
+ "--recipe-name", default="default", help="Human-readable recipe name."
525
611
  )
526
- p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
527
- p_extract.add_argument(
612
+ p_extract_build.add_argument(
528
613
  "--step",
529
614
  action="append",
530
615
  default=None,
531
- help="Cascade step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
616
+ help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
532
617
  )
533
- p_extract.add_argument(
534
- "--config",
535
- action="append",
618
+ p_extract_build.set_defaults(func=cmd_extract_build)
619
+
620
+ p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
621
+ _add_common_corpus_arg(p_extract_list)
622
+ p_extract_list.add_argument(
623
+ "--extractor-id",
536
624
  default=None,
537
- help="Extractor config as key=value (repeatable).",
625
+ help="Optional extractor identifier filter (for example: pipeline).",
538
626
  )
539
- p_extract.set_defaults(func=cmd_extract)
627
+ p_extract_list.set_defaults(func=cmd_extract_list)
628
+
629
+ p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
630
+ _add_common_corpus_arg(p_extract_show)
631
+ p_extract_show.add_argument(
632
+ "--run",
633
+ required=True,
634
+ help="Extraction run reference in the form extractor_id:run_id.",
635
+ )
636
+ p_extract_show.set_defaults(func=cmd_extract_show)
637
+
638
+ p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
639
+ _add_common_corpus_arg(p_extract_delete)
640
+ p_extract_delete.add_argument(
641
+ "--run",
642
+ required=True,
643
+ help="Extraction run reference in the form extractor_id:run_id.",
644
+ )
645
+ p_extract_delete.add_argument(
646
+ "--confirm",
647
+ required=True,
648
+ help="Type the exact extractor_id:run_id to confirm deletion.",
649
+ )
650
+ p_extract_delete.set_defaults(func=cmd_extract_delete)
540
651
 
541
652
  p_query = sub.add_parser("query", help="Run a retrieval query.")
542
653
  _add_common_corpus_arg(p_query)
@@ -561,6 +672,19 @@ def build_parser() -> argparse.ArgumentParser:
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)
 
+    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
+    _add_common_corpus_arg(p_crawl)
+    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--allowed-prefix",
+        required=True,
+        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
+    )
+    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
+    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
+    p_crawl.set_defaults(func=cmd_crawl)
+
     return parser
 
 
@@ -573,7 +697,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
     :return: Exit code.
     :rtype: int
     """
-
     parser = build_parser()
     arguments = parser.parse_args(argument_list)
     try:
@@ -583,6 +706,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
         FileExistsError,
         KeyError,
         ValueError,
+        ExtractionRunFatalError,
         NotImplementedError,
         ValidationError,
     ) as exception:
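Taken together, the parser changes above replace the single extract command with extract build/list/show/delete subcommands and add a crawl command. A minimal usage sketch driving the new subcommands through main(), which accepts an explicit argument list; the corpus path and the step identifiers below are placeholders (the step ids are borrowed from the examples in the removed --extractor help text and may differ per installation):

from biblicus.cli import main

# Build a pipeline extraction run from two steps, then list runs for the
# "pipeline" extractor identifier that cmd_extract_build hard-codes.
exit_code = main(
    [
        "extract", "build",
        "--corpus", "./my-corpus",
        "--recipe-name", "default",
        "--step", "metadata-text",
        "--step", "pass-through-text",
    ]
)
if exit_code == 0:
    main(["extract", "list", "--corpus", "./my-corpus", "--extractor-id", "pipeline"])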