biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +96 -13
  9. biblicus/backends/sqlite_full_text_search.py +74 -14
  10. biblicus/cli.py +126 -19
  11. biblicus/constants.py +2 -0
  12. biblicus/corpus.py +455 -45
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +529 -0
  16. biblicus/extractors/__init__.py +44 -0
  17. biblicus/extractors/base.py +68 -0
  18. biblicus/extractors/metadata_text.py +106 -0
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +84 -0
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +180 -0
  29. biblicus/hook_manager.py +203 -0
  30. biblicus/hooks.py +261 -0
  31. biblicus/ignore.py +64 -0
  32. biblicus/models.py +107 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +85 -5
  35. biblicus/time.py +0 -1
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. biblicus-0.3.0.dist-info/METADATA +336 -0
  39. biblicus-0.3.0.dist-info/RECORD +44 -0
  40. biblicus-0.1.1.dist-info/METADATA +0 -174
  41. biblicus-0.1.1.dist-info/RECORD +0 -22
  42. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/backends/sqlite_full_text_search.py CHANGED
@@ -13,7 +13,14 @@ from pydantic import BaseModel, ConfigDict, Field
  from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
  from ..corpus import Corpus
  from ..frontmatter import parse_front_matter
- from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
+ from ..models import (
+ Evidence,
+ ExtractionRunReference,
+ QueryBudget,
+ RetrievalResult,
+ RetrievalRun,
+ parse_extraction_run_reference,
+ )
  from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
  from ..time import utc_now_iso

@@ -28,6 +35,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
  :vartype chunk_overlap: int
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
  :vartype snippet_characters: int
+ :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+ :vartype extraction_run: str or None
  """

  model_config = ConfigDict(extra="forbid")
@@ -35,6 +44,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
  chunk_size: int = Field(default=800, ge=1)
  chunk_overlap: int = Field(default=200, ge=0)
  snippet_characters: int = Field(default=400, ge=1)
+ extraction_run: Optional[str] = None


  class SqliteFullTextSearchBackend:
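
The new extraction_run field lets a recipe point the index build at text produced by a prior extraction run instead of the raw item bytes. A minimal sketch of validating such a config against the model above; the extractor_id:run_id value is hypothetical, only the field names and defaults come from this diff:

    from biblicus.backends.sqlite_full_text_search import SqliteFullTextSearchRecipeConfig

    # Hypothetical recipe config; "pipeline:2024-01-01-abc123" is a made-up
    # extractor_id:run_id reference, the other fields mirror the model defaults.
    recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(
        {
            "chunk_size": 800,
            "chunk_overlap": 200,
            "snippet_characters": 400,
            "extraction_run": "pipeline:2024-01-01-abc123",
        }
    )
    assert recipe_config.extraction_run == "pipeline:2024-01-01-abc123"
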
@@ -47,7 +57,9 @@ class SqliteFullTextSearchBackend:

  backend_id = "sqlite-full-text-search"

- def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
+ def build_run(
+ self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+ ) -> RetrievalRun:
  """
  Build a full-text search version five index for the corpus.

@@ -60,7 +72,6 @@ class SqliteFullTextSearchBackend:
  :return: Run manifest describing the build.
  :rtype: RetrievalRun
  """
-
  recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
  catalog = corpus.load_catalog()
  recipe = create_recipe_manifest(
@@ -72,11 +83,13 @@ class SqliteFullTextSearchBackend:
  db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
  db_path = corpus.root / db_relpath
  corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+ extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
  stats = _build_full_text_search_index(
  db_path=db_path,
  corpus=corpus,
  items=catalog.items.values(),
  recipe_config=recipe_config,
+ extraction_reference=extraction_reference,
  )
  run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
  corpus.write_run(run)
@@ -104,7 +117,6 @@ class SqliteFullTextSearchBackend:
  :return: Retrieval results containing evidence.
  :rtype: RetrievalResult
  """
-
  recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
  db_path = _resolve_run_db_path(corpus, run)
  candidates = _query_full_text_search_index(
@@ -150,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
  :return: Candidate limit for backend search.
  :rtype: int
  """
-
  return max_total_items * 5


@@ -166,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
  :rtype: Path
  :raises FileNotFoundError: If the run does not have artifact paths.
  """
-
  if not run.artifact_paths:
  raise FileNotFoundError("Run has no artifact paths to query")
  return corpus.root / run.artifact_paths[0]
@@ -182,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
  :rtype: None
  :raises RuntimeError: If full-text search version five support is unavailable.
  """
-
  try:
  cursor = conn.execute(
  "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
@@ -204,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
  :return: None.
  :rtype: None
  """
-
  conn.execute(
  """
  CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
@@ -227,6 +235,7 @@ def _build_full_text_search_index(
  corpus: Corpus,
  items: Iterable[object],
  recipe_config: SqliteFullTextSearchRecipeConfig,
+ extraction_reference: Optional[ExtractionRunReference],
  ) -> Dict[str, int]:
  """
  Build a full-text search index from corpus items.
@@ -242,7 +251,6 @@ def _build_full_text_search_index(
  :return: Index statistics.
  :rtype: dict[str, int]
  """
-
  if db_path.exists():
  db_path.unlink()
  connection = sqlite3.connect(str(db_path))
@@ -256,7 +264,13 @@ def _build_full_text_search_index(
  item_count += 1
  media_type = getattr(catalog_item, "media_type", "")
  relpath = getattr(catalog_item, "relpath", "")
- item_text = _load_text_from_item(corpus, relpath, media_type)
+ item_text = _load_text_from_item(
+ corpus,
+ item_id=str(getattr(catalog_item, "id", "")),
+ relpath=str(relpath),
+ media_type=str(media_type),
+ extraction_reference=extraction_reference,
+ )
  if item_text is None:
  continue
  text_item_count += 1
@@ -302,19 +316,38 @@ def _build_full_text_search_index(
  connection.close()


- def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
+ def _load_text_from_item(
+ corpus: Corpus,
+ *,
+ item_id: str,
+ relpath: str,
+ media_type: str,
+ extraction_reference: Optional[ExtractionRunReference],
+ ) -> Optional[str]:
  """
  Load text content from a catalog item.

  :param corpus: Corpus containing the content.
  :type corpus: Corpus
+ :param item_id: Item identifier.
+ :type item_id: str
  :param relpath: Relative path to the content.
  :type relpath: str
  :param media_type: Media type for the content.
  :type media_type: str
+ :param extraction_reference: Optional extraction run reference.
+ :type extraction_reference: ExtractionRunReference or None
  :return: Text payload or None if not text.
  :rtype: str or None
  """
+ if extraction_reference:
+ extracted_text = corpus.read_extracted_text(
+ extractor_id=extraction_reference.extractor_id,
+ run_id=extraction_reference.run_id,
+ item_id=item_id,
+ )
+ if isinstance(extracted_text, str) and extracted_text.strip():
+ return extracted_text

  content_path = corpus.root / relpath
  raw_bytes = content_path.read_bytes()
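
With an extraction reference in play, _load_text_from_item prefers non-empty extracted text for the item and only then falls back to reading raw bytes. The reference string itself is parsed by parse_extraction_run_reference from biblicus.models (resolved by _resolve_extraction_reference below). A small illustration with a made-up run identifier; the attribute and method names come from this diff:

    from biblicus.models import parse_extraction_run_reference

    reference = parse_extraction_run_reference("pipeline:2024-01-01-abc123")
    # Expected to expose the two halves of the extractor_id:run_id form:
    #   reference.extractor_id -> "pipeline"
    #   reference.run_id       -> "2024-01-01-abc123"
    #   reference.as_string()  -> "pipeline:2024-01-01-abc123"
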
@@ -327,7 +360,36 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
  return None


- def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
+ def _resolve_extraction_reference(
+ corpus: Corpus,
+ recipe_config: SqliteFullTextSearchRecipeConfig,
+ ) -> Optional[ExtractionRunReference]:
+ """
+ Resolve an extraction run reference from a recipe config.
+
+ :param corpus: Corpus associated with the recipe.
+ :type corpus: Corpus
+ :param recipe_config: Parsed backend recipe configuration.
+ :type recipe_config: SqliteFullTextSearchRecipeConfig
+ :return: Parsed extraction reference or None.
+ :rtype: ExtractionRunReference or None
+ :raises FileNotFoundError: If an extraction run is referenced but not present.
+ """
+ if not recipe_config.extraction_run:
+ return None
+ extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+ run_dir = corpus.extraction_run_dir(
+ extractor_id=extraction_reference.extractor_id,
+ run_id=extraction_reference.run_id,
+ )
+ if not run_dir.is_dir():
+ raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+ return extraction_reference
+
+
+ def _iter_chunks(
+ text: str, *, chunk_size: int, chunk_overlap: int
+ ) -> Iterable[Tuple[int, int, str]]:
  """
  Yield overlapping chunks of text for indexing.

@@ -341,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
  :rtype: Iterable[tuple[int, int, str]]
  :raises ValueError: If the overlap is greater than or equal to the chunk size.
  """
-
  if chunk_overlap >= chunk_size:
  raise ValueError("chunk_overlap must be smaller than chunk_size")
  start_offset = 0
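
The chunker yields (start, end, text) tuples over a window of chunk_size characters that advances by chunk_size minus chunk_overlap. Only the guard and the first statement appear in this hunk, so the following is an illustrative sketch of that kind of iterator, not the package's exact implementation:

    from typing import Iterable, Tuple

    def iter_overlapping_chunks(
        text: str, *, chunk_size: int, chunk_overlap: int
    ) -> Iterable[Tuple[int, int, str]]:
        # Same guard as the diff: the window must advance on every step.
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        step = chunk_size - chunk_overlap
        start_offset = 0
        while start_offset < len(text):
            end_offset = min(start_offset + chunk_size, len(text))
            yield start_offset, end_offset, text[start_offset:end_offset]
            start_offset += step
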
@@ -374,7 +435,6 @@ def _query_full_text_search_index(
  :return: Evidence candidates.
  :rtype: list[Evidence]
  """
-
  connection = sqlite3.connect(str(db_path))
  try:
  rows = connection.execute(
biblicus/cli.py CHANGED
@@ -14,7 +14,9 @@ from pydantic import ValidationError

  from .backends import get_backend
  from .corpus import Corpus
+ from .errors import ExtractionRunFatalError
  from .evaluation import evaluate_run, load_dataset
+ from .extraction import build_extraction_run
  from .models import QueryBudget
  from .uris import corpus_ref_to_path

@@ -28,7 +30,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
  :return: None.
  :rtype: None
  """
-
  parser.add_argument(
  "--corpus",
  type=str,
@@ -50,7 +51,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus_path = corpus_ref_to_path(arguments.path)
  corpus = Corpus.init(corpus_path, force=arguments.force)
  print(f"Initialized corpus at {corpus.root}")
@@ -68,7 +68,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
  :return: Deduplicated tag list.
  :rtype: list[str]
  """
-
  parsed_tags: List[str] = []
  if raw:
  parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
@@ -93,7 +92,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -134,7 +132,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -156,7 +153,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -176,7 +172,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -187,6 +182,26 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
  return 0


+ def cmd_import_tree(arguments: argparse.Namespace) -> int:
+ """
+ Import a folder tree into a corpus.
+
+ :param arguments: Parsed command-line interface arguments.
+ :type arguments: argparse.Namespace
+ :return: Exit code.
+ :rtype: int
+ """
+ corpus = (
+ Corpus.open(arguments.corpus)
+ if getattr(arguments, "corpus", None)
+ else Corpus.find(Path.cwd())
+ )
+ tags = _parse_tags(arguments.tags, arguments.tag)
+ stats = corpus.import_tree(Path(arguments.path), tags=tags)
+ print(json.dumps(stats, indent=2, sort_keys=False))
+ return 0
+
+
  def cmd_purge(arguments: argparse.Namespace) -> int:
  """
  Purge all items and derived artifacts from a corpus.
@@ -196,7 +211,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -219,7 +233,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
  :rtype: dict[str, object]
  :raises ValueError: If any entry is not key=value.
  """
-
  config: Dict[str, object] = {}
  for item in pairs or []:
  if "=" not in item:
@@ -240,6 +253,43 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
  return config


+ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
+ """
+ Parse a pipeline step specification.
+
+ :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
+ :type raw_step: str
+ :return: Tuple of extractor_id and config mapping.
+ :rtype: tuple[str, dict[str, object]]
+ :raises ValueError: If the step spec is invalid.
+ """
+ raw_step = raw_step.strip()
+ if not raw_step:
+ raise ValueError("Step spec must be non-empty")
+ if ":" not in raw_step:
+ return raw_step, {}
+ extractor_id, raw_pairs = raw_step.split(":", 1)
+ extractor_id = extractor_id.strip()
+ if not extractor_id:
+ raise ValueError("Step spec must start with an extractor identifier")
+ config: Dict[str, object] = {}
+ raw_pairs = raw_pairs.strip()
+ if not raw_pairs:
+ return extractor_id, {}
+ for token in raw_pairs.split(","):
+ token = token.strip()
+ if not token:
+ continue
+ if "=" not in token:
+ raise ValueError(f"Step config values must be key=value (got {token!r})")
+ key, value = token.split("=", 1)
+ key = key.strip()
+ if not key:
+ raise ValueError("Step config keys must be non-empty")
+ config[key] = value
+ return extractor_id, config
+
+
  def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
  """
  Build a QueryBudget from command-line interface arguments.
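
Given these rules, a bare identifier yields an empty config, and key=value pairs after the first colon become string-valued entries (values are not coerced). The extractor identifiers below are hypothetical:

    from biblicus.cli import _parse_step_spec

    assert _parse_step_spec("pdf-text") == ("pdf-text", {})
    assert _parse_step_spec("rapidocr-text:languages=en,min_size=3") == (
        "rapidocr-text",
        {"languages": "en", "min_size": "3"},
    )
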
@@ -249,7 +299,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
  :return: Query budget instance.
  :rtype: QueryBudget
  """
-
  return QueryBudget(
  max_total_items=arguments.max_total_items,
  max_total_characters=arguments.max_total_characters,
@@ -266,7 +315,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -279,6 +327,38 @@ def cmd_build(arguments: argparse.Namespace) -> int:
  return 0


+ def cmd_extract(arguments: argparse.Namespace) -> int:
+ """
+ Build a text extraction run for the corpus using a pipeline of extractors.
+
+ :param arguments: Parsed command-line interface arguments.
+ :type arguments: argparse.Namespace
+ :return: Exit code.
+ :rtype: int
+ """
+ corpus = (
+ Corpus.open(arguments.corpus)
+ if getattr(arguments, "corpus", None)
+ else Corpus.find(Path.cwd())
+ )
+ raw_steps = list(arguments.step or [])
+ if not raw_steps:
+ raise ValueError("Pipeline extraction requires at least one --step")
+ steps: List[Dict[str, object]] = []
+ for raw_step in raw_steps:
+ extractor_id, step_config = _parse_step_spec(raw_step)
+ steps.append({"extractor_id": extractor_id, "config": step_config})
+ config = {"steps": steps}
+ manifest = build_extraction_run(
+ corpus,
+ extractor_id="pipeline",
+ recipe_name=arguments.recipe_name,
+ config=config,
+ )
+ print(manifest.model_dump_json(indent=2))
+ return 0
+
+
  def cmd_query(arguments: argparse.Namespace) -> int:
  """
  Execute a retrieval query.
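
cmd_extract always targets the pipeline extractor: each --step becomes one entry in a steps list, and that mapping is handed to build_extraction_run as the run config. For two hypothetical steps the assembled config would look like this:

    # Config cmd_extract would build from
    #   --step pdf-text --step rapidocr-text:languages=en
    # before calling build_extraction_run(corpus, extractor_id="pipeline", ...).
    config = {
        "steps": [
            {"extractor_id": "pdf-text", "config": {}},
            {"extractor_id": "rapidocr-text", "config": {"languages": "en"}},
        ]
    }
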
@@ -288,7 +368,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -319,7 +398,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  corpus = (
  Corpus.open(arguments.corpus)
  if getattr(arguments, "corpus", None)
@@ -343,7 +421,6 @@ def build_parser() -> argparse.ArgumentParser:
  :return: Argument parser instance.
  :rtype: argparse.ArgumentParser
  """
-
  parser = argparse.ArgumentParser(
  prog="biblicus",
  description="Biblicus command-line interface (minimum viable product)",
@@ -363,14 +440,18 @@ def build_parser() -> argparse.ArgumentParser:

  p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
  p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
- p_init.add_argument("--force", action="store_true", help="Overwrite existing config if present.")
+ p_init.add_argument(
+ "--force", action="store_true", help="Overwrite existing config if present."
+ )
  p_init.set_defaults(func=cmd_init)

  p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
  _add_common_corpus_arg(p_ingest)
  p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
  p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
- p_ingest.add_argument("--stdin", action="store_true", help="Read text to ingest from standard input.")
+ p_ingest.add_argument(
+ "--stdin", action="store_true", help="Read text to ingest from standard input."
+ )
  p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
  p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
  p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
@@ -386,11 +467,26 @@ def build_parser() -> argparse.ArgumentParser:
  p_show.add_argument("id", help="Item identifier (universally unique identifier).")
  p_show.set_defaults(func=cmd_show)

- p_reindex = sub.add_parser("reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus.")
+ p_reindex = sub.add_parser(
+ "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
+ )
  _add_common_corpus_arg(p_reindex)
  p_reindex.set_defaults(func=cmd_reindex)

- p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
+ p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
+ _add_common_corpus_arg(p_import_tree)
+ p_import_tree.add_argument("path", help="Folder tree root to import.")
+ p_import_tree.add_argument(
+ "--tags", default=None, help="Comma-separated tags to apply to imported items."
+ )
+ p_import_tree.add_argument(
+ "--tag", action="append", help="Repeatable tag to apply to imported items."
+ )
+ p_import_tree.set_defaults(func=cmd_import_tree)
+
+ p_purge = sub.add_parser(
+ "purge", help="Delete all items and derived files (requires confirmation)."
+ )
  _add_common_corpus_arg(p_purge)
  p_purge.add_argument(
  "--confirm",
@@ -415,6 +511,17 @@ def build_parser() -> argparse.ArgumentParser:
  )
  p_build.set_defaults(func=cmd_build)

+ p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
+ _add_common_corpus_arg(p_extract)
+ p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
+ p_extract.add_argument(
+ "--step",
+ action="append",
+ default=None,
+ help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
+ )
+ p_extract.set_defaults(func=cmd_extract)
+
  p_query = sub.add_parser("query", help="Run a retrieval query.")
  _add_common_corpus_arg(p_query)
  p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
@@ -450,7 +557,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
  :return: Exit code.
  :rtype: int
  """
-
  parser = build_parser()
  arguments = parser.parse_args(argument_list)
  try:
@@ -460,6 +566,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
  FileExistsError,
  KeyError,
  ValueError,
+ ExtractionRunFatalError,
  NotImplementedError,
  ValidationError,
  ) as exception:
biblicus/constants.py CHANGED
@@ -8,3 +8,5 @@ CORPUS_DIR_NAME = ".biblicus"
  DEFAULT_RAW_DIR = "raw"
  SIDECAR_SUFFIX = ".biblicus.yml"
  RUNS_DIR_NAME = "runs"
+ EXTRACTION_RUNS_DIR_NAME = "extraction"
+ HOOK_LOGS_DIR_NAME = "hook_logs"