biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +33 -49
  11. biblicus/corpus.py +39 -58
  12. biblicus/errors.py +15 -0
  13. biblicus/evaluation.py +4 -8
  14. biblicus/extraction.py +276 -77
  15. biblicus/extractors/__init__.py +14 -3
  16. biblicus/extractors/base.py +12 -5
  17. biblicus/extractors/metadata_text.py +13 -5
  18. biblicus/extractors/openai_stt.py +180 -0
  19. biblicus/extractors/pass_through_text.py +16 -6
  20. biblicus/extractors/pdf_text.py +100 -0
  21. biblicus/extractors/pipeline.py +105 -0
  22. biblicus/extractors/rapidocr_text.py +129 -0
  23. biblicus/extractors/select_longest_text.py +105 -0
  24. biblicus/extractors/select_text.py +100 -0
  25. biblicus/extractors/unstructured_text.py +100 -0
  26. biblicus/frontmatter.py +0 -3
  27. biblicus/hook_logging.py +0 -5
  28. biblicus/hook_manager.py +3 -5
  29. biblicus/hooks.py +3 -7
  30. biblicus/ignore.py +0 -3
  31. biblicus/models.py +87 -0
  32. biblicus/retrieval.py +0 -4
  33. biblicus/sources.py +44 -9
  34. biblicus/time.py +0 -1
  35. biblicus/uris.py +3 -4
  36. biblicus/user_config.py +138 -0
  37. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
  38. biblicus-0.3.0.dist-info/RECORD +44 -0
  39. biblicus/extractors/cascade.py +0 -101
  40. biblicus-0.2.0.dist-info/RECORD +0 -32
  41. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  42. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
@@ -12,9 +12,15 @@ from pydantic import BaseModel, ConfigDict, Field
12
12
 
13
13
  from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
14
14
  from ..corpus import Corpus
15
- from ..extraction import ExtractionRunReference, parse_extraction_run_reference
16
15
  from ..frontmatter import parse_front_matter
17
- from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
16
+ from ..models import (
17
+ Evidence,
18
+ ExtractionRunReference,
19
+ QueryBudget,
20
+ RetrievalResult,
21
+ RetrievalRun,
22
+ parse_extraction_run_reference,
23
+ )
18
24
  from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
19
25
  from ..time import utc_now_iso
20
26
 
@@ -51,7 +57,9 @@ class SqliteFullTextSearchBackend:
51
57
 
52
58
  backend_id = "sqlite-full-text-search"
53
59
 
54
- def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
60
+ def build_run(
61
+ self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
62
+ ) -> RetrievalRun:
55
63
  """
56
64
  Build a full-text search version five index for the corpus.
57
65
 
@@ -64,7 +72,6 @@ class SqliteFullTextSearchBackend:
64
72
  :return: Run manifest describing the build.
65
73
  :rtype: RetrievalRun
66
74
  """
67
-
68
75
  recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
69
76
  catalog = corpus.load_catalog()
70
77
  recipe = create_recipe_manifest(
@@ -110,7 +117,6 @@ class SqliteFullTextSearchBackend:
110
117
  :return: Retrieval results containing evidence.
111
118
  :rtype: RetrievalResult
112
119
  """
113
-
114
120
  recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
115
121
  db_path = _resolve_run_db_path(corpus, run)
116
122
  candidates = _query_full_text_search_index(
@@ -156,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
156
162
  :return: Candidate limit for backend search.
157
163
  :rtype: int
158
164
  """
159
-
160
165
  return max_total_items * 5
161
166
 
162
167
 
@@ -172,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
172
177
  :rtype: Path
173
178
  :raises FileNotFoundError: If the run does not have artifact paths.
174
179
  """
175
-
176
180
  if not run.artifact_paths:
177
181
  raise FileNotFoundError("Run has no artifact paths to query")
178
182
  return corpus.root / run.artifact_paths[0]
@@ -188,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
188
192
  :rtype: None
189
193
  :raises RuntimeError: If full-text search version five support is unavailable.
190
194
  """
191
-
192
195
  try:
193
196
  cursor = conn.execute(
194
197
  "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
@@ -210,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
210
213
  :return: None.
211
214
  :rtype: None
212
215
  """
213
-
214
216
  conn.execute(
215
217
  """
216
218
  CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
@@ -249,7 +251,6 @@ def _build_full_text_search_index(
249
251
  :return: Index statistics.
250
252
  :rtype: dict[str, int]
251
253
  """
252
-
253
254
  if db_path.exists():
254
255
  db_path.unlink()
255
256
  connection = sqlite3.connect(str(db_path))
@@ -339,7 +340,6 @@ def _load_text_from_item(
339
340
  :return: Text payload or None if not text.
340
341
  :rtype: str or None
341
342
  """
342
-
343
343
  if extraction_reference:
344
344
  extracted_text = corpus.read_extracted_text(
345
345
  extractor_id=extraction_reference.extractor_id,
@@ -375,7 +375,6 @@ def _resolve_extraction_reference(
375
375
  :rtype: ExtractionRunReference or None
376
376
  :raises FileNotFoundError: If an extraction run is referenced but not present.
377
377
  """
378
-
379
378
  if not recipe_config.extraction_run:
380
379
  return None
381
380
  extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
@@ -388,7 +387,9 @@ def _resolve_extraction_reference(
388
387
  return extraction_reference
389
388
 
390
389
 
391
- def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
390
+ def _iter_chunks(
391
+ text: str, *, chunk_size: int, chunk_overlap: int
392
+ ) -> Iterable[Tuple[int, int, str]]:
392
393
  """
393
394
  Yield overlapping chunks of text for indexing.
394
395
 
@@ -402,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
402
403
  :rtype: Iterable[tuple[int, int, str]]
403
404
  :raises ValueError: If the overlap is greater than or equal to the chunk size.
404
405
  """
405
-
406
406
  if chunk_overlap >= chunk_size:
407
407
  raise ValueError("chunk_overlap must be smaller than chunk_size")
408
408
  start_offset = 0
@@ -435,7 +435,6 @@ def _query_full_text_search_index(
435
435
  :return: Evidence candidates.
436
436
  :rtype: list[Evidence]
437
437
  """
438
-
439
438
  connection = sqlite3.connect(str(db_path))
440
439
  try:
441
440
  rows = connection.execute(
biblicus/cli.py CHANGED
@@ -14,8 +14,9 @@ from pydantic import ValidationError
14
14
 
15
15
  from .backends import get_backend
16
16
  from .corpus import Corpus
17
- from .extraction import build_extraction_run
17
+ from .errors import ExtractionRunFatalError
18
18
  from .evaluation import evaluate_run, load_dataset
19
+ from .extraction import build_extraction_run
19
20
  from .models import QueryBudget
20
21
  from .uris import corpus_ref_to_path
21
22
 
@@ -29,7 +30,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
29
30
  :return: None.
30
31
  :rtype: None
31
32
  """
32
-
33
33
  parser.add_argument(
34
34
  "--corpus",
35
35
  type=str,
@@ -51,7 +51,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
51
51
  :return: Exit code.
52
52
  :rtype: int
53
53
  """
54
-
55
54
  corpus_path = corpus_ref_to_path(arguments.path)
56
55
  corpus = Corpus.init(corpus_path, force=arguments.force)
57
56
  print(f"Initialized corpus at {corpus.root}")
@@ -69,7 +68,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
69
68
  :return: Deduplicated tag list.
70
69
  :rtype: list[str]
71
70
  """
72
-
73
71
  parsed_tags: List[str] = []
74
72
  if raw:
75
73
  parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
@@ -94,7 +92,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
94
92
  :return: Exit code.
95
93
  :rtype: int
96
94
  """
97
-
98
95
  corpus = (
99
96
  Corpus.open(arguments.corpus)
100
97
  if getattr(arguments, "corpus", None)
@@ -135,7 +132,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
135
132
  :return: Exit code.
136
133
  :rtype: int
137
134
  """
138
-
139
135
  corpus = (
140
136
  Corpus.open(arguments.corpus)
141
137
  if getattr(arguments, "corpus", None)
@@ -157,7 +153,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
157
153
  :return: Exit code.
158
154
  :rtype: int
159
155
  """
160
-
161
156
  corpus = (
162
157
  Corpus.open(arguments.corpus)
163
158
  if getattr(arguments, "corpus", None)
@@ -177,7 +172,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
177
172
  :return: Exit code.
178
173
  :rtype: int
179
174
  """
180
-
181
175
  corpus = (
182
176
  Corpus.open(arguments.corpus)
183
177
  if getattr(arguments, "corpus", None)
@@ -197,7 +191,6 @@ def cmd_import_tree(arguments: argparse.Namespace) -> int:
197
191
  :return: Exit code.
198
192
  :rtype: int
199
193
  """
200
-
201
194
  corpus = (
202
195
  Corpus.open(arguments.corpus)
203
196
  if getattr(arguments, "corpus", None)
@@ -218,7 +211,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
218
211
  :return: Exit code.
219
212
  :rtype: int
220
213
  """
221
-
222
214
  corpus = (
223
215
  Corpus.open(arguments.corpus)
224
216
  if getattr(arguments, "corpus", None)
@@ -241,7 +233,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
241
233
  :rtype: dict[str, object]
242
234
  :raises ValueError: If any entry is not key=value.
243
235
  """
244
-
245
236
  config: Dict[str, object] = {}
246
237
  for item in pairs or []:
247
238
  if "=" not in item:
@@ -264,7 +255,7 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
264
255
 
265
256
  def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
266
257
  """
267
- Parse a cascade step specification.
258
+ Parse a pipeline step specification.
268
259
 
269
260
  :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
270
261
  :type raw_step: str
@@ -272,7 +263,6 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
272
263
  :rtype: tuple[str, dict[str, object]]
273
264
  :raises ValueError: If the step spec is invalid.
274
265
  """
275
-
276
266
  raw_step = raw_step.strip()
277
267
  if not raw_step:
278
268
  raise ValueError("Step spec must be non-empty")
@@ -309,7 +299,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
309
299
  :return: Query budget instance.
310
300
  :rtype: QueryBudget
311
301
  """
312
-
313
302
  return QueryBudget(
314
303
  max_total_items=arguments.max_total_items,
315
304
  max_total_characters=arguments.max_total_characters,
@@ -326,7 +315,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
326
315
  :return: Exit code.
327
316
  :rtype: int
328
317
  """
329
-
330
318
  corpus = (
331
319
  Corpus.open(arguments.corpus)
332
320
  if getattr(arguments, "corpus", None)
@@ -341,31 +329,29 @@ def cmd_build(arguments: argparse.Namespace) -> int:
341
329
 
342
330
  def cmd_extract(arguments: argparse.Namespace) -> int:
343
331
  """
344
- Build a text extraction run for the corpus.
332
+ Build a text extraction run for the corpus using a pipeline of extractors.
345
333
 
346
334
  :param arguments: Parsed command-line interface arguments.
347
335
  :type arguments: argparse.Namespace
348
336
  :return: Exit code.
349
337
  :rtype: int
350
338
  """
351
-
352
339
  corpus = (
353
340
  Corpus.open(arguments.corpus)
354
341
  if getattr(arguments, "corpus", None)
355
342
  else Corpus.find(Path.cwd())
356
343
  )
357
- config = _parse_config_pairs(arguments.config)
358
- if getattr(arguments, "step", None):
359
- if arguments.extractor != "cascade":
360
- raise ValueError("--step is only supported for the cascade extractor")
361
- steps: List[Dict[str, object]] = []
362
- for raw_step in arguments.step:
363
- extractor_id, step_config = _parse_step_spec(raw_step)
364
- steps.append({"extractor_id": extractor_id, "config": step_config})
365
- config = {"steps": steps}
344
+ raw_steps = list(arguments.step or [])
345
+ if not raw_steps:
346
+ raise ValueError("Pipeline extraction requires at least one --step")
347
+ steps: List[Dict[str, object]] = []
348
+ for raw_step in raw_steps:
349
+ extractor_id, step_config = _parse_step_spec(raw_step)
350
+ steps.append({"extractor_id": extractor_id, "config": step_config})
351
+ config = {"steps": steps}
366
352
  manifest = build_extraction_run(
367
353
  corpus,
368
- extractor_id=arguments.extractor,
354
+ extractor_id="pipeline",
369
355
  recipe_name=arguments.recipe_name,
370
356
  config=config,
371
357
  )
@@ -382,7 +368,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
382
368
  :return: Exit code.
383
369
  :rtype: int
384
370
  """
385
-
386
371
  corpus = (
387
372
  Corpus.open(arguments.corpus)
388
373
  if getattr(arguments, "corpus", None)
@@ -413,7 +398,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
413
398
  :return: Exit code.
414
399
  :rtype: int
415
400
  """
416
-
417
401
  corpus = (
418
402
  Corpus.open(arguments.corpus)
419
403
  if getattr(arguments, "corpus", None)
@@ -437,7 +421,6 @@ def build_parser() -> argparse.ArgumentParser:
437
421
  :return: Argument parser instance.
438
422
  :rtype: argparse.ArgumentParser
439
423
  """
440
-
441
424
  parser = argparse.ArgumentParser(
442
425
  prog="biblicus",
443
426
  description="Biblicus command-line interface (minimum viable product)",
@@ -457,14 +440,18 @@ def build_parser() -> argparse.ArgumentParser:
457
440
 
458
441
  p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
459
442
  p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
460
- p_init.add_argument("--force", action="store_true", help="Overwrite existing config if present.")
443
+ p_init.add_argument(
444
+ "--force", action="store_true", help="Overwrite existing config if present."
445
+ )
461
446
  p_init.set_defaults(func=cmd_init)
462
447
 
463
448
  p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
464
449
  _add_common_corpus_arg(p_ingest)
465
450
  p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
466
451
  p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
467
- p_ingest.add_argument("--stdin", action="store_true", help="Read text to ingest from standard input.")
452
+ p_ingest.add_argument(
453
+ "--stdin", action="store_true", help="Read text to ingest from standard input."
454
+ )
468
455
  p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
469
456
  p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
470
457
  p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
@@ -480,18 +467,26 @@ def build_parser() -> argparse.ArgumentParser:
480
467
  p_show.add_argument("id", help="Item identifier (universally unique identifier).")
481
468
  p_show.set_defaults(func=cmd_show)
482
469
 
483
- p_reindex = sub.add_parser("reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus.")
470
+ p_reindex = sub.add_parser(
471
+ "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
472
+ )
484
473
  _add_common_corpus_arg(p_reindex)
485
474
  p_reindex.set_defaults(func=cmd_reindex)
486
475
 
487
476
  p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
488
477
  _add_common_corpus_arg(p_import_tree)
489
478
  p_import_tree.add_argument("path", help="Folder tree root to import.")
490
- p_import_tree.add_argument("--tags", default=None, help="Comma-separated tags to apply to imported items.")
491
- p_import_tree.add_argument("--tag", action="append", help="Repeatable tag to apply to imported items.")
479
+ p_import_tree.add_argument(
480
+ "--tags", default=None, help="Comma-separated tags to apply to imported items."
481
+ )
482
+ p_import_tree.add_argument(
483
+ "--tag", action="append", help="Repeatable tag to apply to imported items."
484
+ )
492
485
  p_import_tree.set_defaults(func=cmd_import_tree)
493
486
 
494
- p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
487
+ p_purge = sub.add_parser(
488
+ "purge", help="Delete all items and derived files (requires confirmation)."
489
+ )
495
490
  _add_common_corpus_arg(p_purge)
496
491
  p_purge.add_argument(
497
492
  "--confirm",
@@ -518,23 +513,12 @@ def build_parser() -> argparse.ArgumentParser:
518
513
 
519
514
  p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
520
515
  _add_common_corpus_arg(p_extract)
521
- p_extract.add_argument(
522
- "--extractor",
523
- required=True,
524
- help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
525
- )
526
516
  p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
527
517
  p_extract.add_argument(
528
518
  "--step",
529
519
  action="append",
530
520
  default=None,
531
- help="Cascade step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
532
- )
533
- p_extract.add_argument(
534
- "--config",
535
- action="append",
536
- default=None,
537
- help="Extractor config as key=value (repeatable).",
521
+ help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
538
522
  )
539
523
  p_extract.set_defaults(func=cmd_extract)
540
524
 
@@ -573,7 +557,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
573
557
  :return: Exit code.
574
558
  :rtype: int
575
559
  """
576
-
577
560
  parser = build_parser()
578
561
  arguments = parser.parse_args(argument_list)
579
562
  try:
@@ -583,6 +566,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
583
566
  FileExistsError,
584
567
  KeyError,
585
568
  ValueError,
569
+ ExtractionRunFatalError,
586
570
  NotImplementedError,
587
571
  ValidationError,
588
572
  ) as exception: