biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +33 -49
- biblicus/corpus.py +39 -58
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +276 -77
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +87 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -12,9 +12,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
12
12
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..extraction import ExtractionRunReference, parse_extraction_run_reference
|
|
16
15
|
from ..frontmatter import parse_front_matter
|
|
17
|
-
from ..models import
|
|
16
|
+
from ..models import (
|
|
17
|
+
Evidence,
|
|
18
|
+
ExtractionRunReference,
|
|
19
|
+
QueryBudget,
|
|
20
|
+
RetrievalResult,
|
|
21
|
+
RetrievalRun,
|
|
22
|
+
parse_extraction_run_reference,
|
|
23
|
+
)
|
|
18
24
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
19
25
|
from ..time import utc_now_iso
|
|
20
26
|
|
|
@@ -51,7 +57,9 @@ class SqliteFullTextSearchBackend:
|
|
|
51
57
|
|
|
52
58
|
backend_id = "sqlite-full-text-search"
|
|
53
59
|
|
|
54
|
-
def build_run(
|
|
60
|
+
def build_run(
|
|
61
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
62
|
+
) -> RetrievalRun:
|
|
55
63
|
"""
|
|
56
64
|
Build a full-text search version five index for the corpus.
|
|
57
65
|
|
|
@@ -64,7 +72,6 @@ class SqliteFullTextSearchBackend:
|
|
|
64
72
|
:return: Run manifest describing the build.
|
|
65
73
|
:rtype: RetrievalRun
|
|
66
74
|
"""
|
|
67
|
-
|
|
68
75
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
|
|
69
76
|
catalog = corpus.load_catalog()
|
|
70
77
|
recipe = create_recipe_manifest(
|
|
@@ -110,7 +117,6 @@ class SqliteFullTextSearchBackend:
|
|
|
110
117
|
:return: Retrieval results containing evidence.
|
|
111
118
|
:rtype: RetrievalResult
|
|
112
119
|
"""
|
|
113
|
-
|
|
114
120
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
|
|
115
121
|
db_path = _resolve_run_db_path(corpus, run)
|
|
116
122
|
candidates = _query_full_text_search_index(
|
|
@@ -156,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
156
162
|
:return: Candidate limit for backend search.
|
|
157
163
|
:rtype: int
|
|
158
164
|
"""
|
|
159
|
-
|
|
160
165
|
return max_total_items * 5
|
|
161
166
|
|
|
162
167
|
|
|
@@ -172,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
|
|
|
172
177
|
:rtype: Path
|
|
173
178
|
:raises FileNotFoundError: If the run does not have artifact paths.
|
|
174
179
|
"""
|
|
175
|
-
|
|
176
180
|
if not run.artifact_paths:
|
|
177
181
|
raise FileNotFoundError("Run has no artifact paths to query")
|
|
178
182
|
return corpus.root / run.artifact_paths[0]
|
|
@@ -188,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
|
|
|
188
192
|
:rtype: None
|
|
189
193
|
:raises RuntimeError: If full-text search version five support is unavailable.
|
|
190
194
|
"""
|
|
191
|
-
|
|
192
195
|
try:
|
|
193
196
|
cursor = conn.execute(
|
|
194
197
|
"CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
|
|
@@ -210,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
|
|
|
210
213
|
:return: None.
|
|
211
214
|
:rtype: None
|
|
212
215
|
"""
|
|
213
|
-
|
|
214
216
|
conn.execute(
|
|
215
217
|
"""
|
|
216
218
|
CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
|
|
@@ -249,7 +251,6 @@ def _build_full_text_search_index(
|
|
|
249
251
|
:return: Index statistics.
|
|
250
252
|
:rtype: dict[str, int]
|
|
251
253
|
"""
|
|
252
|
-
|
|
253
254
|
if db_path.exists():
|
|
254
255
|
db_path.unlink()
|
|
255
256
|
connection = sqlite3.connect(str(db_path))
|
|
@@ -339,7 +340,6 @@ def _load_text_from_item(
|
|
|
339
340
|
:return: Text payload or None if not text.
|
|
340
341
|
:rtype: str or None
|
|
341
342
|
"""
|
|
342
|
-
|
|
343
343
|
if extraction_reference:
|
|
344
344
|
extracted_text = corpus.read_extracted_text(
|
|
345
345
|
extractor_id=extraction_reference.extractor_id,
|
|
@@ -375,7 +375,6 @@ def _resolve_extraction_reference(
|
|
|
375
375
|
:rtype: ExtractionRunReference or None
|
|
376
376
|
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
377
377
|
"""
|
|
378
|
-
|
|
379
378
|
if not recipe_config.extraction_run:
|
|
380
379
|
return None
|
|
381
380
|
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
@@ -388,7 +387,9 @@ def _resolve_extraction_reference(
|
|
|
388
387
|
return extraction_reference
|
|
389
388
|
|
|
390
389
|
|
|
391
|
-
def _iter_chunks(
|
|
390
|
+
def _iter_chunks(
|
|
391
|
+
text: str, *, chunk_size: int, chunk_overlap: int
|
|
392
|
+
) -> Iterable[Tuple[int, int, str]]:
|
|
392
393
|
"""
|
|
393
394
|
Yield overlapping chunks of text for indexing.
|
|
394
395
|
|
|
@@ -402,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
|
|
|
402
403
|
:rtype: Iterable[tuple[int, int, str]]
|
|
403
404
|
:raises ValueError: If the overlap is greater than or equal to the chunk size.
|
|
404
405
|
"""
|
|
405
|
-
|
|
406
406
|
if chunk_overlap >= chunk_size:
|
|
407
407
|
raise ValueError("chunk_overlap must be smaller than chunk_size")
|
|
408
408
|
start_offset = 0
|
|
@@ -435,7 +435,6 @@ def _query_full_text_search_index(
|
|
|
435
435
|
:return: Evidence candidates.
|
|
436
436
|
:rtype: list[Evidence]
|
|
437
437
|
"""
|
|
438
|
-
|
|
439
438
|
connection = sqlite3.connect(str(db_path))
|
|
440
439
|
try:
|
|
441
440
|
rows = connection.execute(
|
biblicus/cli.py
CHANGED
|
@@ -14,8 +14,9 @@ from pydantic import ValidationError
|
|
|
14
14
|
|
|
15
15
|
from .backends import get_backend
|
|
16
16
|
from .corpus import Corpus
|
|
17
|
-
from .
|
|
17
|
+
from .errors import ExtractionRunFatalError
|
|
18
18
|
from .evaluation import evaluate_run, load_dataset
|
|
19
|
+
from .extraction import build_extraction_run
|
|
19
20
|
from .models import QueryBudget
|
|
20
21
|
from .uris import corpus_ref_to_path
|
|
21
22
|
|
|
@@ -29,7 +30,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
|
|
|
29
30
|
:return: None.
|
|
30
31
|
:rtype: None
|
|
31
32
|
"""
|
|
32
|
-
|
|
33
33
|
parser.add_argument(
|
|
34
34
|
"--corpus",
|
|
35
35
|
type=str,
|
|
@@ -51,7 +51,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
|
|
|
51
51
|
:return: Exit code.
|
|
52
52
|
:rtype: int
|
|
53
53
|
"""
|
|
54
|
-
|
|
55
54
|
corpus_path = corpus_ref_to_path(arguments.path)
|
|
56
55
|
corpus = Corpus.init(corpus_path, force=arguments.force)
|
|
57
56
|
print(f"Initialized corpus at {corpus.root}")
|
|
@@ -69,7 +68,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
|
|
|
69
68
|
:return: Deduplicated tag list.
|
|
70
69
|
:rtype: list[str]
|
|
71
70
|
"""
|
|
72
|
-
|
|
73
71
|
parsed_tags: List[str] = []
|
|
74
72
|
if raw:
|
|
75
73
|
parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
|
|
@@ -94,7 +92,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
|
|
|
94
92
|
:return: Exit code.
|
|
95
93
|
:rtype: int
|
|
96
94
|
"""
|
|
97
|
-
|
|
98
95
|
corpus = (
|
|
99
96
|
Corpus.open(arguments.corpus)
|
|
100
97
|
if getattr(arguments, "corpus", None)
|
|
@@ -135,7 +132,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
|
|
|
135
132
|
:return: Exit code.
|
|
136
133
|
:rtype: int
|
|
137
134
|
"""
|
|
138
|
-
|
|
139
135
|
corpus = (
|
|
140
136
|
Corpus.open(arguments.corpus)
|
|
141
137
|
if getattr(arguments, "corpus", None)
|
|
@@ -157,7 +153,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
|
|
|
157
153
|
:return: Exit code.
|
|
158
154
|
:rtype: int
|
|
159
155
|
"""
|
|
160
|
-
|
|
161
156
|
corpus = (
|
|
162
157
|
Corpus.open(arguments.corpus)
|
|
163
158
|
if getattr(arguments, "corpus", None)
|
|
@@ -177,7 +172,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
|
|
|
177
172
|
:return: Exit code.
|
|
178
173
|
:rtype: int
|
|
179
174
|
"""
|
|
180
|
-
|
|
181
175
|
corpus = (
|
|
182
176
|
Corpus.open(arguments.corpus)
|
|
183
177
|
if getattr(arguments, "corpus", None)
|
|
@@ -197,7 +191,6 @@ def cmd_import_tree(arguments: argparse.Namespace) -> int:
|
|
|
197
191
|
:return: Exit code.
|
|
198
192
|
:rtype: int
|
|
199
193
|
"""
|
|
200
|
-
|
|
201
194
|
corpus = (
|
|
202
195
|
Corpus.open(arguments.corpus)
|
|
203
196
|
if getattr(arguments, "corpus", None)
|
|
@@ -218,7 +211,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
|
218
211
|
:return: Exit code.
|
|
219
212
|
:rtype: int
|
|
220
213
|
"""
|
|
221
|
-
|
|
222
214
|
corpus = (
|
|
223
215
|
Corpus.open(arguments.corpus)
|
|
224
216
|
if getattr(arguments, "corpus", None)
|
|
@@ -241,7 +233,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
241
233
|
:rtype: dict[str, object]
|
|
242
234
|
:raises ValueError: If any entry is not key=value.
|
|
243
235
|
"""
|
|
244
|
-
|
|
245
236
|
config: Dict[str, object] = {}
|
|
246
237
|
for item in pairs or []:
|
|
247
238
|
if "=" not in item:
|
|
@@ -264,7 +255,7 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
264
255
|
|
|
265
256
|
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
266
257
|
"""
|
|
267
|
-
Parse a
|
|
258
|
+
Parse a pipeline step specification.
|
|
268
259
|
|
|
269
260
|
:param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
|
|
270
261
|
:type raw_step: str
|
|
@@ -272,7 +263,6 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
|
272
263
|
:rtype: tuple[str, dict[str, object]]
|
|
273
264
|
:raises ValueError: If the step spec is invalid.
|
|
274
265
|
"""
|
|
275
|
-
|
|
276
266
|
raw_step = raw_step.strip()
|
|
277
267
|
if not raw_step:
|
|
278
268
|
raise ValueError("Step spec must be non-empty")
|
|
@@ -309,7 +299,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
309
299
|
:return: Query budget instance.
|
|
310
300
|
:rtype: QueryBudget
|
|
311
301
|
"""
|
|
312
|
-
|
|
313
302
|
return QueryBudget(
|
|
314
303
|
max_total_items=arguments.max_total_items,
|
|
315
304
|
max_total_characters=arguments.max_total_characters,
|
|
@@ -326,7 +315,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
326
315
|
:return: Exit code.
|
|
327
316
|
:rtype: int
|
|
328
317
|
"""
|
|
329
|
-
|
|
330
318
|
corpus = (
|
|
331
319
|
Corpus.open(arguments.corpus)
|
|
332
320
|
if getattr(arguments, "corpus", None)
|
|
@@ -341,31 +329,29 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
341
329
|
|
|
342
330
|
def cmd_extract(arguments: argparse.Namespace) -> int:
|
|
343
331
|
"""
|
|
344
|
-
Build a text extraction run for the corpus.
|
|
332
|
+
Build a text extraction run for the corpus using a pipeline of extractors.
|
|
345
333
|
|
|
346
334
|
:param arguments: Parsed command-line interface arguments.
|
|
347
335
|
:type arguments: argparse.Namespace
|
|
348
336
|
:return: Exit code.
|
|
349
337
|
:rtype: int
|
|
350
338
|
"""
|
|
351
|
-
|
|
352
339
|
corpus = (
|
|
353
340
|
Corpus.open(arguments.corpus)
|
|
354
341
|
if getattr(arguments, "corpus", None)
|
|
355
342
|
else Corpus.find(Path.cwd())
|
|
356
343
|
)
|
|
357
|
-
|
|
358
|
-
if
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
config = {"steps": steps}
|
|
344
|
+
raw_steps = list(arguments.step or [])
|
|
345
|
+
if not raw_steps:
|
|
346
|
+
raise ValueError("Pipeline extraction requires at least one --step")
|
|
347
|
+
steps: List[Dict[str, object]] = []
|
|
348
|
+
for raw_step in raw_steps:
|
|
349
|
+
extractor_id, step_config = _parse_step_spec(raw_step)
|
|
350
|
+
steps.append({"extractor_id": extractor_id, "config": step_config})
|
|
351
|
+
config = {"steps": steps}
|
|
366
352
|
manifest = build_extraction_run(
|
|
367
353
|
corpus,
|
|
368
|
-
extractor_id=
|
|
354
|
+
extractor_id="pipeline",
|
|
369
355
|
recipe_name=arguments.recipe_name,
|
|
370
356
|
config=config,
|
|
371
357
|
)
|
|
@@ -382,7 +368,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
|
|
|
382
368
|
:return: Exit code.
|
|
383
369
|
:rtype: int
|
|
384
370
|
"""
|
|
385
|
-
|
|
386
371
|
corpus = (
|
|
387
372
|
Corpus.open(arguments.corpus)
|
|
388
373
|
if getattr(arguments, "corpus", None)
|
|
@@ -413,7 +398,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
|
|
|
413
398
|
:return: Exit code.
|
|
414
399
|
:rtype: int
|
|
415
400
|
"""
|
|
416
|
-
|
|
417
401
|
corpus = (
|
|
418
402
|
Corpus.open(arguments.corpus)
|
|
419
403
|
if getattr(arguments, "corpus", None)
|
|
@@ -437,7 +421,6 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
437
421
|
:return: Argument parser instance.
|
|
438
422
|
:rtype: argparse.ArgumentParser
|
|
439
423
|
"""
|
|
440
|
-
|
|
441
424
|
parser = argparse.ArgumentParser(
|
|
442
425
|
prog="biblicus",
|
|
443
426
|
description="Biblicus command-line interface (minimum viable product)",
|
|
@@ -457,14 +440,18 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
457
440
|
|
|
458
441
|
p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
|
|
459
442
|
p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
|
|
460
|
-
p_init.add_argument(
|
|
443
|
+
p_init.add_argument(
|
|
444
|
+
"--force", action="store_true", help="Overwrite existing config if present."
|
|
445
|
+
)
|
|
461
446
|
p_init.set_defaults(func=cmd_init)
|
|
462
447
|
|
|
463
448
|
p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
|
|
464
449
|
_add_common_corpus_arg(p_ingest)
|
|
465
450
|
p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
|
|
466
451
|
p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
|
|
467
|
-
p_ingest.add_argument(
|
|
452
|
+
p_ingest.add_argument(
|
|
453
|
+
"--stdin", action="store_true", help="Read text to ingest from standard input."
|
|
454
|
+
)
|
|
468
455
|
p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
|
|
469
456
|
p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
|
|
470
457
|
p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
|
|
@@ -480,18 +467,26 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
480
467
|
p_show.add_argument("id", help="Item identifier (universally unique identifier).")
|
|
481
468
|
p_show.set_defaults(func=cmd_show)
|
|
482
469
|
|
|
483
|
-
p_reindex = sub.add_parser(
|
|
470
|
+
p_reindex = sub.add_parser(
|
|
471
|
+
"reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
|
|
472
|
+
)
|
|
484
473
|
_add_common_corpus_arg(p_reindex)
|
|
485
474
|
p_reindex.set_defaults(func=cmd_reindex)
|
|
486
475
|
|
|
487
476
|
p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
|
|
488
477
|
_add_common_corpus_arg(p_import_tree)
|
|
489
478
|
p_import_tree.add_argument("path", help="Folder tree root to import.")
|
|
490
|
-
p_import_tree.add_argument(
|
|
491
|
-
|
|
479
|
+
p_import_tree.add_argument(
|
|
480
|
+
"--tags", default=None, help="Comma-separated tags to apply to imported items."
|
|
481
|
+
)
|
|
482
|
+
p_import_tree.add_argument(
|
|
483
|
+
"--tag", action="append", help="Repeatable tag to apply to imported items."
|
|
484
|
+
)
|
|
492
485
|
p_import_tree.set_defaults(func=cmd_import_tree)
|
|
493
486
|
|
|
494
|
-
p_purge = sub.add_parser(
|
|
487
|
+
p_purge = sub.add_parser(
|
|
488
|
+
"purge", help="Delete all items and derived files (requires confirmation)."
|
|
489
|
+
)
|
|
495
490
|
_add_common_corpus_arg(p_purge)
|
|
496
491
|
p_purge.add_argument(
|
|
497
492
|
"--confirm",
|
|
@@ -518,23 +513,12 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
518
513
|
|
|
519
514
|
p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
|
|
520
515
|
_add_common_corpus_arg(p_extract)
|
|
521
|
-
p_extract.add_argument(
|
|
522
|
-
"--extractor",
|
|
523
|
-
required=True,
|
|
524
|
-
help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
|
|
525
|
-
)
|
|
526
516
|
p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
|
|
527
517
|
p_extract.add_argument(
|
|
528
518
|
"--step",
|
|
529
519
|
action="append",
|
|
530
520
|
default=None,
|
|
531
|
-
help="
|
|
532
|
-
)
|
|
533
|
-
p_extract.add_argument(
|
|
534
|
-
"--config",
|
|
535
|
-
action="append",
|
|
536
|
-
default=None,
|
|
537
|
-
help="Extractor config as key=value (repeatable).",
|
|
521
|
+
help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
|
|
538
522
|
)
|
|
539
523
|
p_extract.set_defaults(func=cmd_extract)
|
|
540
524
|
|
|
@@ -573,7 +557,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
573
557
|
:return: Exit code.
|
|
574
558
|
:rtype: int
|
|
575
559
|
"""
|
|
576
|
-
|
|
577
560
|
parser = build_parser()
|
|
578
561
|
arguments = parser.parse_args(argument_list)
|
|
579
562
|
try:
|
|
@@ -583,6 +566,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
583
566
|
FileExistsError,
|
|
584
567
|
KeyError,
|
|
585
568
|
ValueError,
|
|
569
|
+
ExtractionRunFatalError,
|
|
586
570
|
NotImplementedError,
|
|
587
571
|
ValidationError,
|
|
588
572
|
) as exception:
|