biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +177 -53
- biblicus/corpus.py +209 -59
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +280 -79
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +118 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +1 -2
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
- biblicus-0.4.0.dist-info/RECORD +45 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -12,9 +12,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
12
12
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..extraction import ExtractionRunReference, parse_extraction_run_reference
|
|
16
15
|
from ..frontmatter import parse_front_matter
|
|
17
|
-
from ..models import
|
|
16
|
+
from ..models import (
|
|
17
|
+
Evidence,
|
|
18
|
+
ExtractionRunReference,
|
|
19
|
+
QueryBudget,
|
|
20
|
+
RetrievalResult,
|
|
21
|
+
RetrievalRun,
|
|
22
|
+
parse_extraction_run_reference,
|
|
23
|
+
)
|
|
18
24
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
19
25
|
from ..time import utc_now_iso
|
|
20
26
|
|
|
@@ -51,7 +57,9 @@ class SqliteFullTextSearchBackend:
|
|
|
51
57
|
|
|
52
58
|
backend_id = "sqlite-full-text-search"
|
|
53
59
|
|
|
54
|
-
def build_run(
|
|
60
|
+
def build_run(
|
|
61
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
62
|
+
) -> RetrievalRun:
|
|
55
63
|
"""
|
|
56
64
|
Build a full-text search version five index for the corpus.
|
|
57
65
|
|
|
@@ -64,7 +72,6 @@ class SqliteFullTextSearchBackend:
|
|
|
64
72
|
:return: Run manifest describing the build.
|
|
65
73
|
:rtype: RetrievalRun
|
|
66
74
|
"""
|
|
67
|
-
|
|
68
75
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
|
|
69
76
|
catalog = corpus.load_catalog()
|
|
70
77
|
recipe = create_recipe_manifest(
|
|
@@ -110,7 +117,6 @@ class SqliteFullTextSearchBackend:
|
|
|
110
117
|
:return: Retrieval results containing evidence.
|
|
111
118
|
:rtype: RetrievalResult
|
|
112
119
|
"""
|
|
113
|
-
|
|
114
120
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
|
|
115
121
|
db_path = _resolve_run_db_path(corpus, run)
|
|
116
122
|
candidates = _query_full_text_search_index(
|
|
@@ -156,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
156
162
|
:return: Candidate limit for backend search.
|
|
157
163
|
:rtype: int
|
|
158
164
|
"""
|
|
159
|
-
|
|
160
165
|
return max_total_items * 5
|
|
161
166
|
|
|
162
167
|
|
|
@@ -172,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
|
|
|
172
177
|
:rtype: Path
|
|
173
178
|
:raises FileNotFoundError: If the run does not have artifact paths.
|
|
174
179
|
"""
|
|
175
|
-
|
|
176
180
|
if not run.artifact_paths:
|
|
177
181
|
raise FileNotFoundError("Run has no artifact paths to query")
|
|
178
182
|
return corpus.root / run.artifact_paths[0]
|
|
@@ -188,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
|
|
|
188
192
|
:rtype: None
|
|
189
193
|
:raises RuntimeError: If full-text search version five support is unavailable.
|
|
190
194
|
"""
|
|
191
|
-
|
|
192
195
|
try:
|
|
193
196
|
cursor = conn.execute(
|
|
194
197
|
"CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
|
|
@@ -210,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
|
|
|
210
213
|
:return: None.
|
|
211
214
|
:rtype: None
|
|
212
215
|
"""
|
|
213
|
-
|
|
214
216
|
conn.execute(
|
|
215
217
|
"""
|
|
216
218
|
CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
|
|
@@ -249,7 +251,6 @@ def _build_full_text_search_index(
|
|
|
249
251
|
:return: Index statistics.
|
|
250
252
|
:rtype: dict[str, int]
|
|
251
253
|
"""
|
|
252
|
-
|
|
253
254
|
if db_path.exists():
|
|
254
255
|
db_path.unlink()
|
|
255
256
|
connection = sqlite3.connect(str(db_path))
|
|
@@ -339,7 +340,6 @@ def _load_text_from_item(
|
|
|
339
340
|
:return: Text payload or None if not text.
|
|
340
341
|
:rtype: str or None
|
|
341
342
|
"""
|
|
342
|
-
|
|
343
343
|
if extraction_reference:
|
|
344
344
|
extracted_text = corpus.read_extracted_text(
|
|
345
345
|
extractor_id=extraction_reference.extractor_id,
|
|
@@ -375,7 +375,6 @@ def _resolve_extraction_reference(
|
|
|
375
375
|
:rtype: ExtractionRunReference or None
|
|
376
376
|
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
377
377
|
"""
|
|
378
|
-
|
|
379
378
|
if not recipe_config.extraction_run:
|
|
380
379
|
return None
|
|
381
380
|
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
@@ -388,7 +387,9 @@ def _resolve_extraction_reference(
|
|
|
388
387
|
return extraction_reference
|
|
389
388
|
|
|
390
389
|
|
|
391
|
-
def _iter_chunks(
|
|
390
|
+
def _iter_chunks(
|
|
391
|
+
text: str, *, chunk_size: int, chunk_overlap: int
|
|
392
|
+
) -> Iterable[Tuple[int, int, str]]:
|
|
392
393
|
"""
|
|
393
394
|
Yield overlapping chunks of text for indexing.
|
|
394
395
|
|
|
@@ -402,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
|
|
|
402
403
|
:rtype: Iterable[tuple[int, int, str]]
|
|
403
404
|
:raises ValueError: If the overlap is greater than or equal to the chunk size.
|
|
404
405
|
"""
|
|
405
|
-
|
|
406
406
|
if chunk_overlap >= chunk_size:
|
|
407
407
|
raise ValueError("chunk_overlap must be smaller than chunk_size")
|
|
408
408
|
start_offset = 0
|
|
@@ -435,7 +435,6 @@ def _query_full_text_search_index(
|
|
|
435
435
|
:return: Evidence candidates.
|
|
436
436
|
:rtype: list[Evidence]
|
|
437
437
|
"""
|
|
438
|
-
|
|
439
438
|
connection = sqlite3.connect(str(db_path))
|
|
440
439
|
try:
|
|
441
440
|
rows = connection.execute(
|
biblicus/cli.py
CHANGED
|
@@ -14,9 +14,11 @@ from pydantic import ValidationError
|
|
|
14
14
|
|
|
15
15
|
from .backends import get_backend
|
|
16
16
|
from .corpus import Corpus
|
|
17
|
-
from .
|
|
17
|
+
from .crawl import CrawlRequest, crawl_into_corpus
|
|
18
|
+
from .errors import ExtractionRunFatalError
|
|
18
19
|
from .evaluation import evaluate_run, load_dataset
|
|
19
|
-
from .
|
|
20
|
+
from .extraction import build_extraction_run
|
|
21
|
+
from .models import QueryBudget, parse_extraction_run_reference
|
|
20
22
|
from .uris import corpus_ref_to_path
|
|
21
23
|
|
|
22
24
|
|
|
@@ -29,7 +31,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
|
|
|
29
31
|
:return: None.
|
|
30
32
|
:rtype: None
|
|
31
33
|
"""
|
|
32
|
-
|
|
33
34
|
parser.add_argument(
|
|
34
35
|
"--corpus",
|
|
35
36
|
type=str,
|
|
@@ -51,7 +52,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
|
|
|
51
52
|
:return: Exit code.
|
|
52
53
|
:rtype: int
|
|
53
54
|
"""
|
|
54
|
-
|
|
55
55
|
corpus_path = corpus_ref_to_path(arguments.path)
|
|
56
56
|
corpus = Corpus.init(corpus_path, force=arguments.force)
|
|
57
57
|
print(f"Initialized corpus at {corpus.root}")
|
|
@@ -69,7 +69,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
|
|
|
69
69
|
:return: Deduplicated tag list.
|
|
70
70
|
:rtype: list[str]
|
|
71
71
|
"""
|
|
72
|
-
|
|
73
72
|
parsed_tags: List[str] = []
|
|
74
73
|
if raw:
|
|
75
74
|
parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
|
|
@@ -94,7 +93,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
|
|
|
94
93
|
:return: Exit code.
|
|
95
94
|
:rtype: int
|
|
96
95
|
"""
|
|
97
|
-
|
|
98
96
|
corpus = (
|
|
99
97
|
Corpus.open(arguments.corpus)
|
|
100
98
|
if getattr(arguments, "corpus", None)
|
|
@@ -135,7 +133,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
|
|
|
135
133
|
:return: Exit code.
|
|
136
134
|
:rtype: int
|
|
137
135
|
"""
|
|
138
|
-
|
|
139
136
|
corpus = (
|
|
140
137
|
Corpus.open(arguments.corpus)
|
|
141
138
|
if getattr(arguments, "corpus", None)
|
|
@@ -157,7 +154,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
|
|
|
157
154
|
:return: Exit code.
|
|
158
155
|
:rtype: int
|
|
159
156
|
"""
|
|
160
|
-
|
|
161
157
|
corpus = (
|
|
162
158
|
Corpus.open(arguments.corpus)
|
|
163
159
|
if getattr(arguments, "corpus", None)
|
|
@@ -177,7 +173,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
|
|
|
177
173
|
:return: Exit code.
|
|
178
174
|
:rtype: int
|
|
179
175
|
"""
|
|
180
|
-
|
|
181
176
|
corpus = (
|
|
182
177
|
Corpus.open(arguments.corpus)
|
|
183
178
|
if getattr(arguments, "corpus", None)
|
|
@@ -197,7 +192,6 @@ def cmd_import_tree(arguments: argparse.Namespace) -> int:
|
|
|
197
192
|
:return: Exit code.
|
|
198
193
|
:rtype: int
|
|
199
194
|
"""
|
|
200
|
-
|
|
201
195
|
corpus = (
|
|
202
196
|
Corpus.open(arguments.corpus)
|
|
203
197
|
if getattr(arguments, "corpus", None)
|
|
@@ -218,7 +212,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
|
218
212
|
:return: Exit code.
|
|
219
213
|
:rtype: int
|
|
220
214
|
"""
|
|
221
|
-
|
|
222
215
|
corpus = (
|
|
223
216
|
Corpus.open(arguments.corpus)
|
|
224
217
|
if getattr(arguments, "corpus", None)
|
|
@@ -241,7 +234,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
241
234
|
:rtype: dict[str, object]
|
|
242
235
|
:raises ValueError: If any entry is not key=value.
|
|
243
236
|
"""
|
|
244
|
-
|
|
245
237
|
config: Dict[str, object] = {}
|
|
246
238
|
for item in pairs or []:
|
|
247
239
|
if "=" not in item:
|
|
@@ -264,7 +256,7 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
264
256
|
|
|
265
257
|
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
266
258
|
"""
|
|
267
|
-
Parse a
|
|
259
|
+
Parse a pipeline step specification.
|
|
268
260
|
|
|
269
261
|
:param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
|
|
270
262
|
:type raw_step: str
|
|
@@ -272,7 +264,6 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
|
272
264
|
:rtype: tuple[str, dict[str, object]]
|
|
273
265
|
:raises ValueError: If the step spec is invalid.
|
|
274
266
|
"""
|
|
275
|
-
|
|
276
267
|
raw_step = raw_step.strip()
|
|
277
268
|
if not raw_step:
|
|
278
269
|
raise ValueError("Step spec must be non-empty")
|
|
@@ -309,7 +300,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
309
300
|
:return: Query budget instance.
|
|
310
301
|
:rtype: QueryBudget
|
|
311
302
|
"""
|
|
312
|
-
|
|
313
303
|
return QueryBudget(
|
|
314
304
|
max_total_items=arguments.max_total_items,
|
|
315
305
|
max_total_characters=arguments.max_total_characters,
|
|
@@ -326,7 +316,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
326
316
|
:return: Exit code.
|
|
327
317
|
:rtype: int
|
|
328
318
|
"""
|
|
329
|
-
|
|
330
319
|
corpus = (
|
|
331
320
|
Corpus.open(arguments.corpus)
|
|
332
321
|
if getattr(arguments, "corpus", None)
|
|
@@ -339,33 +328,31 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
339
328
|
return 0
|
|
340
329
|
|
|
341
330
|
|
|
342
|
-
def
|
|
331
|
+
def cmd_extract_build(arguments: argparse.Namespace) -> int:
|
|
343
332
|
"""
|
|
344
|
-
Build a text extraction run for the corpus.
|
|
333
|
+
Build a text extraction run for the corpus using a pipeline of extractors.
|
|
345
334
|
|
|
346
335
|
:param arguments: Parsed command-line interface arguments.
|
|
347
336
|
:type arguments: argparse.Namespace
|
|
348
337
|
:return: Exit code.
|
|
349
338
|
:rtype: int
|
|
350
339
|
"""
|
|
351
|
-
|
|
352
340
|
corpus = (
|
|
353
341
|
Corpus.open(arguments.corpus)
|
|
354
342
|
if getattr(arguments, "corpus", None)
|
|
355
343
|
else Corpus.find(Path.cwd())
|
|
356
344
|
)
|
|
357
|
-
|
|
358
|
-
if
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
config = {"steps": steps}
|
|
345
|
+
raw_steps = list(arguments.step or [])
|
|
346
|
+
if not raw_steps:
|
|
347
|
+
raise ValueError("Pipeline extraction requires at least one --step")
|
|
348
|
+
steps: List[Dict[str, object]] = []
|
|
349
|
+
for raw_step in raw_steps:
|
|
350
|
+
extractor_id, step_config = _parse_step_spec(raw_step)
|
|
351
|
+
steps.append({"extractor_id": extractor_id, "config": step_config})
|
|
352
|
+
config = {"steps": steps}
|
|
366
353
|
manifest = build_extraction_run(
|
|
367
354
|
corpus,
|
|
368
|
-
extractor_id=
|
|
355
|
+
extractor_id="pipeline",
|
|
369
356
|
recipe_name=arguments.recipe_name,
|
|
370
357
|
config=config,
|
|
371
358
|
)
|
|
@@ -373,6 +360,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
|
|
|
373
360
|
return 0
|
|
374
361
|
|
|
375
362
|
|
|
363
|
+
def cmd_extract_list(arguments: argparse.Namespace) -> int:
|
|
364
|
+
"""
|
|
365
|
+
List extraction runs stored under the corpus.
|
|
366
|
+
|
|
367
|
+
:param arguments: Parsed command-line interface arguments.
|
|
368
|
+
:type arguments: argparse.Namespace
|
|
369
|
+
:return: Exit code.
|
|
370
|
+
:rtype: int
|
|
371
|
+
"""
|
|
372
|
+
corpus = (
|
|
373
|
+
Corpus.open(arguments.corpus)
|
|
374
|
+
if getattr(arguments, "corpus", None)
|
|
375
|
+
else Corpus.find(Path.cwd())
|
|
376
|
+
)
|
|
377
|
+
runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
|
|
378
|
+
print(json.dumps([entry.model_dump() for entry in runs], indent=2))
|
|
379
|
+
return 0
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def cmd_extract_show(arguments: argparse.Namespace) -> int:
|
|
383
|
+
"""
|
|
384
|
+
Show an extraction run manifest.
|
|
385
|
+
|
|
386
|
+
:param arguments: Parsed command-line interface arguments.
|
|
387
|
+
:type arguments: argparse.Namespace
|
|
388
|
+
:return: Exit code.
|
|
389
|
+
:rtype: int
|
|
390
|
+
"""
|
|
391
|
+
corpus = (
|
|
392
|
+
Corpus.open(arguments.corpus)
|
|
393
|
+
if getattr(arguments, "corpus", None)
|
|
394
|
+
else Corpus.find(Path.cwd())
|
|
395
|
+
)
|
|
396
|
+
reference = parse_extraction_run_reference(arguments.run)
|
|
397
|
+
manifest = corpus.load_extraction_run_manifest(
|
|
398
|
+
extractor_id=reference.extractor_id, run_id=reference.run_id
|
|
399
|
+
)
|
|
400
|
+
print(manifest.model_dump_json(indent=2))
|
|
401
|
+
return 0
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def cmd_extract_delete(arguments: argparse.Namespace) -> int:
|
|
405
|
+
"""
|
|
406
|
+
Delete an extraction run directory and its derived artifacts.
|
|
407
|
+
|
|
408
|
+
:param arguments: Parsed command-line interface arguments.
|
|
409
|
+
:type arguments: argparse.Namespace
|
|
410
|
+
:return: Exit code.
|
|
411
|
+
:rtype: int
|
|
412
|
+
"""
|
|
413
|
+
corpus = (
|
|
414
|
+
Corpus.open(arguments.corpus)
|
|
415
|
+
if getattr(arguments, "corpus", None)
|
|
416
|
+
else Corpus.find(Path.cwd())
|
|
417
|
+
)
|
|
418
|
+
if arguments.confirm != arguments.run:
|
|
419
|
+
raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
|
|
420
|
+
reference = parse_extraction_run_reference(arguments.run)
|
|
421
|
+
corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
|
|
422
|
+
print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
|
|
423
|
+
return 0
|
|
424
|
+
|
|
425
|
+
|
|
376
426
|
def cmd_query(arguments: argparse.Namespace) -> int:
|
|
377
427
|
"""
|
|
378
428
|
Execute a retrieval query.
|
|
@@ -382,7 +432,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
|
|
|
382
432
|
:return: Exit code.
|
|
383
433
|
:rtype: int
|
|
384
434
|
"""
|
|
385
|
-
|
|
386
435
|
corpus = (
|
|
387
436
|
Corpus.open(arguments.corpus)
|
|
388
437
|
if getattr(arguments, "corpus", None)
|
|
@@ -413,7 +462,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
|
|
|
413
462
|
:return: Exit code.
|
|
414
463
|
:rtype: int
|
|
415
464
|
"""
|
|
416
|
-
|
|
417
465
|
corpus = (
|
|
418
466
|
Corpus.open(arguments.corpus)
|
|
419
467
|
if getattr(arguments, "corpus", None)
|
|
@@ -430,6 +478,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
|
|
|
430
478
|
return 0
|
|
431
479
|
|
|
432
480
|
|
|
481
|
+
def cmd_crawl(arguments: argparse.Namespace) -> int:
|
|
482
|
+
"""
|
|
483
|
+
Crawl a website prefix into a corpus.
|
|
484
|
+
|
|
485
|
+
:param arguments: Parsed command-line interface arguments.
|
|
486
|
+
:type arguments: argparse.Namespace
|
|
487
|
+
:return: Exit code.
|
|
488
|
+
:rtype: int
|
|
489
|
+
"""
|
|
490
|
+
corpus = (
|
|
491
|
+
Corpus.open(arguments.corpus)
|
|
492
|
+
if getattr(arguments, "corpus", None)
|
|
493
|
+
else Corpus.find(Path.cwd())
|
|
494
|
+
)
|
|
495
|
+
tags = _parse_tags(arguments.tags, arguments.tag)
|
|
496
|
+
request = CrawlRequest(
|
|
497
|
+
root_url=arguments.root_url,
|
|
498
|
+
allowed_prefix=arguments.allowed_prefix,
|
|
499
|
+
max_items=arguments.max_items,
|
|
500
|
+
tags=tags,
|
|
501
|
+
)
|
|
502
|
+
result = crawl_into_corpus(corpus=corpus, request=request)
|
|
503
|
+
print(result.model_dump_json(indent=2))
|
|
504
|
+
return 0
|
|
505
|
+
|
|
506
|
+
|
|
433
507
|
def build_parser() -> argparse.ArgumentParser:
|
|
434
508
|
"""
|
|
435
509
|
Build the command-line interface argument parser.
|
|
@@ -437,7 +511,6 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
437
511
|
:return: Argument parser instance.
|
|
438
512
|
:rtype: argparse.ArgumentParser
|
|
439
513
|
"""
|
|
440
|
-
|
|
441
514
|
parser = argparse.ArgumentParser(
|
|
442
515
|
prog="biblicus",
|
|
443
516
|
description="Biblicus command-line interface (minimum viable product)",
|
|
@@ -457,14 +530,18 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
457
530
|
|
|
458
531
|
p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
|
|
459
532
|
p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
|
|
460
|
-
p_init.add_argument(
|
|
533
|
+
p_init.add_argument(
|
|
534
|
+
"--force", action="store_true", help="Overwrite existing config if present."
|
|
535
|
+
)
|
|
461
536
|
p_init.set_defaults(func=cmd_init)
|
|
462
537
|
|
|
463
538
|
p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
|
|
464
539
|
_add_common_corpus_arg(p_ingest)
|
|
465
540
|
p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
|
|
466
541
|
p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
|
|
467
|
-
p_ingest.add_argument(
|
|
542
|
+
p_ingest.add_argument(
|
|
543
|
+
"--stdin", action="store_true", help="Read text to ingest from standard input."
|
|
544
|
+
)
|
|
468
545
|
p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
|
|
469
546
|
p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
|
|
470
547
|
p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
|
|
@@ -480,18 +557,26 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
480
557
|
p_show.add_argument("id", help="Item identifier (universally unique identifier).")
|
|
481
558
|
p_show.set_defaults(func=cmd_show)
|
|
482
559
|
|
|
483
|
-
p_reindex = sub.add_parser(
|
|
560
|
+
p_reindex = sub.add_parser(
|
|
561
|
+
"reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
|
|
562
|
+
)
|
|
484
563
|
_add_common_corpus_arg(p_reindex)
|
|
485
564
|
p_reindex.set_defaults(func=cmd_reindex)
|
|
486
565
|
|
|
487
566
|
p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
|
|
488
567
|
_add_common_corpus_arg(p_import_tree)
|
|
489
568
|
p_import_tree.add_argument("path", help="Folder tree root to import.")
|
|
490
|
-
p_import_tree.add_argument(
|
|
491
|
-
|
|
569
|
+
p_import_tree.add_argument(
|
|
570
|
+
"--tags", default=None, help="Comma-separated tags to apply to imported items."
|
|
571
|
+
)
|
|
572
|
+
p_import_tree.add_argument(
|
|
573
|
+
"--tag", action="append", help="Repeatable tag to apply to imported items."
|
|
574
|
+
)
|
|
492
575
|
p_import_tree.set_defaults(func=cmd_import_tree)
|
|
493
576
|
|
|
494
|
-
p_purge = sub.add_parser(
|
|
577
|
+
p_purge = sub.add_parser(
|
|
578
|
+
"purge", help="Delete all items and derived files (requires confirmation)."
|
|
579
|
+
)
|
|
495
580
|
_add_common_corpus_arg(p_purge)
|
|
496
581
|
p_purge.add_argument(
|
|
497
582
|
"--confirm",
|
|
@@ -516,27 +601,53 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
516
601
|
)
|
|
517
602
|
p_build.set_defaults(func=cmd_build)
|
|
518
603
|
|
|
519
|
-
p_extract = sub.add_parser("extract", help="
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
604
|
+
p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
|
|
605
|
+
extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
|
|
606
|
+
|
|
607
|
+
p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
|
|
608
|
+
_add_common_corpus_arg(p_extract_build)
|
|
609
|
+
p_extract_build.add_argument(
|
|
610
|
+
"--recipe-name", default="default", help="Human-readable recipe name."
|
|
525
611
|
)
|
|
526
|
-
|
|
527
|
-
p_extract.add_argument(
|
|
612
|
+
p_extract_build.add_argument(
|
|
528
613
|
"--step",
|
|
529
614
|
action="append",
|
|
530
615
|
default=None,
|
|
531
|
-
help="
|
|
616
|
+
help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
|
|
532
617
|
)
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
618
|
+
p_extract_build.set_defaults(func=cmd_extract_build)
|
|
619
|
+
|
|
620
|
+
p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
|
|
621
|
+
_add_common_corpus_arg(p_extract_list)
|
|
622
|
+
p_extract_list.add_argument(
|
|
623
|
+
"--extractor-id",
|
|
536
624
|
default=None,
|
|
537
|
-
help="
|
|
625
|
+
help="Optional extractor identifier filter (for example: pipeline).",
|
|
538
626
|
)
|
|
539
|
-
|
|
627
|
+
p_extract_list.set_defaults(func=cmd_extract_list)
|
|
628
|
+
|
|
629
|
+
p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
|
|
630
|
+
_add_common_corpus_arg(p_extract_show)
|
|
631
|
+
p_extract_show.add_argument(
|
|
632
|
+
"--run",
|
|
633
|
+
required=True,
|
|
634
|
+
help="Extraction run reference in the form extractor_id:run_id.",
|
|
635
|
+
)
|
|
636
|
+
p_extract_show.set_defaults(func=cmd_extract_show)
|
|
637
|
+
|
|
638
|
+
p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
|
|
639
|
+
_add_common_corpus_arg(p_extract_delete)
|
|
640
|
+
p_extract_delete.add_argument(
|
|
641
|
+
"--run",
|
|
642
|
+
required=True,
|
|
643
|
+
help="Extraction run reference in the form extractor_id:run_id.",
|
|
644
|
+
)
|
|
645
|
+
p_extract_delete.add_argument(
|
|
646
|
+
"--confirm",
|
|
647
|
+
required=True,
|
|
648
|
+
help="Type the exact extractor_id:run_id to confirm deletion.",
|
|
649
|
+
)
|
|
650
|
+
p_extract_delete.set_defaults(func=cmd_extract_delete)
|
|
540
651
|
|
|
541
652
|
p_query = sub.add_parser("query", help="Run a retrieval query.")
|
|
542
653
|
_add_common_corpus_arg(p_query)
|
|
@@ -561,6 +672,19 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
561
672
|
p_eval.add_argument("--max-items-per-source", type=int, default=5)
|
|
562
673
|
p_eval.set_defaults(func=cmd_eval)
|
|
563
674
|
|
|
675
|
+
p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
|
|
676
|
+
_add_common_corpus_arg(p_crawl)
|
|
677
|
+
p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
|
|
678
|
+
p_crawl.add_argument(
|
|
679
|
+
"--allowed-prefix",
|
|
680
|
+
required=True,
|
|
681
|
+
help="Uniform resource locator prefix that limits which links are eligible for crawl.",
|
|
682
|
+
)
|
|
683
|
+
p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
|
|
684
|
+
p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
|
|
685
|
+
p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
|
|
686
|
+
p_crawl.set_defaults(func=cmd_crawl)
|
|
687
|
+
|
|
564
688
|
return parser
|
|
565
689
|
|
|
566
690
|
|
|
@@ -573,7 +697,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
573
697
|
:return: Exit code.
|
|
574
698
|
:rtype: int
|
|
575
699
|
"""
|
|
576
|
-
|
|
577
700
|
parser = build_parser()
|
|
578
701
|
arguments = parser.parse_args(argument_list)
|
|
579
702
|
try:
|
|
@@ -583,6 +706,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
583
706
|
FileExistsError,
|
|
584
707
|
KeyError,
|
|
585
708
|
ValueError,
|
|
709
|
+
ExtractionRunFatalError,
|
|
586
710
|
NotImplementedError,
|
|
587
711
|
ValidationError,
|
|
588
712
|
) as exception:
|