biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +96 -13
- biblicus/backends/sqlite_full_text_search.py +74 -14
- biblicus/cli.py +126 -19
- biblicus/constants.py +2 -0
- biblicus/corpus.py +455 -45
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +529 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/models.py +107 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +85 -5
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- biblicus-0.3.0.dist-info/METADATA +336 -0
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus-0.1.1.dist-info/METADATA +0 -174
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,14 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
15
15
|
from ..frontmatter import parse_front_matter
|
|
16
|
-
from ..models import
|
|
16
|
+
from ..models import (
|
|
17
|
+
Evidence,
|
|
18
|
+
ExtractionRunReference,
|
|
19
|
+
QueryBudget,
|
|
20
|
+
RetrievalResult,
|
|
21
|
+
RetrievalRun,
|
|
22
|
+
parse_extraction_run_reference,
|
|
23
|
+
)
|
|
17
24
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
18
25
|
from ..time import utc_now_iso
|
|
19
26
|
|
|
@@ -28,6 +35,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
28
35
|
:vartype chunk_overlap: int
|
|
29
36
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
30
37
|
:vartype snippet_characters: int
|
|
38
|
+
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
39
|
+
:vartype extraction_run: str or None
|
|
31
40
|
"""
|
|
32
41
|
|
|
33
42
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -35,6 +44,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
35
44
|
chunk_size: int = Field(default=800, ge=1)
|
|
36
45
|
chunk_overlap: int = Field(default=200, ge=0)
|
|
37
46
|
snippet_characters: int = Field(default=400, ge=1)
|
|
47
|
+
extraction_run: Optional[str] = None
|
|
38
48
|
|
|
39
49
|
|
|
40
50
|
class SqliteFullTextSearchBackend:
|
|
@@ -47,7 +57,9 @@ class SqliteFullTextSearchBackend:
|
|
|
47
57
|
|
|
48
58
|
backend_id = "sqlite-full-text-search"
|
|
49
59
|
|
|
50
|
-
def build_run(
|
|
60
|
+
def build_run(
|
|
61
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
62
|
+
) -> RetrievalRun:
|
|
51
63
|
"""
|
|
52
64
|
Build a full-text search version five index for the corpus.
|
|
53
65
|
|
|
@@ -60,7 +72,6 @@ class SqliteFullTextSearchBackend:
|
|
|
60
72
|
:return: Run manifest describing the build.
|
|
61
73
|
:rtype: RetrievalRun
|
|
62
74
|
"""
|
|
63
|
-
|
|
64
75
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
|
|
65
76
|
catalog = corpus.load_catalog()
|
|
66
77
|
recipe = create_recipe_manifest(
|
|
@@ -72,11 +83,13 @@ class SqliteFullTextSearchBackend:
|
|
|
72
83
|
db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
|
|
73
84
|
db_path = corpus.root / db_relpath
|
|
74
85
|
corpus.runs_dir.mkdir(parents=True, exist_ok=True)
|
|
86
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
75
87
|
stats = _build_full_text_search_index(
|
|
76
88
|
db_path=db_path,
|
|
77
89
|
corpus=corpus,
|
|
78
90
|
items=catalog.items.values(),
|
|
79
91
|
recipe_config=recipe_config,
|
|
92
|
+
extraction_reference=extraction_reference,
|
|
80
93
|
)
|
|
81
94
|
run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
|
|
82
95
|
corpus.write_run(run)
|
|
@@ -104,7 +117,6 @@ class SqliteFullTextSearchBackend:
|
|
|
104
117
|
:return: Retrieval results containing evidence.
|
|
105
118
|
:rtype: RetrievalResult
|
|
106
119
|
"""
|
|
107
|
-
|
|
108
120
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
|
|
109
121
|
db_path = _resolve_run_db_path(corpus, run)
|
|
110
122
|
candidates = _query_full_text_search_index(
|
|
@@ -150,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
150
162
|
:return: Candidate limit for backend search.
|
|
151
163
|
:rtype: int
|
|
152
164
|
"""
|
|
153
|
-
|
|
154
165
|
return max_total_items * 5
|
|
155
166
|
|
|
156
167
|
|
|
@@ -166,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
|
|
|
166
177
|
:rtype: Path
|
|
167
178
|
:raises FileNotFoundError: If the run does not have artifact paths.
|
|
168
179
|
"""
|
|
169
|
-
|
|
170
180
|
if not run.artifact_paths:
|
|
171
181
|
raise FileNotFoundError("Run has no artifact paths to query")
|
|
172
182
|
return corpus.root / run.artifact_paths[0]
|
|
@@ -182,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
|
|
|
182
192
|
:rtype: None
|
|
183
193
|
:raises RuntimeError: If full-text search version five support is unavailable.
|
|
184
194
|
"""
|
|
185
|
-
|
|
186
195
|
try:
|
|
187
196
|
cursor = conn.execute(
|
|
188
197
|
"CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
|
|
@@ -204,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
|
|
|
204
213
|
:return: None.
|
|
205
214
|
:rtype: None
|
|
206
215
|
"""
|
|
207
|
-
|
|
208
216
|
conn.execute(
|
|
209
217
|
"""
|
|
210
218
|
CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
|
|
@@ -227,6 +235,7 @@ def _build_full_text_search_index(
|
|
|
227
235
|
corpus: Corpus,
|
|
228
236
|
items: Iterable[object],
|
|
229
237
|
recipe_config: SqliteFullTextSearchRecipeConfig,
|
|
238
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
230
239
|
) -> Dict[str, int]:
|
|
231
240
|
"""
|
|
232
241
|
Build a full-text search index from corpus items.
|
|
@@ -242,7 +251,6 @@ def _build_full_text_search_index(
|
|
|
242
251
|
:return: Index statistics.
|
|
243
252
|
:rtype: dict[str, int]
|
|
244
253
|
"""
|
|
245
|
-
|
|
246
254
|
if db_path.exists():
|
|
247
255
|
db_path.unlink()
|
|
248
256
|
connection = sqlite3.connect(str(db_path))
|
|
@@ -256,7 +264,13 @@ def _build_full_text_search_index(
|
|
|
256
264
|
item_count += 1
|
|
257
265
|
media_type = getattr(catalog_item, "media_type", "")
|
|
258
266
|
relpath = getattr(catalog_item, "relpath", "")
|
|
259
|
-
item_text = _load_text_from_item(
|
|
267
|
+
item_text = _load_text_from_item(
|
|
268
|
+
corpus,
|
|
269
|
+
item_id=str(getattr(catalog_item, "id", "")),
|
|
270
|
+
relpath=str(relpath),
|
|
271
|
+
media_type=str(media_type),
|
|
272
|
+
extraction_reference=extraction_reference,
|
|
273
|
+
)
|
|
260
274
|
if item_text is None:
|
|
261
275
|
continue
|
|
262
276
|
text_item_count += 1
|
|
@@ -302,19 +316,38 @@ def _build_full_text_search_index(
|
|
|
302
316
|
connection.close()
|
|
303
317
|
|
|
304
318
|
|
|
305
|
-
def _load_text_from_item(
|
|
319
|
+
def _load_text_from_item(
|
|
320
|
+
corpus: Corpus,
|
|
321
|
+
*,
|
|
322
|
+
item_id: str,
|
|
323
|
+
relpath: str,
|
|
324
|
+
media_type: str,
|
|
325
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
326
|
+
) -> Optional[str]:
|
|
306
327
|
"""
|
|
307
328
|
Load text content from a catalog item.
|
|
308
329
|
|
|
309
330
|
:param corpus: Corpus containing the content.
|
|
310
331
|
:type corpus: Corpus
|
|
332
|
+
:param item_id: Item identifier.
|
|
333
|
+
:type item_id: str
|
|
311
334
|
:param relpath: Relative path to the content.
|
|
312
335
|
:type relpath: str
|
|
313
336
|
:param media_type: Media type for the content.
|
|
314
337
|
:type media_type: str
|
|
338
|
+
:param extraction_reference: Optional extraction run reference.
|
|
339
|
+
:type extraction_reference: ExtractionRunReference or None
|
|
315
340
|
:return: Text payload or None if not text.
|
|
316
341
|
:rtype: str or None
|
|
317
342
|
"""
|
|
343
|
+
if extraction_reference:
|
|
344
|
+
extracted_text = corpus.read_extracted_text(
|
|
345
|
+
extractor_id=extraction_reference.extractor_id,
|
|
346
|
+
run_id=extraction_reference.run_id,
|
|
347
|
+
item_id=item_id,
|
|
348
|
+
)
|
|
349
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
350
|
+
return extracted_text
|
|
318
351
|
|
|
319
352
|
content_path = corpus.root / relpath
|
|
320
353
|
raw_bytes = content_path.read_bytes()
|
|
@@ -327,7 +360,36 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optio
|
|
|
327
360
|
return None
|
|
328
361
|
|
|
329
362
|
|
|
330
|
-
def
|
|
363
|
+
def _resolve_extraction_reference(
|
|
364
|
+
corpus: Corpus,
|
|
365
|
+
recipe_config: SqliteFullTextSearchRecipeConfig,
|
|
366
|
+
) -> Optional[ExtractionRunReference]:
|
|
367
|
+
"""
|
|
368
|
+
Resolve an extraction run reference from a recipe config.
|
|
369
|
+
|
|
370
|
+
:param corpus: Corpus associated with the recipe.
|
|
371
|
+
:type corpus: Corpus
|
|
372
|
+
:param recipe_config: Parsed backend recipe configuration.
|
|
373
|
+
:type recipe_config: SqliteFullTextSearchRecipeConfig
|
|
374
|
+
:return: Parsed extraction reference or None.
|
|
375
|
+
:rtype: ExtractionRunReference or None
|
|
376
|
+
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
377
|
+
"""
|
|
378
|
+
if not recipe_config.extraction_run:
|
|
379
|
+
return None
|
|
380
|
+
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
381
|
+
run_dir = corpus.extraction_run_dir(
|
|
382
|
+
extractor_id=extraction_reference.extractor_id,
|
|
383
|
+
run_id=extraction_reference.run_id,
|
|
384
|
+
)
|
|
385
|
+
if not run_dir.is_dir():
|
|
386
|
+
raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
|
|
387
|
+
return extraction_reference
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _iter_chunks(
|
|
391
|
+
text: str, *, chunk_size: int, chunk_overlap: int
|
|
392
|
+
) -> Iterable[Tuple[int, int, str]]:
|
|
331
393
|
"""
|
|
332
394
|
Yield overlapping chunks of text for indexing.
|
|
333
395
|
|
|
@@ -341,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
|
|
|
341
403
|
:rtype: Iterable[tuple[int, int, str]]
|
|
342
404
|
:raises ValueError: If the overlap is greater than or equal to the chunk size.
|
|
343
405
|
"""
|
|
344
|
-
|
|
345
406
|
if chunk_overlap >= chunk_size:
|
|
346
407
|
raise ValueError("chunk_overlap must be smaller than chunk_size")
|
|
347
408
|
start_offset = 0
|
|
@@ -374,7 +435,6 @@ def _query_full_text_search_index(
|
|
|
374
435
|
:return: Evidence candidates.
|
|
375
436
|
:rtype: list[Evidence]
|
|
376
437
|
"""
|
|
377
|
-
|
|
378
438
|
connection = sqlite3.connect(str(db_path))
|
|
379
439
|
try:
|
|
380
440
|
rows = connection.execute(
|
biblicus/cli.py
CHANGED
|
@@ -14,7 +14,9 @@ from pydantic import ValidationError
|
|
|
14
14
|
|
|
15
15
|
from .backends import get_backend
|
|
16
16
|
from .corpus import Corpus
|
|
17
|
+
from .errors import ExtractionRunFatalError
|
|
17
18
|
from .evaluation import evaluate_run, load_dataset
|
|
19
|
+
from .extraction import build_extraction_run
|
|
18
20
|
from .models import QueryBudget
|
|
19
21
|
from .uris import corpus_ref_to_path
|
|
20
22
|
|
|
@@ -28,7 +30,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
|
|
|
28
30
|
:return: None.
|
|
29
31
|
:rtype: None
|
|
30
32
|
"""
|
|
31
|
-
|
|
32
33
|
parser.add_argument(
|
|
33
34
|
"--corpus",
|
|
34
35
|
type=str,
|
|
@@ -50,7 +51,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
|
|
|
50
51
|
:return: Exit code.
|
|
51
52
|
:rtype: int
|
|
52
53
|
"""
|
|
53
|
-
|
|
54
54
|
corpus_path = corpus_ref_to_path(arguments.path)
|
|
55
55
|
corpus = Corpus.init(corpus_path, force=arguments.force)
|
|
56
56
|
print(f"Initialized corpus at {corpus.root}")
|
|
@@ -68,7 +68,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
|
|
|
68
68
|
:return: Deduplicated tag list.
|
|
69
69
|
:rtype: list[str]
|
|
70
70
|
"""
|
|
71
|
-
|
|
72
71
|
parsed_tags: List[str] = []
|
|
73
72
|
if raw:
|
|
74
73
|
parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
|
|
@@ -93,7 +92,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
|
|
|
93
92
|
:return: Exit code.
|
|
94
93
|
:rtype: int
|
|
95
94
|
"""
|
|
96
|
-
|
|
97
95
|
corpus = (
|
|
98
96
|
Corpus.open(arguments.corpus)
|
|
99
97
|
if getattr(arguments, "corpus", None)
|
|
@@ -134,7 +132,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
|
|
|
134
132
|
:return: Exit code.
|
|
135
133
|
:rtype: int
|
|
136
134
|
"""
|
|
137
|
-
|
|
138
135
|
corpus = (
|
|
139
136
|
Corpus.open(arguments.corpus)
|
|
140
137
|
if getattr(arguments, "corpus", None)
|
|
@@ -156,7 +153,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
|
|
|
156
153
|
:return: Exit code.
|
|
157
154
|
:rtype: int
|
|
158
155
|
"""
|
|
159
|
-
|
|
160
156
|
corpus = (
|
|
161
157
|
Corpus.open(arguments.corpus)
|
|
162
158
|
if getattr(arguments, "corpus", None)
|
|
@@ -176,7 +172,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
|
|
|
176
172
|
:return: Exit code.
|
|
177
173
|
:rtype: int
|
|
178
174
|
"""
|
|
179
|
-
|
|
180
175
|
corpus = (
|
|
181
176
|
Corpus.open(arguments.corpus)
|
|
182
177
|
if getattr(arguments, "corpus", None)
|
|
@@ -187,6 +182,26 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
|
|
|
187
182
|
return 0
|
|
188
183
|
|
|
189
184
|
|
|
185
|
+
def cmd_import_tree(arguments: argparse.Namespace) -> int:
|
|
186
|
+
"""
|
|
187
|
+
Import a folder tree into a corpus.
|
|
188
|
+
|
|
189
|
+
:param arguments: Parsed command-line interface arguments.
|
|
190
|
+
:type arguments: argparse.Namespace
|
|
191
|
+
:return: Exit code.
|
|
192
|
+
:rtype: int
|
|
193
|
+
"""
|
|
194
|
+
corpus = (
|
|
195
|
+
Corpus.open(arguments.corpus)
|
|
196
|
+
if getattr(arguments, "corpus", None)
|
|
197
|
+
else Corpus.find(Path.cwd())
|
|
198
|
+
)
|
|
199
|
+
tags = _parse_tags(arguments.tags, arguments.tag)
|
|
200
|
+
stats = corpus.import_tree(Path(arguments.path), tags=tags)
|
|
201
|
+
print(json.dumps(stats, indent=2, sort_keys=False))
|
|
202
|
+
return 0
|
|
203
|
+
|
|
204
|
+
|
|
190
205
|
def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
191
206
|
"""
|
|
192
207
|
Purge all items and derived artifacts from a corpus.
|
|
@@ -196,7 +211,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
|
196
211
|
:return: Exit code.
|
|
197
212
|
:rtype: int
|
|
198
213
|
"""
|
|
199
|
-
|
|
200
214
|
corpus = (
|
|
201
215
|
Corpus.open(arguments.corpus)
|
|
202
216
|
if getattr(arguments, "corpus", None)
|
|
@@ -219,7 +233,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
219
233
|
:rtype: dict[str, object]
|
|
220
234
|
:raises ValueError: If any entry is not key=value.
|
|
221
235
|
"""
|
|
222
|
-
|
|
223
236
|
config: Dict[str, object] = {}
|
|
224
237
|
for item in pairs or []:
|
|
225
238
|
if "=" not in item:
|
|
@@ -240,6 +253,43 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
240
253
|
return config
|
|
241
254
|
|
|
242
255
|
|
|
256
|
+
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
257
|
+
"""
|
|
258
|
+
Parse a pipeline step specification.
|
|
259
|
+
|
|
260
|
+
:param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
|
|
261
|
+
:type raw_step: str
|
|
262
|
+
:return: Tuple of extractor_id and config mapping.
|
|
263
|
+
:rtype: tuple[str, dict[str, object]]
|
|
264
|
+
:raises ValueError: If the step spec is invalid.
|
|
265
|
+
"""
|
|
266
|
+
raw_step = raw_step.strip()
|
|
267
|
+
if not raw_step:
|
|
268
|
+
raise ValueError("Step spec must be non-empty")
|
|
269
|
+
if ":" not in raw_step:
|
|
270
|
+
return raw_step, {}
|
|
271
|
+
extractor_id, raw_pairs = raw_step.split(":", 1)
|
|
272
|
+
extractor_id = extractor_id.strip()
|
|
273
|
+
if not extractor_id:
|
|
274
|
+
raise ValueError("Step spec must start with an extractor identifier")
|
|
275
|
+
config: Dict[str, object] = {}
|
|
276
|
+
raw_pairs = raw_pairs.strip()
|
|
277
|
+
if not raw_pairs:
|
|
278
|
+
return extractor_id, {}
|
|
279
|
+
for token in raw_pairs.split(","):
|
|
280
|
+
token = token.strip()
|
|
281
|
+
if not token:
|
|
282
|
+
continue
|
|
283
|
+
if "=" not in token:
|
|
284
|
+
raise ValueError(f"Step config values must be key=value (got {token!r})")
|
|
285
|
+
key, value = token.split("=", 1)
|
|
286
|
+
key = key.strip()
|
|
287
|
+
if not key:
|
|
288
|
+
raise ValueError("Step config keys must be non-empty")
|
|
289
|
+
config[key] = value
|
|
290
|
+
return extractor_id, config
|
|
291
|
+
|
|
292
|
+
|
|
243
293
|
def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
244
294
|
"""
|
|
245
295
|
Build a QueryBudget from command-line interface arguments.
|
|
@@ -249,7 +299,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
249
299
|
:return: Query budget instance.
|
|
250
300
|
:rtype: QueryBudget
|
|
251
301
|
"""
|
|
252
|
-
|
|
253
302
|
return QueryBudget(
|
|
254
303
|
max_total_items=arguments.max_total_items,
|
|
255
304
|
max_total_characters=arguments.max_total_characters,
|
|
@@ -266,7 +315,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
266
315
|
:return: Exit code.
|
|
267
316
|
:rtype: int
|
|
268
317
|
"""
|
|
269
|
-
|
|
270
318
|
corpus = (
|
|
271
319
|
Corpus.open(arguments.corpus)
|
|
272
320
|
if getattr(arguments, "corpus", None)
|
|
@@ -279,6 +327,38 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
279
327
|
return 0
|
|
280
328
|
|
|
281
329
|
|
|
330
|
+
def cmd_extract(arguments: argparse.Namespace) -> int:
|
|
331
|
+
"""
|
|
332
|
+
Build a text extraction run for the corpus using a pipeline of extractors.
|
|
333
|
+
|
|
334
|
+
:param arguments: Parsed command-line interface arguments.
|
|
335
|
+
:type arguments: argparse.Namespace
|
|
336
|
+
:return: Exit code.
|
|
337
|
+
:rtype: int
|
|
338
|
+
"""
|
|
339
|
+
corpus = (
|
|
340
|
+
Corpus.open(arguments.corpus)
|
|
341
|
+
if getattr(arguments, "corpus", None)
|
|
342
|
+
else Corpus.find(Path.cwd())
|
|
343
|
+
)
|
|
344
|
+
raw_steps = list(arguments.step or [])
|
|
345
|
+
if not raw_steps:
|
|
346
|
+
raise ValueError("Pipeline extraction requires at least one --step")
|
|
347
|
+
steps: List[Dict[str, object]] = []
|
|
348
|
+
for raw_step in raw_steps:
|
|
349
|
+
extractor_id, step_config = _parse_step_spec(raw_step)
|
|
350
|
+
steps.append({"extractor_id": extractor_id, "config": step_config})
|
|
351
|
+
config = {"steps": steps}
|
|
352
|
+
manifest = build_extraction_run(
|
|
353
|
+
corpus,
|
|
354
|
+
extractor_id="pipeline",
|
|
355
|
+
recipe_name=arguments.recipe_name,
|
|
356
|
+
config=config,
|
|
357
|
+
)
|
|
358
|
+
print(manifest.model_dump_json(indent=2))
|
|
359
|
+
return 0
|
|
360
|
+
|
|
361
|
+
|
|
282
362
|
def cmd_query(arguments: argparse.Namespace) -> int:
|
|
283
363
|
"""
|
|
284
364
|
Execute a retrieval query.
|
|
@@ -288,7 +368,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
|
|
|
288
368
|
:return: Exit code.
|
|
289
369
|
:rtype: int
|
|
290
370
|
"""
|
|
291
|
-
|
|
292
371
|
corpus = (
|
|
293
372
|
Corpus.open(arguments.corpus)
|
|
294
373
|
if getattr(arguments, "corpus", None)
|
|
@@ -319,7 +398,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
|
|
|
319
398
|
:return: Exit code.
|
|
320
399
|
:rtype: int
|
|
321
400
|
"""
|
|
322
|
-
|
|
323
401
|
corpus = (
|
|
324
402
|
Corpus.open(arguments.corpus)
|
|
325
403
|
if getattr(arguments, "corpus", None)
|
|
@@ -343,7 +421,6 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
343
421
|
:return: Argument parser instance.
|
|
344
422
|
:rtype: argparse.ArgumentParser
|
|
345
423
|
"""
|
|
346
|
-
|
|
347
424
|
parser = argparse.ArgumentParser(
|
|
348
425
|
prog="biblicus",
|
|
349
426
|
description="Biblicus command-line interface (minimum viable product)",
|
|
@@ -363,14 +440,18 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
363
440
|
|
|
364
441
|
p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
|
|
365
442
|
p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
|
|
366
|
-
p_init.add_argument(
|
|
443
|
+
p_init.add_argument(
|
|
444
|
+
"--force", action="store_true", help="Overwrite existing config if present."
|
|
445
|
+
)
|
|
367
446
|
p_init.set_defaults(func=cmd_init)
|
|
368
447
|
|
|
369
448
|
p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
|
|
370
449
|
_add_common_corpus_arg(p_ingest)
|
|
371
450
|
p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
|
|
372
451
|
p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
|
|
373
|
-
p_ingest.add_argument(
|
|
452
|
+
p_ingest.add_argument(
|
|
453
|
+
"--stdin", action="store_true", help="Read text to ingest from standard input."
|
|
454
|
+
)
|
|
374
455
|
p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
|
|
375
456
|
p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
|
|
376
457
|
p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
|
|
@@ -386,11 +467,26 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
386
467
|
p_show.add_argument("id", help="Item identifier (universally unique identifier).")
|
|
387
468
|
p_show.set_defaults(func=cmd_show)
|
|
388
469
|
|
|
389
|
-
p_reindex = sub.add_parser(
|
|
470
|
+
p_reindex = sub.add_parser(
|
|
471
|
+
"reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
|
|
472
|
+
)
|
|
390
473
|
_add_common_corpus_arg(p_reindex)
|
|
391
474
|
p_reindex.set_defaults(func=cmd_reindex)
|
|
392
475
|
|
|
393
|
-
|
|
476
|
+
p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
|
|
477
|
+
_add_common_corpus_arg(p_import_tree)
|
|
478
|
+
p_import_tree.add_argument("path", help="Folder tree root to import.")
|
|
479
|
+
p_import_tree.add_argument(
|
|
480
|
+
"--tags", default=None, help="Comma-separated tags to apply to imported items."
|
|
481
|
+
)
|
|
482
|
+
p_import_tree.add_argument(
|
|
483
|
+
"--tag", action="append", help="Repeatable tag to apply to imported items."
|
|
484
|
+
)
|
|
485
|
+
p_import_tree.set_defaults(func=cmd_import_tree)
|
|
486
|
+
|
|
487
|
+
p_purge = sub.add_parser(
|
|
488
|
+
"purge", help="Delete all items and derived files (requires confirmation)."
|
|
489
|
+
)
|
|
394
490
|
_add_common_corpus_arg(p_purge)
|
|
395
491
|
p_purge.add_argument(
|
|
396
492
|
"--confirm",
|
|
@@ -415,6 +511,17 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
415
511
|
)
|
|
416
512
|
p_build.set_defaults(func=cmd_build)
|
|
417
513
|
|
|
514
|
+
p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
|
|
515
|
+
_add_common_corpus_arg(p_extract)
|
|
516
|
+
p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
|
|
517
|
+
p_extract.add_argument(
|
|
518
|
+
"--step",
|
|
519
|
+
action="append",
|
|
520
|
+
default=None,
|
|
521
|
+
help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
|
|
522
|
+
)
|
|
523
|
+
p_extract.set_defaults(func=cmd_extract)
|
|
524
|
+
|
|
418
525
|
p_query = sub.add_parser("query", help="Run a retrieval query.")
|
|
419
526
|
_add_common_corpus_arg(p_query)
|
|
420
527
|
p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
|
|
@@ -450,7 +557,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
450
557
|
:return: Exit code.
|
|
451
558
|
:rtype: int
|
|
452
559
|
"""
|
|
453
|
-
|
|
454
560
|
parser = build_parser()
|
|
455
561
|
arguments = parser.parse_args(argument_list)
|
|
456
562
|
try:
|
|
@@ -460,6 +566,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
|
|
|
460
566
|
FileExistsError,
|
|
461
567
|
KeyError,
|
|
462
568
|
ValueError,
|
|
569
|
+
ExtractionRunFatalError,
|
|
463
570
|
NotImplementedError,
|
|
464
571
|
ValidationError,
|
|
465
572
|
) as exception:
|