biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/backends/scan.py +81 -4
- biblicus/backends/sqlite_full_text_search.py +63 -2
- biblicus/cli.py +123 -0
- biblicus/constants.py +2 -0
- biblicus/corpus.py +431 -2
- biblicus/extraction.py +330 -0
- biblicus/extractors/__init__.py +33 -0
- biblicus/extractors/base.py +61 -0
- biblicus/extractors/cascade.py +101 -0
- biblicus/extractors/metadata_text.py +98 -0
- biblicus/extractors/pass_through_text.py +74 -0
- biblicus/hook_logging.py +185 -0
- biblicus/hook_manager.py +205 -0
- biblicus/hooks.py +265 -0
- biblicus/ignore.py +67 -0
- biblicus/models.py +20 -0
- biblicus/sources.py +45 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/METADATA +101 -1
- biblicus-0.2.0.dist-info/RECORD +32 -0
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/backends/scan.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Optional, Tuple
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
10
10
|
|
|
11
11
|
from ..corpus import Corpus
|
|
12
|
+
from ..extraction import ExtractionRunReference, parse_extraction_run_reference
|
|
12
13
|
from ..frontmatter import parse_front_matter
|
|
13
14
|
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
|
|
14
15
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
@@ -21,11 +22,14 @@ class ScanRecipeConfig(BaseModel):
|
|
|
21
22
|
|
|
22
23
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
23
24
|
:vartype snippet_characters: int
|
|
25
|
+
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
26
|
+
:vartype extraction_run: str or None
|
|
24
27
|
"""
|
|
25
28
|
|
|
26
29
|
model_config = ConfigDict(extra="forbid")
|
|
27
30
|
|
|
28
31
|
snippet_characters: int = Field(default=400, ge=1)
|
|
32
|
+
extraction_run: Optional[str] = None
|
|
29
33
|
|
|
30
34
|
|
|
31
35
|
class ScanBackend:
|
|
@@ -59,7 +63,7 @@ class ScanBackend:
|
|
|
59
63
|
name=recipe_name,
|
|
60
64
|
config=recipe_config.model_dump(),
|
|
61
65
|
)
|
|
62
|
-
stats = {"items": len(catalog.items), "text_items": _count_text_items(catalog.items.values())}
|
|
66
|
+
stats = {"items": len(catalog.items), "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config)}
|
|
63
67
|
run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
|
|
64
68
|
corpus.write_run(run)
|
|
65
69
|
return run
|
|
@@ -89,12 +93,14 @@ class ScanBackend:
|
|
|
89
93
|
|
|
90
94
|
recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
|
|
91
95
|
catalog = corpus.load_catalog()
|
|
96
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
92
97
|
query_tokens = _tokenize_query(query_text)
|
|
93
98
|
scored_candidates = _score_items(
|
|
94
99
|
corpus,
|
|
95
100
|
catalog.items.values(),
|
|
96
101
|
query_tokens,
|
|
97
102
|
recipe_config.snippet_characters,
|
|
103
|
+
extraction_reference=extraction_reference,
|
|
98
104
|
)
|
|
99
105
|
sorted_candidates = sorted(
|
|
100
106
|
scored_candidates,
|
|
@@ -124,18 +130,60 @@ class ScanBackend:
|
|
|
124
130
|
)
|
|
125
131
|
|
|
126
132
|
|
|
127
|
-
def _count_text_items(items: Iterable[object]) -> int:
|
|
133
|
+
def _resolve_extraction_reference(corpus: Corpus, recipe_config: ScanRecipeConfig) -> Optional[ExtractionRunReference]:
|
|
134
|
+
"""
|
|
135
|
+
Resolve an extraction run reference from a recipe config.
|
|
136
|
+
|
|
137
|
+
:param corpus: Corpus associated with the recipe.
|
|
138
|
+
:type corpus: Corpus
|
|
139
|
+
:param recipe_config: Parsed scan recipe configuration.
|
|
140
|
+
:type recipe_config: ScanRecipeConfig
|
|
141
|
+
:return: Parsed extraction reference or None.
|
|
142
|
+
:rtype: ExtractionRunReference or None
|
|
143
|
+
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
144
|
+
"""
|
|
145
|
+
|
|
146
|
+
if not recipe_config.extraction_run:
|
|
147
|
+
return None
|
|
148
|
+
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
149
|
+
run_dir = corpus.extraction_run_dir(
|
|
150
|
+
extractor_id=extraction_reference.extractor_id,
|
|
151
|
+
run_id=extraction_reference.run_id,
|
|
152
|
+
)
|
|
153
|
+
if not run_dir.is_dir():
|
|
154
|
+
raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
|
|
155
|
+
return extraction_reference
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _count_text_items(corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig) -> int:
|
|
128
159
|
"""
|
|
129
160
|
Count catalog items that represent text content.
|
|
130
161
|
|
|
162
|
+
When an extraction run is configured, extracted artifacts are treated as text.
|
|
163
|
+
|
|
164
|
+
:param corpus: Corpus containing the items.
|
|
165
|
+
:type corpus: Corpus
|
|
131
166
|
:param items: Catalog items to inspect.
|
|
132
167
|
:type items: Iterable[object]
|
|
168
|
+
:param recipe_config: Parsed scan recipe configuration.
|
|
169
|
+
:type recipe_config: ScanRecipeConfig
|
|
133
170
|
:return: Number of text items.
|
|
134
171
|
:rtype: int
|
|
135
172
|
"""
|
|
136
173
|
|
|
137
174
|
text_item_count = 0
|
|
175
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
138
176
|
for catalog_item in items:
|
|
177
|
+
item_id = str(getattr(catalog_item, "id", ""))
|
|
178
|
+
if extraction_reference and item_id:
|
|
179
|
+
extracted_text = corpus.read_extracted_text(
|
|
180
|
+
extractor_id=extraction_reference.extractor_id,
|
|
181
|
+
run_id=extraction_reference.run_id,
|
|
182
|
+
item_id=item_id,
|
|
183
|
+
)
|
|
184
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
185
|
+
text_item_count += 1
|
|
186
|
+
continue
|
|
139
187
|
media_type = getattr(catalog_item, "media_type", "")
|
|
140
188
|
if media_type == "text/markdown" or str(media_type).startswith("text/"):
|
|
141
189
|
text_item_count += 1
|
|
@@ -155,20 +203,40 @@ def _tokenize_query(query_text: str) -> List[str]:
|
|
|
155
203
|
return [token for token in query_text.lower().split() if token]
|
|
156
204
|
|
|
157
205
|
|
|
158
|
-
def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
|
|
206
|
+
def _load_text_from_item(
|
|
207
|
+
corpus: Corpus,
|
|
208
|
+
*,
|
|
209
|
+
item_id: str,
|
|
210
|
+
relpath: str,
|
|
211
|
+
media_type: str,
|
|
212
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
213
|
+
) -> Optional[str]:
|
|
159
214
|
"""
|
|
160
215
|
Load a text payload from a catalog item.
|
|
161
216
|
|
|
162
217
|
:param corpus: Corpus containing the item.
|
|
163
218
|
:type corpus: Corpus
|
|
219
|
+
:param item_id: Item identifier.
|
|
220
|
+
:type item_id: str
|
|
164
221
|
:param relpath: Relative path to the stored content.
|
|
165
222
|
:type relpath: str
|
|
166
223
|
:param media_type: Media type for the stored content.
|
|
167
224
|
:type media_type: str
|
|
225
|
+
:param extraction_reference: Optional extraction run reference.
|
|
226
|
+
:type extraction_reference: ExtractionRunReference or None
|
|
168
227
|
:return: Text payload or None if not decodable as text.
|
|
169
228
|
:rtype: str or None
|
|
170
229
|
"""
|
|
171
230
|
|
|
231
|
+
if extraction_reference:
|
|
232
|
+
extracted_text = corpus.read_extracted_text(
|
|
233
|
+
extractor_id=extraction_reference.extractor_id,
|
|
234
|
+
run_id=extraction_reference.run_id,
|
|
235
|
+
item_id=item_id,
|
|
236
|
+
)
|
|
237
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
238
|
+
return extracted_text
|
|
239
|
+
|
|
172
240
|
content_path = corpus.root / relpath
|
|
173
241
|
raw_bytes = content_path.read_bytes()
|
|
174
242
|
if media_type == "text/markdown":
|
|
@@ -240,6 +308,8 @@ def _score_items(
|
|
|
240
308
|
items: Iterable[object],
|
|
241
309
|
tokens: List[str],
|
|
242
310
|
snippet_characters: int,
|
|
311
|
+
*,
|
|
312
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
243
313
|
) -> List[Evidence]:
|
|
244
314
|
"""
|
|
245
315
|
Score catalog items by token frequency and return evidence candidates.
|
|
@@ -260,7 +330,14 @@ def _score_items(
|
|
|
260
330
|
for catalog_item in items:
|
|
261
331
|
media_type = getattr(catalog_item, "media_type", "")
|
|
262
332
|
relpath = getattr(catalog_item, "relpath", "")
|
|
263
|
-
|
|
333
|
+
item_id = str(getattr(catalog_item, "id", ""))
|
|
334
|
+
item_text = _load_text_from_item(
|
|
335
|
+
corpus,
|
|
336
|
+
item_id=item_id,
|
|
337
|
+
relpath=relpath,
|
|
338
|
+
media_type=str(media_type),
|
|
339
|
+
extraction_reference=extraction_reference,
|
|
340
|
+
)
|
|
264
341
|
if item_text is None:
|
|
265
342
|
continue
|
|
266
343
|
lower_text = item_text.lower()
|
biblicus/backends/sqlite_full_text_search.py
CHANGED
|
@@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
12
12
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
+
from ..extraction import ExtractionRunReference, parse_extraction_run_reference
|
|
15
16
|
from ..frontmatter import parse_front_matter
|
|
16
17
|
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
|
|
17
18
|
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
@@ -28,6 +29,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
28
29
|
:vartype chunk_overlap: int
|
|
29
30
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
30
31
|
:vartype snippet_characters: int
|
|
32
|
+
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
33
|
+
:vartype extraction_run: str or None
|
|
31
34
|
"""
|
|
32
35
|
|
|
33
36
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -35,6 +38,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
35
38
|
chunk_size: int = Field(default=800, ge=1)
|
|
36
39
|
chunk_overlap: int = Field(default=200, ge=0)
|
|
37
40
|
snippet_characters: int = Field(default=400, ge=1)
|
|
41
|
+
extraction_run: Optional[str] = None
|
|
38
42
|
|
|
39
43
|
|
|
40
44
|
class SqliteFullTextSearchBackend:
|
|
@@ -72,11 +76,13 @@ class SqliteFullTextSearchBackend:
|
|
|
72
76
|
db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
|
|
73
77
|
db_path = corpus.root / db_relpath
|
|
74
78
|
corpus.runs_dir.mkdir(parents=True, exist_ok=True)
|
|
79
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
75
80
|
stats = _build_full_text_search_index(
|
|
76
81
|
db_path=db_path,
|
|
77
82
|
corpus=corpus,
|
|
78
83
|
items=catalog.items.values(),
|
|
79
84
|
recipe_config=recipe_config,
|
|
85
|
+
extraction_reference=extraction_reference,
|
|
80
86
|
)
|
|
81
87
|
run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
|
|
82
88
|
corpus.write_run(run)
|
|
@@ -227,6 +233,7 @@ def _build_full_text_search_index(
|
|
|
227
233
|
corpus: Corpus,
|
|
228
234
|
items: Iterable[object],
|
|
229
235
|
recipe_config: SqliteFullTextSearchRecipeConfig,
|
|
236
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
230
237
|
) -> Dict[str, int]:
|
|
231
238
|
"""
|
|
232
239
|
Build a full-text search index from corpus items.
|
|
@@ -256,7 +263,13 @@ def _build_full_text_search_index(
|
|
|
256
263
|
item_count += 1
|
|
257
264
|
media_type = getattr(catalog_item, "media_type", "")
|
|
258
265
|
relpath = getattr(catalog_item, "relpath", "")
|
|
259
|
-
item_text = _load_text_from_item(
|
|
266
|
+
item_text = _load_text_from_item(
|
|
267
|
+
corpus,
|
|
268
|
+
item_id=str(getattr(catalog_item, "id", "")),
|
|
269
|
+
relpath=str(relpath),
|
|
270
|
+
media_type=str(media_type),
|
|
271
|
+
extraction_reference=extraction_reference,
|
|
272
|
+
)
|
|
260
273
|
if item_text is None:
|
|
261
274
|
continue
|
|
262
275
|
text_item_count += 1
|
|
@@ -302,20 +315,40 @@ def _build_full_text_search_index(
|
|
|
302
315
|
connection.close()
|
|
303
316
|
|
|
304
317
|
|
|
305
|
-
def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
|
|
318
|
+
def _load_text_from_item(
|
|
319
|
+
corpus: Corpus,
|
|
320
|
+
*,
|
|
321
|
+
item_id: str,
|
|
322
|
+
relpath: str,
|
|
323
|
+
media_type: str,
|
|
324
|
+
extraction_reference: Optional[ExtractionRunReference],
|
|
325
|
+
) -> Optional[str]:
|
|
306
326
|
"""
|
|
307
327
|
Load text content from a catalog item.
|
|
308
328
|
|
|
309
329
|
:param corpus: Corpus containing the content.
|
|
310
330
|
:type corpus: Corpus
|
|
331
|
+
:param item_id: Item identifier.
|
|
332
|
+
:type item_id: str
|
|
311
333
|
:param relpath: Relative path to the content.
|
|
312
334
|
:type relpath: str
|
|
313
335
|
:param media_type: Media type for the content.
|
|
314
336
|
:type media_type: str
|
|
337
|
+
:param extraction_reference: Optional extraction run reference.
|
|
338
|
+
:type extraction_reference: ExtractionRunReference or None
|
|
315
339
|
:return: Text payload or None if not text.
|
|
316
340
|
:rtype: str or None
|
|
317
341
|
"""
|
|
318
342
|
|
|
343
|
+
if extraction_reference:
|
|
344
|
+
extracted_text = corpus.read_extracted_text(
|
|
345
|
+
extractor_id=extraction_reference.extractor_id,
|
|
346
|
+
run_id=extraction_reference.run_id,
|
|
347
|
+
item_id=item_id,
|
|
348
|
+
)
|
|
349
|
+
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
350
|
+
return extracted_text
|
|
351
|
+
|
|
319
352
|
content_path = corpus.root / relpath
|
|
320
353
|
raw_bytes = content_path.read_bytes()
|
|
321
354
|
if media_type == "text/markdown":
|
|
@@ -327,6 +360,34 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optio
|
|
|
327
360
|
return None
|
|
328
361
|
|
|
329
362
|
|
|
363
|
+
def _resolve_extraction_reference(
|
|
364
|
+
corpus: Corpus,
|
|
365
|
+
recipe_config: SqliteFullTextSearchRecipeConfig,
|
|
366
|
+
) -> Optional[ExtractionRunReference]:
|
|
367
|
+
"""
|
|
368
|
+
Resolve an extraction run reference from a recipe config.
|
|
369
|
+
|
|
370
|
+
:param corpus: Corpus associated with the recipe.
|
|
371
|
+
:type corpus: Corpus
|
|
372
|
+
:param recipe_config: Parsed backend recipe configuration.
|
|
373
|
+
:type recipe_config: SqliteFullTextSearchRecipeConfig
|
|
374
|
+
:return: Parsed extraction reference or None.
|
|
375
|
+
:rtype: ExtractionRunReference or None
|
|
376
|
+
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
377
|
+
"""
|
|
378
|
+
|
|
379
|
+
if not recipe_config.extraction_run:
|
|
380
|
+
return None
|
|
381
|
+
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
382
|
+
run_dir = corpus.extraction_run_dir(
|
|
383
|
+
extractor_id=extraction_reference.extractor_id,
|
|
384
|
+
run_id=extraction_reference.run_id,
|
|
385
|
+
)
|
|
386
|
+
if not run_dir.is_dir():
|
|
387
|
+
raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
|
|
388
|
+
return extraction_reference
|
|
389
|
+
|
|
390
|
+
|
|
330
391
|
def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
|
|
331
392
|
"""
|
|
332
393
|
Yield overlapping chunks of text for indexing.
|
biblicus/cli.py
CHANGED
|
@@ -14,6 +14,7 @@ from pydantic import ValidationError
|
|
|
14
14
|
|
|
15
15
|
from .backends import get_backend
|
|
16
16
|
from .corpus import Corpus
|
|
17
|
+
from .extraction import build_extraction_run
|
|
17
18
|
from .evaluation import evaluate_run, load_dataset
|
|
18
19
|
from .models import QueryBudget
|
|
19
20
|
from .uris import corpus_ref_to_path
|
|
@@ -187,6 +188,27 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
|
|
|
187
188
|
return 0
|
|
188
189
|
|
|
189
190
|
|
|
191
|
+
def cmd_import_tree(arguments: argparse.Namespace) -> int:
|
|
192
|
+
"""
|
|
193
|
+
Import a folder tree into a corpus.
|
|
194
|
+
|
|
195
|
+
:param arguments: Parsed command-line interface arguments.
|
|
196
|
+
:type arguments: argparse.Namespace
|
|
197
|
+
:return: Exit code.
|
|
198
|
+
:rtype: int
|
|
199
|
+
"""
|
|
200
|
+
|
|
201
|
+
corpus = (
|
|
202
|
+
Corpus.open(arguments.corpus)
|
|
203
|
+
if getattr(arguments, "corpus", None)
|
|
204
|
+
else Corpus.find(Path.cwd())
|
|
205
|
+
)
|
|
206
|
+
tags = _parse_tags(arguments.tags, arguments.tag)
|
|
207
|
+
stats = corpus.import_tree(Path(arguments.path), tags=tags)
|
|
208
|
+
print(json.dumps(stats, indent=2, sort_keys=False))
|
|
209
|
+
return 0
|
|
210
|
+
|
|
211
|
+
|
|
190
212
|
def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
191
213
|
"""
|
|
192
214
|
Purge all items and derived artifacts from a corpus.
|
|
@@ -240,6 +262,44 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
240
262
|
return config
|
|
241
263
|
|
|
242
264
|
|
|
265
|
+
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
266
|
+
"""
|
|
267
|
+
Parse a cascade step specification.
|
|
268
|
+
|
|
269
|
+
:param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
|
|
270
|
+
:type raw_step: str
|
|
271
|
+
:return: Tuple of extractor_id and config mapping.
|
|
272
|
+
:rtype: tuple[str, dict[str, object]]
|
|
273
|
+
:raises ValueError: If the step spec is invalid.
|
|
274
|
+
"""
|
|
275
|
+
|
|
276
|
+
raw_step = raw_step.strip()
|
|
277
|
+
if not raw_step:
|
|
278
|
+
raise ValueError("Step spec must be non-empty")
|
|
279
|
+
if ":" not in raw_step:
|
|
280
|
+
return raw_step, {}
|
|
281
|
+
extractor_id, raw_pairs = raw_step.split(":", 1)
|
|
282
|
+
extractor_id = extractor_id.strip()
|
|
283
|
+
if not extractor_id:
|
|
284
|
+
raise ValueError("Step spec must start with an extractor identifier")
|
|
285
|
+
config: Dict[str, object] = {}
|
|
286
|
+
raw_pairs = raw_pairs.strip()
|
|
287
|
+
if not raw_pairs:
|
|
288
|
+
return extractor_id, {}
|
|
289
|
+
for token in raw_pairs.split(","):
|
|
290
|
+
token = token.strip()
|
|
291
|
+
if not token:
|
|
292
|
+
continue
|
|
293
|
+
if "=" not in token:
|
|
294
|
+
raise ValueError(f"Step config values must be key=value (got {token!r})")
|
|
295
|
+
key, value = token.split("=", 1)
|
|
296
|
+
key = key.strip()
|
|
297
|
+
if not key:
|
|
298
|
+
raise ValueError("Step config keys must be non-empty")
|
|
299
|
+
config[key] = value
|
|
300
|
+
return extractor_id, config
|
|
301
|
+
|
|
302
|
+
|
|
243
303
|
def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
244
304
|
"""
|
|
245
305
|
Build a QueryBudget from command-line interface arguments.
|
|
@@ -279,6 +339,40 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
279
339
|
return 0
|
|
280
340
|
|
|
281
341
|
|
|
342
|
+
def cmd_extract(arguments: argparse.Namespace) -> int:
|
|
343
|
+
"""
|
|
344
|
+
Build a text extraction run for the corpus.
|
|
345
|
+
|
|
346
|
+
:param arguments: Parsed command-line interface arguments.
|
|
347
|
+
:type arguments: argparse.Namespace
|
|
348
|
+
:return: Exit code.
|
|
349
|
+
:rtype: int
|
|
350
|
+
"""
|
|
351
|
+
|
|
352
|
+
corpus = (
|
|
353
|
+
Corpus.open(arguments.corpus)
|
|
354
|
+
if getattr(arguments, "corpus", None)
|
|
355
|
+
else Corpus.find(Path.cwd())
|
|
356
|
+
)
|
|
357
|
+
config = _parse_config_pairs(arguments.config)
|
|
358
|
+
if getattr(arguments, "step", None):
|
|
359
|
+
if arguments.extractor != "cascade":
|
|
360
|
+
raise ValueError("--step is only supported for the cascade extractor")
|
|
361
|
+
steps: List[Dict[str, object]] = []
|
|
362
|
+
for raw_step in arguments.step:
|
|
363
|
+
extractor_id, step_config = _parse_step_spec(raw_step)
|
|
364
|
+
steps.append({"extractor_id": extractor_id, "config": step_config})
|
|
365
|
+
config = {"steps": steps}
|
|
366
|
+
manifest = build_extraction_run(
|
|
367
|
+
corpus,
|
|
368
|
+
extractor_id=arguments.extractor,
|
|
369
|
+
recipe_name=arguments.recipe_name,
|
|
370
|
+
config=config,
|
|
371
|
+
)
|
|
372
|
+
print(manifest.model_dump_json(indent=2))
|
|
373
|
+
return 0
|
|
374
|
+
|
|
375
|
+
|
|
282
376
|
def cmd_query(arguments: argparse.Namespace) -> int:
|
|
283
377
|
"""
|
|
284
378
|
Execute a retrieval query.
|
|
@@ -390,6 +484,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
390
484
|
_add_common_corpus_arg(p_reindex)
|
|
391
485
|
p_reindex.set_defaults(func=cmd_reindex)
|
|
392
486
|
|
|
487
|
+
p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
|
|
488
|
+
_add_common_corpus_arg(p_import_tree)
|
|
489
|
+
p_import_tree.add_argument("path", help="Folder tree root to import.")
|
|
490
|
+
p_import_tree.add_argument("--tags", default=None, help="Comma-separated tags to apply to imported items.")
|
|
491
|
+
p_import_tree.add_argument("--tag", action="append", help="Repeatable tag to apply to imported items.")
|
|
492
|
+
p_import_tree.set_defaults(func=cmd_import_tree)
|
|
493
|
+
|
|
393
494
|
p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
|
|
394
495
|
_add_common_corpus_arg(p_purge)
|
|
395
496
|
p_purge.add_argument(
|
|
@@ -415,6 +516,28 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
415
516
|
)
|
|
416
517
|
p_build.set_defaults(func=cmd_build)
|
|
417
518
|
|
|
519
|
+
p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
|
|
520
|
+
_add_common_corpus_arg(p_extract)
|
|
521
|
+
p_extract.add_argument(
|
|
522
|
+
"--extractor",
|
|
523
|
+
required=True,
|
|
524
|
+
help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
|
|
525
|
+
)
|
|
526
|
+
p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
|
|
527
|
+
p_extract.add_argument(
|
|
528
|
+
"--step",
|
|
529
|
+
action="append",
|
|
530
|
+
default=None,
|
|
531
|
+
help="Cascade step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
|
|
532
|
+
)
|
|
533
|
+
p_extract.add_argument(
|
|
534
|
+
"--config",
|
|
535
|
+
action="append",
|
|
536
|
+
default=None,
|
|
537
|
+
help="Extractor config as key=value (repeatable).",
|
|
538
|
+
)
|
|
539
|
+
p_extract.set_defaults(func=cmd_extract)
|
|
540
|
+
|
|
418
541
|
p_query = sub.add_parser("query", help="Run a retrieval query.")
|
|
419
542
|
_add_common_corpus_arg(p_query)
|
|
420
543
|
p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
|