biblicus 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/cli.py
ADDED
|
@@ -0,0 +1,804 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import json
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import ValidationError
|
|
14
|
+
|
|
15
|
+
from .backends import get_backend
|
|
16
|
+
from .context import (
|
|
17
|
+
ContextPackPolicy,
|
|
18
|
+
TokenBudget,
|
|
19
|
+
build_context_pack,
|
|
20
|
+
fit_context_pack_to_token_budget,
|
|
21
|
+
)
|
|
22
|
+
from .corpus import Corpus
|
|
23
|
+
from .crawl import CrawlRequest, crawl_into_corpus
|
|
24
|
+
from .errors import ExtractionRunFatalError
|
|
25
|
+
from .evaluation import evaluate_run, load_dataset
|
|
26
|
+
from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
|
|
27
|
+
from .extraction import build_extraction_run
|
|
28
|
+
from .models import QueryBudget, RetrievalResult, parse_extraction_run_reference
|
|
29
|
+
from .uris import corpus_ref_to_path
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
    """
    Register the shared ``--corpus`` option on a parser.

    The option uses ``argparse.SUPPRESS`` as its default, so the ``corpus``
    attribute is only set on the namespace when the flag is actually given.

    :param parser: Parser that receives the option.
    :type parser: argparse.ArgumentParser
    :return: None.
    :rtype: None
    """
    corpus_help = (
        "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
        "upward)."
    )
    parser.add_argument(
        "--corpus",
        dest="corpus",
        type=str,
        default=argparse.SUPPRESS,
        help=corpus_help,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def cmd_init(arguments: argparse.Namespace) -> int:
    """
    Create a new corpus at the location named on the command line.

    :param arguments: Parsed command-line interface arguments (reads ``path`` and ``force``).
    :type arguments: argparse.Namespace
    :return: Exit code (always 0 on success).
    :rtype: int
    """
    target_path = corpus_ref_to_path(arguments.path)
    created_corpus = Corpus.init(target_path, force=arguments.force)
    print(f"Initialized corpus at {created_corpus.root}")
    return 0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
    """
    Merge tags from both command-line forms into one ordered, unique list.

    Tags from the comma-separated string come first, followed by tags from the
    repeated option. Each tag is stripped of surrounding whitespace, empty
    entries are dropped, and only the first occurrence of a tag is kept.

    :param raw: Comma-separated tag string.
    :type raw: str or None
    :param raw_list: Repeated tag list.
    :type raw_list: list[str] or None
    :return: Deduplicated tag list in first-seen order.
    :rtype: list[str]
    """
    candidates: List[str] = []
    if raw:
        candidates += raw.split(",")
    if raw_list:
        candidates += raw_list

    cleaned = (candidate.strip() for candidate in candidates)
    # dict.fromkeys keeps insertion order, which gives first-occurrence deduplication.
    return list(dict.fromkeys(tag for tag in cleaned if tag))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def cmd_ingest(arguments: argparse.Namespace) -> int:
    """
    Ingest items into a corpus from command-line interface arguments.

    A note (from ``--note`` or standard input) is ingested first, followed by
    every file path given positionally. One tab-separated line
    (item id, relative path, sha256) is printed per ingested item.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code: 0 on success, 2 when nothing was supplied to ingest.
    :rtype: int
    """
    corpus = (
        Corpus.open(arguments.corpus)
        if getattr(arguments, "corpus", None)
        else Corpus.find(Path.cwd())
    )
    tags = _parse_tags(arguments.tags, arguments.tag)

    results = []

    if arguments.note is not None or arguments.stdin:
        # --note takes precedence over --stdin for the text, so the recorded
        # source must follow the same precedence; otherwise a literal note
        # would be labelled as having come from standard input.
        note_supplied = arguments.note is not None
        text = arguments.note if note_supplied else sys.stdin.read()
        ingest_result = corpus.ingest_note(
            text,
            title=arguments.title,
            tags=tags,
            source_uri="text" if note_supplied else "stdin",
        )
        results.append(ingest_result)

    for source_path in arguments.files or []:
        results.append(corpus.ingest_source(source_path, tags=tags))

    if not results:
        print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
        return 2

    for ingest_result in results:
        print(f"{ingest_result.item_id}\t{ingest_result.relpath}\t{ingest_result.sha256}")
    return 0
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def cmd_list(arguments: argparse.Namespace) -> int:
    """
    Print one tab-separated line per corpus item.

    Columns are: id, creation time, relative path, title (empty when unset)
    and the comma-joined tags.

    :param arguments: Parsed command-line interface arguments (reads ``limit``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    for entry in corpus.list_items(limit=arguments.limit):
        shown_title = entry.title if entry.title else ""
        joined_tags = ",".join(entry.tags)
        print(f"{entry.id}\t{entry.created_at}\t{entry.relpath}\t{shown_title}\t{joined_tags}")
    return 0
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def cmd_show(arguments: argparse.Namespace) -> int:
    """
    Print a single corpus item as indented JavaScript Object Notation.

    :param arguments: Parsed command-line interface arguments (reads ``id``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    found_item = corpus.get_item(arguments.id)
    rendered = found_item.model_dump_json(indent=2)
    print(rendered)
    return 0
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def cmd_reindex(arguments: argparse.Namespace) -> int:
    """
    Run the corpus reindex and print the statistics it returns.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    reindex_stats = corpus.reindex()
    rendered = json.dumps(reindex_stats, indent=2, sort_keys=False)
    print(rendered)
    return 0
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def cmd_import_tree(arguments: argparse.Namespace) -> int:
    """
    Import every item under a folder into the corpus and print the resulting statistics.

    :param arguments: Parsed command-line interface arguments (reads ``path``, ``tags`` and ``tag``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    applied_tags = _parse_tags(arguments.tags, arguments.tag)
    tree_root = Path(arguments.path)
    import_stats = corpus.import_tree(tree_root, tags=applied_tags)
    print(json.dumps(import_stats, indent=2, sort_keys=False))
    return 0
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def cmd_purge(arguments: argparse.Namespace) -> int:
    """
    Purge a corpus after an explicit confirmation value was supplied.

    :param arguments: Parsed command-line interface arguments (reads ``confirm``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If ``--confirm`` was not given at all.
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    confirmation = arguments.confirm
    if confirmation is None:
        # Tell the user exactly which value the confirmation flag expects.
        raise ValueError(f"Purging is dangerous: pass --confirm {corpus.name!r} to proceed")

    corpus.purge(confirm=confirmation)
    print(f"Purged corpus {corpus.root}")
    return 0
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
    """
    Parse repeated key=value config pairs.

    Each value is coerced to the narrowest type that accepts it: ``int``
    first, then ``float``, and otherwise it is kept as the raw string. Only
    the first ``=`` splits key from value, so values may contain ``=``.
    A later pair with the same key overwrites an earlier one.

    :param pairs: Config pairs supplied via the command-line interface.
    :type pairs: list[str] or None
    :return: Parsed config mapping.
    :rtype: dict[str, object]
    :raises ValueError: If any entry is not key=value or has an empty key.
    """
    config: Dict[str, object] = {}
    for item in pairs or []:
        if "=" not in item:
            raise ValueError(f"Config values must be key=value (got {item!r})")
        key, raw = item.split("=", 1)
        key = key.strip()
        if not key:
            raise ValueError("Config keys must be non-empty")
        config[key] = _coerce_config_value(raw)
    return config


def _coerce_config_value(raw: str) -> object:
    """
    Convert one raw config value to int, float, or leave it as a string.

    ``int()`` is attempted directly rather than gated on ``str.isdigit()``:
    ``isdigit`` is true for characters such as superscript digits that
    ``int()`` rejects, and it is false for signed integers like ``-5``.

    :param raw: Text to the right of the first ``=``.
    :type raw: str
    :return: Coerced value.
    :rtype: object
    """
    for converter in (int, float):
        try:
            return converter(raw)
        except ValueError:
            continue
    return raw
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
    """
    Split a pipeline step specification into its extractor and its options.

    Accepted forms are ``extractor_id`` and
    ``extractor_id:key=value,key=value``. Option values are kept as strings,
    and empty comma-separated entries are ignored.

    :param raw_step: Step spec text as given on the command line.
    :type raw_step: str
    :return: Tuple of extractor_id and config mapping.
    :rtype: tuple[str, dict[str, object]]
    :raises ValueError: If the spec is empty, lacks an extractor identifier,
        or contains an option that is not ``key=value`` with a non-empty key.
    """
    spec = raw_step.strip()
    if not spec:
        raise ValueError("Step spec must be non-empty")

    head, separator, tail = spec.partition(":")
    if not separator:
        # No colon at all: the whole spec is the extractor identifier.
        return spec, {}

    extractor_id = head.strip()
    if not extractor_id:
        raise ValueError("Step spec must start with an extractor identifier")

    options: Dict[str, object] = {}
    for piece in tail.strip().split(","):
        token = piece.strip()
        if not token:
            continue
        if "=" not in token:
            raise ValueError(f"Step config values must be key=value (got {token!r})")
        option_name, _, option_value = token.partition("=")
        option_name = option_name.strip()
        if not option_name:
            raise ValueError("Step config keys must be non-empty")
        options[option_name] = option_value
    return extractor_id, options
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
    """
    Translate the three budget options on the namespace into a QueryBudget.

    :param arguments: Parsed command-line interface arguments (reads
        ``max_total_items``, ``max_total_characters`` and ``max_items_per_source``).
    :type arguments: argparse.Namespace
    :return: Query budget instance.
    :rtype: QueryBudget
    """
    item_limit = arguments.max_total_items
    character_limit = arguments.max_total_characters
    per_source_limit = arguments.max_items_per_source
    return QueryBudget(
        max_total_items=item_limit,
        max_total_characters=character_limit,
        max_items_per_source=per_source_limit,
    )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def cmd_build(arguments: argparse.Namespace) -> int:
    """
    Build a retrieval run with the requested backend and print it as JavaScript Object Notation.

    :param arguments: Parsed command-line interface arguments (reads ``backend``,
        ``config`` and ``recipe_name``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    selected_backend = get_backend(arguments.backend)
    backend_config = _parse_config_pairs(arguments.config)
    built_run = selected_backend.build_run(
        corpus,
        recipe_name=arguments.recipe_name,
        config=backend_config,
    )
    print(built_run.model_dump_json(indent=2))
    return 0
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def cmd_extract_build(arguments: argparse.Namespace) -> int:
    """
    Build a text extraction run from the ``--step`` pipeline and print its manifest.

    Every ``--step`` value is parsed into an extractor identifier plus config,
    and the resulting list is passed to the ``pipeline`` extractor.

    :param arguments: Parsed command-line interface arguments (reads ``step`` and ``recipe_name``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no ``--step`` was given or a step spec is invalid.
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    step_specs = list(arguments.step or [])
    if not step_specs:
        raise ValueError("Pipeline extraction requires at least one --step")

    pipeline_steps: List[Dict[str, object]] = [
        {"extractor_id": step_extractor_id, "config": step_options}
        for step_extractor_id, step_options in map(_parse_step_spec, step_specs)
    ]
    run_manifest = build_extraction_run(
        corpus,
        extractor_id="pipeline",
        recipe_name=arguments.recipe_name,
        config={"steps": pipeline_steps},
    )
    print(run_manifest.model_dump_json(indent=2))
    return 0
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def cmd_extract_list(arguments: argparse.Namespace) -> int:
    """
    Print the corpus's extraction runs as a JavaScript Object Notation array.

    :param arguments: Parsed command-line interface arguments (reads ``extractor_id``,
        which is passed through as a filter).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    listed_runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
    serializable = [listed_run.model_dump() for listed_run in listed_runs]
    print(json.dumps(serializable, indent=2))
    return 0
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def cmd_extract_show(arguments: argparse.Namespace) -> int:
    """
    Load one extraction run manifest and print it as JavaScript Object Notation.

    :param arguments: Parsed command-line interface arguments (reads ``run``, an
        extraction run reference).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    run_reference = parse_extraction_run_reference(arguments.run)
    run_manifest = corpus.load_extraction_run_manifest(
        extractor_id=run_reference.extractor_id,
        run_id=run_reference.run_id,
    )
    print(run_manifest.model_dump_json(indent=2))
    return 0
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def cmd_extract_delete(arguments: argparse.Namespace) -> int:
    """
    Delete an extraction run once ``--confirm`` repeats the ``--run`` value exactly.

    :param arguments: Parsed command-line interface arguments (reads ``run`` and ``confirm``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If ``--confirm`` differs from ``--run``.
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    requested_run = arguments.run
    if arguments.confirm != requested_run:
        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")

    run_reference = parse_extraction_run_reference(requested_run)
    corpus.delete_extraction_run(
        extractor_id=run_reference.extractor_id,
        run_id=run_reference.run_id,
    )
    outcome = {"deleted": True, "run": requested_run}
    print(json.dumps(outcome, indent=2))
    return 0
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def cmd_query(arguments: argparse.Namespace) -> int:
    """
    Run a retrieval query against a stored run and print the result.

    The run defaults to the corpus's latest run, and the query text defaults
    to standard input. After retrieval, an optional reranker and an optional
    minimum-score filter are applied to the evidence, in that order.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run can be determined, or ``--backend`` does not
        match the backend recorded in the run.
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
    run = corpus.load_run(run_id)

    requested_backend = arguments.backend
    recorded_backend = run.recipe.backend_id
    if requested_backend and requested_backend != recorded_backend:
        raise ValueError(
            f"Backend mismatch: run uses {recorded_backend!r} but {requested_backend!r} was requested"
        )
    backend = get_backend(recorded_backend)

    if arguments.query is not None:
        query_text = arguments.query
    else:
        query_text = sys.stdin.read()

    budget = _budget_from_args(arguments)
    result = backend.query(corpus, run=run, query_text=query_text, budget=budget)

    evidence = result.evidence
    reranker_id = getattr(arguments, "reranker_id", None)
    if reranker_id:
        evidence = apply_evidence_reranker(
            reranker_id=reranker_id,
            query_text=result.query_text,
            evidence=evidence,
        )

    minimum_score = getattr(arguments, "minimum_score", None)
    if minimum_score is not None:
        evidence = apply_evidence_filter(
            filter_id="filter-minimum-score",
            query_text=result.query_text,
            evidence=evidence,
            config={"minimum_score": float(minimum_score)},
        )

    # Only copy the result when post-processing produced a different evidence object.
    if evidence is not result.evidence:
        result = result.model_copy(update={"evidence": evidence})

    print(result.model_dump_json(indent=2))
    return 0
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
    """
    Build a context pack from a retrieval result.

    The retrieval result is read from standard input as JavaScript Object Notation.
    The policy and the resulting context pack are printed together as one
    JavaScript Object Notation object.

    :param arguments: Parsed command-line interface arguments (reads ``join_with``
        and ``max_tokens``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If standard input is empty or ``--join-with`` contains a
        malformed escape sequence.
    """
    input_text = sys.stdin.read()
    if not input_text.strip():
        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
    retrieval_result = RetrievalResult.model_validate_json(input_text)
    # Interpret backslash escapes such as "\n" typed on the command line.
    # The unicode_escape codec reads bytes as Latin-1, so encoding through
    # UTF-8 would corrupt any non-ASCII separator. Encoding to Latin-1 with
    # backslashreplace keeps characters below U+0100 intact and turns the rest
    # into \uXXXX escapes that the decoder restores.
    join_with = arguments.join_with.encode("latin-1", "backslashreplace").decode("unicode_escape")
    policy = ContextPackPolicy(join_with=join_with)
    context_pack = build_context_pack(retrieval_result, policy=policy)
    if arguments.max_tokens is not None:
        context_pack = fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=int(arguments.max_tokens)),
        )
    print(
        json.dumps(
            {
                "policy": policy.model_dump(),
                "context_pack": context_pack.model_dump(),
            },
            indent=2,
        )
    )
    return 0
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def cmd_eval(arguments: argparse.Namespace) -> int:
    """
    Evaluate a stored retrieval run against a dataset file and print the result.

    The run defaults to the corpus's latest run when ``--run`` is omitted.

    :param arguments: Parsed command-line interface arguments (reads ``run``,
        ``dataset`` and the budget options).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run identifier can be determined.
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")

    loaded_run = corpus.load_run(run_id)
    loaded_dataset = load_dataset(Path(arguments.dataset))
    query_budget = _budget_from_args(arguments)
    evaluation = evaluate_run(
        corpus=corpus,
        run=loaded_run,
        dataset=loaded_dataset,
        budget=query_budget,
    )
    print(evaluation.model_dump_json(indent=2))
    return 0
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def cmd_crawl(arguments: argparse.Namespace) -> int:
    """
    Crawl from a root address into the corpus and print the crawl result.

    :param arguments: Parsed command-line interface arguments (reads ``root_url``,
        ``allowed_prefix``, ``max_items``, ``tags`` and ``tag``).
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """
    corpus_reference = getattr(arguments, "corpus", None)
    if corpus_reference:
        corpus = Corpus.open(corpus_reference)
    else:
        corpus = Corpus.find(Path.cwd())

    applied_tags = _parse_tags(arguments.tags, arguments.tag)
    crawl_request = CrawlRequest(
        root_url=arguments.root_url,
        allowed_prefix=arguments.allowed_prefix,
        max_items=arguments.max_items,
        tags=applied_tags,
    )
    crawl_result = crawl_into_corpus(corpus=corpus, request=crawl_request)
    print(crawl_result.model_dump_json(indent=2))
    return 0
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def build_parser() -> argparse.ArgumentParser:
    """
    Build the command-line interface argument parser.

    The parser has one required subcommand level (``cmd``). Two subcommands,
    ``extract`` and ``context-pack``, carry a second required level of their
    own. Every leaf parser stores its handler in ``func`` via ``set_defaults``.

    :return: Argument parser instance.
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog="biblicus",
        description="Biblicus command-line interface (minimum viable product)",
    )
    # Top-level --corpus defaults to None; the per-subcommand copies added by
    # _add_common_corpus_arg use argparse.SUPPRESS instead (presumably so that an
    # absent subcommand flag does not overwrite a value given before the subcommand).
    parser.add_argument(
        "--corpus",
        type=str,
        default=None,
        dest="corpus",
        help=(
            "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
            "upward). "
            "Can be provided before or after the subcommand."
        ),
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    # init: takes its target as a positional path, so it gets no --corpus option.
    p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
    p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
    p_init.add_argument(
        "--force", action="store_true", help="Overwrite existing config if present."
    )
    p_init.set_defaults(func=cmd_init)

    # ingest: files, a literal note, and/or standard input.
    p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
    _add_common_corpus_arg(p_ingest)
    p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
    p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
    p_ingest.add_argument(
        "--stdin", action="store_true", help="Read text to ingest from standard input."
    )
    p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
    p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
    p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
    p_ingest.set_defaults(func=cmd_ingest)

    # list
    p_list = sub.add_parser("list", help="List recently ingested items.")
    _add_common_corpus_arg(p_list)
    p_list.add_argument("--limit", type=int, default=50)
    p_list.set_defaults(func=cmd_list)

    # show
    p_show = sub.add_parser("show", help="Show metadata for an item identifier.")
    _add_common_corpus_arg(p_show)
    p_show.add_argument("id", help="Item identifier (universally unique identifier).")
    p_show.set_defaults(func=cmd_show)

    # reindex
    p_reindex = sub.add_parser(
        "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
    )
    _add_common_corpus_arg(p_reindex)
    p_reindex.set_defaults(func=cmd_reindex)

    # import-tree
    p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
    _add_common_corpus_arg(p_import_tree)
    p_import_tree.add_argument("path", help="Folder tree root to import.")
    p_import_tree.add_argument(
        "--tags", default=None, help="Comma-separated tags to apply to imported items."
    )
    p_import_tree.add_argument(
        "--tag", action="append", help="Repeatable tag to apply to imported items."
    )
    p_import_tree.set_defaults(func=cmd_import_tree)

    # purge: --confirm is optional at parse time; cmd_purge raises when it is missing.
    p_purge = sub.add_parser(
        "purge", help="Delete all items and derived files (requires confirmation)."
    )
    _add_common_corpus_arg(p_purge)
    p_purge.add_argument(
        "--confirm",
        default=None,
        help="Type the corpus name (directory basename) to confirm purging.",
    )
    p_purge.set_defaults(func=cmd_purge)

    # build: retrieval backend run.
    p_build = sub.add_parser("build", help="Build a retrieval backend run for the corpus.")
    _add_common_corpus_arg(p_build)
    p_build.add_argument(
        "--backend",
        required=True,
        help="Backend identifier (for example, scan, sqlite-full-text-search).",
    )
    p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
    p_build.add_argument(
        "--config",
        action="append",
        default=None,
        help="Backend config as key=value (repeatable).",
    )
    p_build.set_defaults(func=cmd_build)

    # extract: nested subcommands build / list / show / delete.
    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
    extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)

    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
    _add_common_corpus_arg(p_extract_build)
    p_extract_build.add_argument(
        "--recipe-name", default="default", help="Human-readable recipe name."
    )
    p_extract_build.add_argument(
        "--step",
        action="append",
        default=None,
        help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
    )
    p_extract_build.set_defaults(func=cmd_extract_build)

    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
    _add_common_corpus_arg(p_extract_list)
    p_extract_list.add_argument(
        "--extractor-id",
        default=None,
        help="Optional extractor identifier filter (for example: pipeline).",
    )
    p_extract_list.set_defaults(func=cmd_extract_list)

    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
    _add_common_corpus_arg(p_extract_show)
    p_extract_show.add_argument(
        "--run",
        required=True,
        help="Extraction run reference in the form extractor_id:run_id.",
    )
    p_extract_show.set_defaults(func=cmd_extract_show)

    # delete: both --run and --confirm are required; cmd_extract_delete compares them.
    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
    _add_common_corpus_arg(p_extract_delete)
    p_extract_delete.add_argument(
        "--run",
        required=True,
        help="Extraction run reference in the form extractor_id:run_id.",
    )
    p_extract_delete.add_argument(
        "--confirm",
        required=True,
        help="Type the exact extractor_id:run_id to confirm deletion.",
    )
    p_extract_delete.set_defaults(func=cmd_extract_delete)

    # query: the three --max-* options feed _budget_from_args.
    p_query = sub.add_parser("query", help="Run a retrieval query.")
    _add_common_corpus_arg(p_query)
    p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
    p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
    p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
    p_query.add_argument("--max-total-items", type=int, default=5)
    p_query.add_argument("--max-total-characters", type=int, default=2000)
    p_query.add_argument("--max-items-per-source", type=int, default=5)
    p_query.add_argument(
        "--reranker-id",
        default=None,
        help="Optional reranker identifier to apply after retrieval (for example: rerank-longest-text).",
    )
    p_query.add_argument(
        "--minimum-score",
        type=float,
        default=None,
        help="Optional minimum score threshold to filter evidence after retrieval.",
    )
    p_query.set_defaults(func=cmd_query)

    # context-pack: nested subcommand build; it gets no --corpus option of its own.
    p_context_pack = sub.add_parser("context-pack", help="Build context pack text from evidence.")
    context_pack_sub = p_context_pack.add_subparsers(dest="context_pack_command", required=True)

    p_context_pack_build = context_pack_sub.add_parser(
        "build", help="Build a context pack from a retrieval result JavaScript Object Notation."
    )
    # The default is the four literal characters backslash-n-backslash-n;
    # cmd_context_pack_build decodes the escape sequences.
    p_context_pack_build.add_argument(
        "--join-with",
        default="\\n\\n",
        help="Separator between evidence blocks (escape sequences supported, default is two newlines).",
    )
    p_context_pack_build.add_argument(
        "--max-tokens",
        default=None,
        type=int,
        help="Optional token budget for the final context pack using the naive-whitespace tokenizer.",
    )
    p_context_pack_build.set_defaults(func=cmd_context_pack_build)

    # eval: same budget options and defaults as query.
    p_eval = sub.add_parser("eval", help="Evaluate a run against a dataset.")
    _add_common_corpus_arg(p_eval)
    p_eval.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
    p_eval.add_argument(
        "--dataset",
        required=True,
        help="Path to dataset JavaScript Object Notation file.",
    )
    p_eval.add_argument("--max-total-items", type=int, default=5)
    p_eval.add_argument("--max-total-characters", type=int, default=2000)
    p_eval.add_argument("--max-items-per-source", type=int, default=5)
    p_eval.set_defaults(func=cmd_eval)

    # crawl
    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
    _add_common_corpus_arg(p_crawl)
    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
    p_crawl.add_argument(
        "--allowed-prefix",
        required=True,
        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
    )
    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
    p_crawl.set_defaults(func=cmd_crawl)

    return parser
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def main(argument_list: Optional[List[str]] = None) -> int:
    """
    Entry point for the Biblicus command-line interface.

    Parses the arguments, dispatches to the handler stored in ``func`` and
    converts the listed exception types into a message on standard error
    plus exit code 2. Any other exception propagates to the caller.

    :param argument_list: Optional command-line interface arguments.
    :type argument_list: list[str] or None
    :return: Exit code.
    :rtype: int
    """
    parser = build_parser()
    arguments = parser.parse_args(argument_list)
    try:
        return int(arguments.func(arguments))
    except (
        FileNotFoundError,
        FileExistsError,
        KeyError,
        ValueError,
        ExtractionRunFatalError,
        NotImplementedError,
        ValidationError,
    ) as exception:
        exception_arguments = getattr(exception, "args", None) or ()
        if len(exception_arguments) == 1:
            # A single argument is the message itself; using it directly
            # avoids the quoting that str(KeyError) adds around the key.
            message = exception_arguments[0]
        else:
            # Exceptions with several arguments keep their message elsewhere:
            # an operating-system FileNotFoundError has (errno, strerror) and a
            # UnicodeDecodeError starts with the codec name. Taking args[0]
            # would print just "2" or "utf-8", so use the full string form.
            message = str(exception)
        print(str(message), file=sys.stderr)
        return 2
|