biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/cli.py ADDED
@@ -0,0 +1,468 @@
1
+ """
2
+ Command-line interface for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
+ from pydantic import ValidationError
14
+
15
+ from .backends import get_backend
16
+ from .corpus import Corpus
17
+ from .evaluation import evaluate_run, load_dataset
18
+ from .models import QueryBudget
19
+ from .uris import corpus_ref_to_path
20
+
21
+
22
+ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
23
+ """
24
+ Add the common --corpus argument to a parser.
25
+
26
+ :param parser: Argument parser to modify.
27
+ :type parser: argparse.ArgumentParser
28
+ :return: None.
29
+ :rtype: None
30
+ """
31
+
32
+ parser.add_argument(
33
+ "--corpus",
34
+ type=str,
35
+ default=argparse.SUPPRESS,
36
+ dest="corpus",
37
+ help=(
38
+ "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
39
+ "upward)."
40
+ ),
41
+ )
42
+
43
+
44
def cmd_init(arguments: argparse.Namespace) -> int:
    """
    Create a new corpus at the location named by ``arguments.path``.

    :param arguments: Parsed command-line interface arguments; ``path`` and ``force`` are read.
    :type arguments: argparse.Namespace
    :return: Exit code (``0`` once the corpus has been initialized).
    :rtype: int
    """

    target_path = corpus_ref_to_path(arguments.path)
    new_corpus = Corpus.init(target_path, force=arguments.force)
    root_location = new_corpus.root
    print(f"Initialized corpus at {root_location}")
    return 0
58
+
59
+
60
+ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
61
+ """
62
+ Parse and deduplicate tag strings.
63
+
64
+ :param raw: Comma-separated tag string.
65
+ :type raw: str or None
66
+ :param raw_list: Repeated tag list.
67
+ :type raw_list: list[str] or None
68
+ :return: Deduplicated tag list.
69
+ :rtype: list[str]
70
+ """
71
+
72
+ parsed_tags: List[str] = []
73
+ if raw:
74
+ parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
75
+ if raw_list:
76
+ parsed_tags.extend([tag.strip() for tag in raw_list if tag.strip()])
77
+
78
+ seen_tags = set()
79
+ deduplicated_tags: List[str] = []
80
+ for tag_value in parsed_tags:
81
+ if tag_value not in seen_tags:
82
+ seen_tags.add(tag_value)
83
+ deduplicated_tags.append(tag_value)
84
+ return deduplicated_tags
85
+
86
+
87
def cmd_ingest(arguments: argparse.Namespace) -> int:
    """
    Ingest items into a corpus from command-line interface arguments.

    Text is ingested first (from ``--note`` or, failing that, standard input),
    followed by every path in ``arguments.files``. A literal ``--note`` takes
    precedence over ``--stdin``: when both are given only the note is ingested and
    standard input is not read. The ``source_uri`` passed to ``ingest_note``
    reflects where the text actually came from (``"text"`` for a note,
    ``"stdin"`` for standard input).

    One tab-separated line (item identifier, relative path, SHA-256) is printed
    per ingested item.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code: ``0`` on success, ``2`` when nothing was supplied to ingest.
    :rtype: int
    """

    corpus = (
        Corpus.open(arguments.corpus)
        if getattr(arguments, "corpus", None)
        else Corpus.find(Path.cwd())
    )
    tags = _parse_tags(arguments.tags, arguments.tag)

    results = []

    if arguments.note is not None or arguments.stdin:
        # Label the source by the branch that really supplied the text, so that
        # "--note X --stdin" is not recorded as having come from standard input.
        if arguments.note is not None:
            text, source_uri = arguments.note, "text"
        else:
            text, source_uri = sys.stdin.read(), "stdin"
        ingest_result = corpus.ingest_note(
            text,
            title=arguments.title,
            tags=tags,
            source_uri=source_uri,
        )
        results.append(ingest_result)

    for source_path in arguments.files or []:
        results.append(corpus.ingest_source(source_path, tags=tags))

    if not results:
        print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
        return 2

    for ingest_result in results:
        print(f"{ingest_result.item_id}\t{ingest_result.relpath}\t{ingest_result.sha256}")
    return 0
126
+
127
+
128
def cmd_list(arguments: argparse.Namespace) -> int:
    """
    Print one tab-separated line per corpus item.

    The columns are: identifier, creation timestamp, relative path, title (empty
    when the item has none), and the comma-joined tags.

    :param arguments: Parsed command-line interface arguments; ``limit`` is forwarded to the corpus.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    for entry in corpus.list_items(limit=arguments.limit):
        shown_title = entry.title or ""
        columns = (entry.id, entry.created_at, entry.relpath, shown_title, ",".join(entry.tags))
        print("\t".join(format(column) for column in columns))
    return 0
148
+
149
+
150
def cmd_show(arguments: argparse.Namespace) -> int:
    """
    Print a single corpus item as indented JSON.

    :param arguments: Parsed command-line interface arguments; ``id`` selects the item.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    rendered_item = corpus.get_item(arguments.id).model_dump_json(indent=2)
    print(rendered_item)
    return 0
168
+
169
+
170
def cmd_reindex(arguments: argparse.Namespace) -> int:
    """
    Run ``Corpus.reindex`` and print the statistics it returns as indented JSON.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    reindex_stats = corpus.reindex()
    # Keys are emitted in the order the corpus reported them.
    report = json.dumps(reindex_stats, indent=2, sort_keys=False)
    print(report)
    return 0
188
+
189
+
190
def cmd_purge(arguments: argparse.Namespace) -> int:
    """
    Purge a corpus after the caller has supplied ``--confirm``.

    The corpus is opened before the confirmation check because the error message
    quotes the corpus name the user is expected to type.

    :param arguments: Parsed command-line interface arguments; ``confirm`` is forwarded to ``Corpus.purge``.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If ``--confirm`` was not supplied.
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    confirmation = arguments.confirm
    if confirmation is None:
        raise ValueError(f"Purging is dangerous: pass --confirm {corpus.name!r} to proceed")

    corpus.purge(confirm=confirmation)
    print(f"Purged corpus {corpus.root}")
    return 0
210
+
211
+
212
+ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
213
+ """
214
+ Parse repeated key=value config pairs.
215
+
216
+ :param pairs: Config pairs supplied via the command-line interface.
217
+ :type pairs: list[str] or None
218
+ :return: Parsed config mapping.
219
+ :rtype: dict[str, object]
220
+ :raises ValueError: If any entry is not key=value.
221
+ """
222
+
223
+ config: Dict[str, object] = {}
224
+ for item in pairs or []:
225
+ if "=" not in item:
226
+ raise ValueError(f"Config values must be key=value (got {item!r})")
227
+ key, raw = item.split("=", 1)
228
+ key = key.strip()
229
+ if not key:
230
+ raise ValueError("Config keys must be non-empty")
231
+ value: object = raw
232
+ if raw.isdigit():
233
+ value = int(raw)
234
+ else:
235
+ try:
236
+ value = float(raw)
237
+ except ValueError:
238
+ value = raw
239
+ config[key] = value
240
+ return config
241
+
242
+
243
def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
    """
    Copy the three budget options from the parsed arguments into a QueryBudget.

    :param arguments: Parsed command-line interface arguments; ``max_total_items``,
        ``max_total_characters`` and ``max_items_per_source`` are read.
    :type arguments: argparse.Namespace
    :return: Query budget instance.
    :rtype: QueryBudget
    """

    budget_fields = ("max_total_items", "max_total_characters", "max_items_per_source")
    # Namespace attribute names match the QueryBudget keyword names one-to-one.
    budget_values = {field: getattr(arguments, field) for field in budget_fields}
    return QueryBudget(**budget_values)
258
+
259
+
260
def cmd_build(arguments: argparse.Namespace) -> int:
    """
    Build a run with the requested backend and print it as indented JSON.

    :param arguments: Parsed command-line interface arguments; ``backend``,
        ``config`` and ``recipe_name`` are read.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    selected_backend = get_backend(arguments.backend)
    backend_config = _parse_config_pairs(arguments.config)
    built_run = selected_backend.build_run(
        corpus,
        recipe_name=arguments.recipe_name,
        config=backend_config,
    )
    print(built_run.model_dump_json(indent=2))
    return 0
280
+
281
+
282
def cmd_query(arguments: argparse.Namespace) -> int:
    """
    Run a query against a stored run and print the result as indented JSON.

    The run is ``--run`` when given, otherwise the corpus's ``latest_run_id``.
    The backend is always the one recorded in the run's recipe; ``--backend`` is
    only checked against it. The query text is ``--query`` when given, otherwise
    everything read from standard input.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run can be determined or ``--backend`` disagrees with the run.
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")
    run = corpus.load_run(run_id)

    requested_backend = arguments.backend
    if requested_backend and requested_backend != run.recipe.backend_id:
        raise ValueError(
            f"Backend mismatch: run uses {run.recipe.backend_id!r} but {requested_backend!r} was requested"
        )
    backend = get_backend(run.recipe.backend_id)

    if arguments.query is not None:
        query_text = arguments.query
    else:
        query_text = sys.stdin.read()

    budget = _budget_from_args(arguments)
    outcome = backend.query(corpus, run=run, query_text=query_text, budget=budget)
    print(outcome.model_dump_json(indent=2))
    return 0
311
+
312
+
313
def cmd_eval(arguments: argparse.Namespace) -> int:
    """
    Evaluate a stored run against a dataset file and print the result as indented JSON.

    The run is ``--run`` when given, otherwise the corpus's ``latest_run_id``.

    :param arguments: Parsed command-line interface arguments; ``dataset`` is the dataset file path.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises ValueError: If no run can be determined.
    """

    corpus_ref = getattr(arguments, "corpus", None)
    if corpus_ref:
        corpus = Corpus.open(corpus_ref)
    else:
        # No explicit corpus: let the corpus be located starting from the working directory.
        corpus = Corpus.find(Path.cwd())

    run_id = arguments.run or corpus.latest_run_id
    if not run_id:
        raise ValueError("No run identifier provided and no latest run is recorded for this corpus")

    evaluation_inputs = {
        "corpus": corpus,
        "run": corpus.load_run(run_id),
        "dataset": load_dataset(Path(arguments.dataset)),
        "budget": _budget_from_args(arguments),
    }
    evaluation = evaluate_run(**evaluation_inputs)
    print(evaluation.model_dump_json(indent=2))
    return 0
337
+
338
+
339
def build_parser() -> argparse.ArgumentParser:
    """
    Assemble the ``biblicus`` argument parser with all of its subcommands.

    ``--corpus`` is registered on the top-level parser (default ``None``) and again
    on every subcommand except ``init``, so it may appear before or after the
    subcommand name. Each subcommand stores its handler in the ``func`` attribute
    of the parsed namespace.

    :return: Argument parser instance.
    :rtype: argparse.ArgumentParser
    """

    def _add_budget_options(target: argparse.ArgumentParser) -> None:
        # Budget limits shared by the query and eval subcommands.
        target.add_argument("--max-total-items", type=int, default=5)
        target.add_argument("--max-total-characters", type=int, default=2000)
        target.add_argument("--max-items-per-source", type=int, default=5)

    root_parser = argparse.ArgumentParser(
        prog="biblicus",
        description="Biblicus command-line interface (minimum viable product)",
    )
    root_corpus_help = (
        "Corpus path or uniform resource identifier (defaults to searching from the current working directory "
        "upward). "
        "Can be provided before or after the subcommand."
    )
    root_parser.add_argument("--corpus", type=str, default=None, dest="corpus", help=root_corpus_help)
    subcommands = root_parser.add_subparsers(dest="cmd", required=True)

    # init
    init_parser = subcommands.add_parser("init", help="Initialize a new corpus at PATH.")
    init_parser.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
    init_parser.add_argument("--force", action="store_true", help="Overwrite existing config if present.")
    init_parser.set_defaults(func=cmd_init)

    # ingest
    ingest_parser = subcommands.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
    _add_common_corpus_arg(ingest_parser)
    ingest_parser.add_argument("files", nargs="*", help="File paths to ingest.")
    ingest_parser.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
    ingest_parser.add_argument("--stdin", action="store_true", help="Read text to ingest from standard input.")
    ingest_parser.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
    ingest_parser.add_argument("--tags", default=None, help="Comma-separated tags.")
    ingest_parser.add_argument("--tag", action="append", help="Repeatable tag.")
    ingest_parser.set_defaults(func=cmd_ingest)

    # list
    list_parser = subcommands.add_parser("list", help="List recently ingested items.")
    _add_common_corpus_arg(list_parser)
    list_parser.add_argument("--limit", type=int, default=50)
    list_parser.set_defaults(func=cmd_list)

    # show
    show_parser = subcommands.add_parser("show", help="Show metadata for an item identifier.")
    _add_common_corpus_arg(show_parser)
    show_parser.add_argument("id", help="Item identifier (universally unique identifier).")
    show_parser.set_defaults(func=cmd_show)

    # reindex
    reindex_parser = subcommands.add_parser(
        "reindex",
        help="Rebuild/refresh the corpus catalog from the on-disk corpus.",
    )
    _add_common_corpus_arg(reindex_parser)
    reindex_parser.set_defaults(func=cmd_reindex)

    # purge
    purge_parser = subcommands.add_parser(
        "purge",
        help="Delete all items and derived files (requires confirmation).",
    )
    _add_common_corpus_arg(purge_parser)
    purge_parser.add_argument(
        "--confirm",
        default=None,
        help="Type the corpus name (directory basename) to confirm purging.",
    )
    purge_parser.set_defaults(func=cmd_purge)

    # build
    build_subparser = subcommands.add_parser("build", help="Build a retrieval backend run for the corpus.")
    _add_common_corpus_arg(build_subparser)
    build_subparser.add_argument(
        "--backend",
        required=True,
        help="Backend identifier (for example, scan, sqlite-full-text-search).",
    )
    build_subparser.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
    build_subparser.add_argument(
        "--config",
        action="append",
        default=None,
        help="Backend config as key=value (repeatable).",
    )
    build_subparser.set_defaults(func=cmd_build)

    # query
    query_parser = subcommands.add_parser("query", help="Run a retrieval query.")
    _add_common_corpus_arg(query_parser)
    query_parser.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
    query_parser.add_argument("--backend", default=None, help="Validate backend identifier.")
    query_parser.add_argument("--query", default=None, help="Query text (defaults to standard input).")
    _add_budget_options(query_parser)
    query_parser.set_defaults(func=cmd_query)

    # eval
    eval_parser = subcommands.add_parser("eval", help="Evaluate a run against a dataset.")
    _add_common_corpus_arg(eval_parser)
    eval_parser.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
    eval_parser.add_argument(
        "--dataset",
        required=True,
        help="Path to dataset JavaScript Object Notation file.",
    )
    _add_budget_options(eval_parser)
    eval_parser.set_defaults(func=cmd_eval)

    return root_parser
442
+
443
+
444
def main(argument_list: Optional[List[str]] = None) -> int:
    """
    Entry point for the Biblicus command-line interface.

    Parses the arguments, dispatches to the selected subcommand handler and turns
    the listed, anticipated exception types into a message on standard error plus
    exit code ``2``. Any other exception propagates unchanged.

    :param argument_list: Optional command-line interface arguments (``None`` lets
        argparse read them from ``sys.argv``).
    :type argument_list: list[str] or None
    :return: Exit code.
    :rtype: int
    """

    parser = build_parser()
    arguments = parser.parse_args(argument_list)
    try:
        return int(arguments.func(arguments))
    except (
        FileNotFoundError,
        FileExistsError,
        KeyError,
        ValueError,
        NotImplementedError,
        ValidationError,
    ) as exception:
        exception_args = getattr(exception, "args", None)
        if isinstance(exception, OSError) or not exception_args:
            # An OSError raised by the operating system carries (errno, strerror)
            # in ``args``, so ``args[0]`` would print only the number; str() gives
            # the full message (and is unchanged for a single-argument OSError).
            message = str(exception)
        else:
            # Using args[0] avoids the quoted repr that str(KeyError) produces.
            message = exception_args[0]
        print(str(message), file=sys.stderr)
        return 2
biblicus/constants.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Shared constants for Biblicus.
3
+ """
4
+
5
# Schema version numbers.
SCHEMA_VERSION = 2
DATASET_SCHEMA_VERSION = 1

# Directory names.
CORPUS_DIR_NAME = ".biblicus"
DEFAULT_RAW_DIR = "raw"
RUNS_DIR_NAME = "runs"

# File-name suffix.
SIDECAR_SUFFIX = ".biblicus.yml"