biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -25,4 +25,4 @@ __all__ = [
25
25
  "RetrievalRun",
26
26
  ]
27
27
 
28
- __version__ = "0.1.1"
28
+ __version__ = "0.2.0"
biblicus/backends/scan.py CHANGED
@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Optional, Tuple
9
9
  from pydantic import BaseModel, ConfigDict, Field
10
10
 
11
11
  from ..corpus import Corpus
12
+ from ..extraction import ExtractionRunReference, parse_extraction_run_reference
12
13
  from ..frontmatter import parse_front_matter
13
14
  from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
14
15
  from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
@@ -21,11 +22,14 @@ class ScanRecipeConfig(BaseModel):
21
22
 
22
23
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
23
24
  :vartype snippet_characters: int
25
+ :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
26
+ :vartype extraction_run: str or None
24
27
  """
25
28
 
26
29
  model_config = ConfigDict(extra="forbid")
27
30
 
28
31
  snippet_characters: int = Field(default=400, ge=1)
32
+ extraction_run: Optional[str] = None
29
33
 
30
34
 
31
35
  class ScanBackend:
@@ -59,7 +63,7 @@ class ScanBackend:
59
63
  name=recipe_name,
60
64
  config=recipe_config.model_dump(),
61
65
  )
62
- stats = {"items": len(catalog.items), "text_items": _count_text_items(catalog.items.values())}
66
+ stats = {"items": len(catalog.items), "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config)}
63
67
  run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
64
68
  corpus.write_run(run)
65
69
  return run
@@ -89,12 +93,14 @@ class ScanBackend:
89
93
 
90
94
  recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
91
95
  catalog = corpus.load_catalog()
96
+ extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
92
97
  query_tokens = _tokenize_query(query_text)
93
98
  scored_candidates = _score_items(
94
99
  corpus,
95
100
  catalog.items.values(),
96
101
  query_tokens,
97
102
  recipe_config.snippet_characters,
103
+ extraction_reference=extraction_reference,
98
104
  )
99
105
  sorted_candidates = sorted(
100
106
  scored_candidates,
@@ -124,18 +130,60 @@ class ScanBackend:
124
130
  )
125
131
 
126
132
 
127
- def _count_text_items(items: Iterable[object]) -> int:
133
+ def _resolve_extraction_reference(corpus: Corpus, recipe_config: ScanRecipeConfig) -> Optional[ExtractionRunReference]:
134
+ """
135
+ Resolve an extraction run reference from a recipe config.
136
+
137
+ :param corpus: Corpus associated with the recipe.
138
+ :type corpus: Corpus
139
+ :param recipe_config: Parsed scan recipe configuration.
140
+ :type recipe_config: ScanRecipeConfig
141
+ :return: Parsed extraction reference or None.
142
+ :rtype: ExtractionRunReference or None
143
+ :raises FileNotFoundError: If an extraction run is referenced but not present.
144
+ """
145
+
146
+ if not recipe_config.extraction_run:
147
+ return None
148
+ extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
149
+ run_dir = corpus.extraction_run_dir(
150
+ extractor_id=extraction_reference.extractor_id,
151
+ run_id=extraction_reference.run_id,
152
+ )
153
+ if not run_dir.is_dir():
154
+ raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
155
+ return extraction_reference
156
+
157
+
158
+ def _count_text_items(corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig) -> int:
128
159
  """
129
160
  Count catalog items that represent text content.
130
161
 
162
+ When an extraction run is configured, extracted artifacts are treated as text.
163
+
164
+ :param corpus: Corpus containing the items.
165
+ :type corpus: Corpus
131
166
  :param items: Catalog items to inspect.
132
167
  :type items: Iterable[object]
168
+ :param recipe_config: Parsed scan recipe configuration.
169
+ :type recipe_config: ScanRecipeConfig
133
170
  :return: Number of text items.
134
171
  :rtype: int
135
172
  """
136
173
 
137
174
  text_item_count = 0
175
+ extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
138
176
  for catalog_item in items:
177
+ item_id = str(getattr(catalog_item, "id", ""))
178
+ if extraction_reference and item_id:
179
+ extracted_text = corpus.read_extracted_text(
180
+ extractor_id=extraction_reference.extractor_id,
181
+ run_id=extraction_reference.run_id,
182
+ item_id=item_id,
183
+ )
184
+ if isinstance(extracted_text, str) and extracted_text.strip():
185
+ text_item_count += 1
186
+ continue
139
187
  media_type = getattr(catalog_item, "media_type", "")
140
188
  if media_type == "text/markdown" or str(media_type).startswith("text/"):
141
189
  text_item_count += 1
@@ -155,20 +203,40 @@ def _tokenize_query(query_text: str) -> List[str]:
155
203
  return [token for token in query_text.lower().split() if token]
156
204
 
157
205
 
158
- def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
206
+ def _load_text_from_item(
207
+ corpus: Corpus,
208
+ *,
209
+ item_id: str,
210
+ relpath: str,
211
+ media_type: str,
212
+ extraction_reference: Optional[ExtractionRunReference],
213
+ ) -> Optional[str]:
159
214
  """
160
215
  Load a text payload from a catalog item.
161
216
 
162
217
  :param corpus: Corpus containing the item.
163
218
  :type corpus: Corpus
219
+ :param item_id: Item identifier.
220
+ :type item_id: str
164
221
  :param relpath: Relative path to the stored content.
165
222
  :type relpath: str
166
223
  :param media_type: Media type for the stored content.
167
224
  :type media_type: str
225
+ :param extraction_reference: Optional extraction run reference.
226
+ :type extraction_reference: ExtractionRunReference or None
168
227
  :return: Text payload or None if not decodable as text.
169
228
  :rtype: str or None
170
229
  """
171
230
 
231
+ if extraction_reference:
232
+ extracted_text = corpus.read_extracted_text(
233
+ extractor_id=extraction_reference.extractor_id,
234
+ run_id=extraction_reference.run_id,
235
+ item_id=item_id,
236
+ )
237
+ if isinstance(extracted_text, str) and extracted_text.strip():
238
+ return extracted_text
239
+
172
240
  content_path = corpus.root / relpath
173
241
  raw_bytes = content_path.read_bytes()
174
242
  if media_type == "text/markdown":
@@ -240,6 +308,8 @@ def _score_items(
240
308
  items: Iterable[object],
241
309
  tokens: List[str],
242
310
  snippet_characters: int,
311
+ *,
312
+ extraction_reference: Optional[ExtractionRunReference],
243
313
  ) -> List[Evidence]:
244
314
  """
245
315
  Score catalog items by token frequency and return evidence candidates.
@@ -260,7 +330,14 @@ def _score_items(
260
330
  for catalog_item in items:
261
331
  media_type = getattr(catalog_item, "media_type", "")
262
332
  relpath = getattr(catalog_item, "relpath", "")
263
- item_text = _load_text_from_item(corpus, relpath, media_type)
333
+ item_id = str(getattr(catalog_item, "id", ""))
334
+ item_text = _load_text_from_item(
335
+ corpus,
336
+ item_id=item_id,
337
+ relpath=relpath,
338
+ media_type=str(media_type),
339
+ extraction_reference=extraction_reference,
340
+ )
264
341
  if item_text is None:
265
342
  continue
266
343
  lower_text = item_text.lower()
@@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field
12
12
 
13
13
  from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
14
14
  from ..corpus import Corpus
15
+ from ..extraction import ExtractionRunReference, parse_extraction_run_reference
15
16
  from ..frontmatter import parse_front_matter
16
17
  from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
17
18
  from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
@@ -28,6 +29,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
28
29
  :vartype chunk_overlap: int
29
30
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
30
31
  :vartype snippet_characters: int
32
+ :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
33
+ :vartype extraction_run: str or None
31
34
  """
32
35
 
33
36
  model_config = ConfigDict(extra="forbid")
@@ -35,6 +38,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
35
38
  chunk_size: int = Field(default=800, ge=1)
36
39
  chunk_overlap: int = Field(default=200, ge=0)
37
40
  snippet_characters: int = Field(default=400, ge=1)
41
+ extraction_run: Optional[str] = None
38
42
 
39
43
 
40
44
  class SqliteFullTextSearchBackend:
@@ -72,11 +76,13 @@ class SqliteFullTextSearchBackend:
72
76
  db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
73
77
  db_path = corpus.root / db_relpath
74
78
  corpus.runs_dir.mkdir(parents=True, exist_ok=True)
79
+ extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
75
80
  stats = _build_full_text_search_index(
76
81
  db_path=db_path,
77
82
  corpus=corpus,
78
83
  items=catalog.items.values(),
79
84
  recipe_config=recipe_config,
85
+ extraction_reference=extraction_reference,
80
86
  )
81
87
  run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
82
88
  corpus.write_run(run)
@@ -227,6 +233,7 @@ def _build_full_text_search_index(
227
233
  corpus: Corpus,
228
234
  items: Iterable[object],
229
235
  recipe_config: SqliteFullTextSearchRecipeConfig,
236
+ extraction_reference: Optional[ExtractionRunReference],
230
237
  ) -> Dict[str, int]:
231
238
  """
232
239
  Build a full-text search index from corpus items.
@@ -256,7 +263,13 @@ def _build_full_text_search_index(
256
263
  item_count += 1
257
264
  media_type = getattr(catalog_item, "media_type", "")
258
265
  relpath = getattr(catalog_item, "relpath", "")
259
- item_text = _load_text_from_item(corpus, relpath, media_type)
266
+ item_text = _load_text_from_item(
267
+ corpus,
268
+ item_id=str(getattr(catalog_item, "id", "")),
269
+ relpath=str(relpath),
270
+ media_type=str(media_type),
271
+ extraction_reference=extraction_reference,
272
+ )
260
273
  if item_text is None:
261
274
  continue
262
275
  text_item_count += 1
@@ -302,20 +315,40 @@ def _build_full_text_search_index(
302
315
  connection.close()
303
316
 
304
317
 
305
- def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
318
+ def _load_text_from_item(
319
+ corpus: Corpus,
320
+ *,
321
+ item_id: str,
322
+ relpath: str,
323
+ media_type: str,
324
+ extraction_reference: Optional[ExtractionRunReference],
325
+ ) -> Optional[str]:
306
326
  """
307
327
  Load text content from a catalog item.
308
328
 
309
329
  :param corpus: Corpus containing the content.
310
330
  :type corpus: Corpus
331
+ :param item_id: Item identifier.
332
+ :type item_id: str
311
333
  :param relpath: Relative path to the content.
312
334
  :type relpath: str
313
335
  :param media_type: Media type for the content.
314
336
  :type media_type: str
337
+ :param extraction_reference: Optional extraction run reference.
338
+ :type extraction_reference: ExtractionRunReference or None
315
339
  :return: Text payload or None if not text.
316
340
  :rtype: str or None
317
341
  """
318
342
 
343
+ if extraction_reference:
344
+ extracted_text = corpus.read_extracted_text(
345
+ extractor_id=extraction_reference.extractor_id,
346
+ run_id=extraction_reference.run_id,
347
+ item_id=item_id,
348
+ )
349
+ if isinstance(extracted_text, str) and extracted_text.strip():
350
+ return extracted_text
351
+
319
352
  content_path = corpus.root / relpath
320
353
  raw_bytes = content_path.read_bytes()
321
354
  if media_type == "text/markdown":
@@ -327,6 +360,34 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optio
327
360
  return None
328
361
 
329
362
 
363
+ def _resolve_extraction_reference(
364
+ corpus: Corpus,
365
+ recipe_config: SqliteFullTextSearchRecipeConfig,
366
+ ) -> Optional[ExtractionRunReference]:
367
+ """
368
+ Resolve an extraction run reference from a recipe config.
369
+
370
+ :param corpus: Corpus associated with the recipe.
371
+ :type corpus: Corpus
372
+ :param recipe_config: Parsed backend recipe configuration.
373
+ :type recipe_config: SqliteFullTextSearchRecipeConfig
374
+ :return: Parsed extraction reference or None.
375
+ :rtype: ExtractionRunReference or None
376
+ :raises FileNotFoundError: If an extraction run is referenced but not present.
377
+ """
378
+
379
+ if not recipe_config.extraction_run:
380
+ return None
381
+ extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
382
+ run_dir = corpus.extraction_run_dir(
383
+ extractor_id=extraction_reference.extractor_id,
384
+ run_id=extraction_reference.run_id,
385
+ )
386
+ if not run_dir.is_dir():
387
+ raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
388
+ return extraction_reference
389
+
390
+
330
391
  def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
331
392
  """
332
393
  Yield overlapping chunks of text for indexing.
biblicus/cli.py CHANGED
@@ -14,6 +14,7 @@ from pydantic import ValidationError
14
14
 
15
15
  from .backends import get_backend
16
16
  from .corpus import Corpus
17
+ from .extraction import build_extraction_run
17
18
  from .evaluation import evaluate_run, load_dataset
18
19
  from .models import QueryBudget
19
20
  from .uris import corpus_ref_to_path
@@ -187,6 +188,27 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
187
188
  return 0
188
189
 
189
190
 
191
+ def cmd_import_tree(arguments: argparse.Namespace) -> int:
192
+ """
193
+ Import a folder tree into a corpus.
194
+
195
+ :param arguments: Parsed command-line interface arguments.
196
+ :type arguments: argparse.Namespace
197
+ :return: Exit code.
198
+ :rtype: int
199
+ """
200
+
201
+ corpus = (
202
+ Corpus.open(arguments.corpus)
203
+ if getattr(arguments, "corpus", None)
204
+ else Corpus.find(Path.cwd())
205
+ )
206
+ tags = _parse_tags(arguments.tags, arguments.tag)
207
+ stats = corpus.import_tree(Path(arguments.path), tags=tags)
208
+ print(json.dumps(stats, indent=2, sort_keys=False))
209
+ return 0
210
+
211
+
190
212
  def cmd_purge(arguments: argparse.Namespace) -> int:
191
213
  """
192
214
  Purge all items and derived artifacts from a corpus.
@@ -240,6 +262,44 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
240
262
  return config
241
263
 
242
264
 
265
+ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
266
+ """
267
+ Parse a cascade step specification.
268
+
269
+ :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
270
+ :type raw_step: str
271
+ :return: Tuple of extractor_id and config mapping.
272
+ :rtype: tuple[str, dict[str, object]]
273
+ :raises ValueError: If the step spec is invalid.
274
+ """
275
+
276
+ raw_step = raw_step.strip()
277
+ if not raw_step:
278
+ raise ValueError("Step spec must be non-empty")
279
+ if ":" not in raw_step:
280
+ return raw_step, {}
281
+ extractor_id, raw_pairs = raw_step.split(":", 1)
282
+ extractor_id = extractor_id.strip()
283
+ if not extractor_id:
284
+ raise ValueError("Step spec must start with an extractor identifier")
285
+ config: Dict[str, object] = {}
286
+ raw_pairs = raw_pairs.strip()
287
+ if not raw_pairs:
288
+ return extractor_id, {}
289
+ for token in raw_pairs.split(","):
290
+ token = token.strip()
291
+ if not token:
292
+ continue
293
+ if "=" not in token:
294
+ raise ValueError(f"Step config values must be key=value (got {token!r})")
295
+ key, value = token.split("=", 1)
296
+ key = key.strip()
297
+ if not key:
298
+ raise ValueError("Step config keys must be non-empty")
299
+ config[key] = value
300
+ return extractor_id, config
301
+
302
+
243
303
  def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
244
304
  """
245
305
  Build a QueryBudget from command-line interface arguments.
@@ -279,6 +339,40 @@ def cmd_build(arguments: argparse.Namespace) -> int:
279
339
  return 0
280
340
 
281
341
 
342
+ def cmd_extract(arguments: argparse.Namespace) -> int:
343
+ """
344
+ Build a text extraction run for the corpus.
345
+
346
+ :param arguments: Parsed command-line interface arguments.
347
+ :type arguments: argparse.Namespace
348
+ :return: Exit code.
349
+ :rtype: int
350
+ """
351
+
352
+ corpus = (
353
+ Corpus.open(arguments.corpus)
354
+ if getattr(arguments, "corpus", None)
355
+ else Corpus.find(Path.cwd())
356
+ )
357
+ config = _parse_config_pairs(arguments.config)
358
+ if getattr(arguments, "step", None):
359
+ if arguments.extractor != "cascade":
360
+ raise ValueError("--step is only supported for the cascade extractor")
361
+ steps: List[Dict[str, object]] = []
362
+ for raw_step in arguments.step:
363
+ extractor_id, step_config = _parse_step_spec(raw_step)
364
+ steps.append({"extractor_id": extractor_id, "config": step_config})
365
+ config = {"steps": steps}
366
+ manifest = build_extraction_run(
367
+ corpus,
368
+ extractor_id=arguments.extractor,
369
+ recipe_name=arguments.recipe_name,
370
+ config=config,
371
+ )
372
+ print(manifest.model_dump_json(indent=2))
373
+ return 0
374
+
375
+
282
376
  def cmd_query(arguments: argparse.Namespace) -> int:
283
377
  """
284
378
  Execute a retrieval query.
@@ -390,6 +484,13 @@ def build_parser() -> argparse.ArgumentParser:
390
484
  _add_common_corpus_arg(p_reindex)
391
485
  p_reindex.set_defaults(func=cmd_reindex)
392
486
 
487
+ p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
488
+ _add_common_corpus_arg(p_import_tree)
489
+ p_import_tree.add_argument("path", help="Folder tree root to import.")
490
+ p_import_tree.add_argument("--tags", default=None, help="Comma-separated tags to apply to imported items.")
491
+ p_import_tree.add_argument("--tag", action="append", help="Repeatable tag to apply to imported items.")
492
+ p_import_tree.set_defaults(func=cmd_import_tree)
493
+
393
494
  p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
394
495
  _add_common_corpus_arg(p_purge)
395
496
  p_purge.add_argument(
@@ -415,6 +516,28 @@ def build_parser() -> argparse.ArgumentParser:
415
516
  )
416
517
  p_build.set_defaults(func=cmd_build)
417
518
 
519
+ p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
520
+ _add_common_corpus_arg(p_extract)
521
+ p_extract.add_argument(
522
+ "--extractor",
523
+ required=True,
524
+ help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
525
+ )
526
+ p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
527
+ p_extract.add_argument(
528
+ "--step",
529
+ action="append",
530
+ default=None,
531
+ help="Cascade step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
532
+ )
533
+ p_extract.add_argument(
534
+ "--config",
535
+ action="append",
536
+ default=None,
537
+ help="Extractor config as key=value (repeatable).",
538
+ )
539
+ p_extract.set_defaults(func=cmd_extract)
540
+
418
541
  p_query = sub.add_parser("query", help="Run a retrieval query.")
419
542
  _add_common_corpus_arg(p_query)
420
543
  p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
biblicus/constants.py CHANGED
@@ -8,3 +8,5 @@ CORPUS_DIR_NAME = ".biblicus"
8
8
  DEFAULT_RAW_DIR = "raw"
9
9
  SIDECAR_SUFFIX = ".biblicus.yml"
10
10
  RUNS_DIR_NAME = "runs"
11
+ EXTRACTION_RUNS_DIR_NAME = "extraction"
12
+ HOOK_LOGS_DIR_NAME = "hook_logs"