biblicus 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
biblicus/__init__.py CHANGED
@@ -25,4 +25,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
biblicus/cli.py CHANGED
@@ -14,10 +14,11 @@ from pydantic import ValidationError
 
 from .backends import get_backend
 from .corpus import Corpus
+from .crawl import CrawlRequest, crawl_into_corpus
 from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
 from .extraction import build_extraction_run
-from .models import QueryBudget
+from .models import QueryBudget, parse_extraction_run_reference
 from .uris import corpus_ref_to_path
 
 
@@ -327,7 +328,7 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
 
 
-def cmd_extract(arguments: argparse.Namespace) -> int:
+def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
     Build a text extraction run for the corpus using a pipeline of extractors.
 
@@ -359,6 +360,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_extract_list(arguments: argparse.Namespace) -> int:
+    """
+    List extraction runs stored under the corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    return 0
+
+
+def cmd_extract_show(arguments: argparse.Namespace) -> int:
+    """
+    Show an extraction run manifest.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    reference = parse_extraction_run_reference(arguments.run)
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=reference.extractor_id, run_id=reference.run_id
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
+
+
+def cmd_extract_delete(arguments: argparse.Namespace) -> int:
+    """
+    Delete an extraction run directory and its derived artifacts.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.confirm != arguments.run:
+        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
+    reference = parse_extraction_run_reference(arguments.run)
+    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
+    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -414,6 +478,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_crawl(arguments: argparse.Namespace) -> int:
+    """
+    Crawl a website prefix into a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    request = CrawlRequest(
+        root_url=arguments.root_url,
+        allowed_prefix=arguments.allowed_prefix,
+        max_items=arguments.max_items,
+        tags=tags,
+    )
+    result = crawl_into_corpus(corpus=corpus, request=request)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -511,16 +601,53 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)
 
-    p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
-    _add_common_corpus_arg(p_extract)
-    p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
-    p_extract.add_argument(
+    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
+    extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
+
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
+    _add_common_corpus_arg(p_extract_build)
+    p_extract_build.add_argument(
+        "--recipe-name", default="default", help="Human-readable recipe name."
+    )
+    p_extract_build.add_argument(
         "--step",
         action="append",
        default=None,
        help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
    )
-    p_extract.set_defaults(func=cmd_extract)
+    p_extract_build.set_defaults(func=cmd_extract_build)
+
+    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
+    _add_common_corpus_arg(p_extract_list)
+    p_extract_list.add_argument(
+        "--extractor-id",
+        default=None,
+        help="Optional extractor identifier filter (for example: pipeline).",
+    )
+    p_extract_list.set_defaults(func=cmd_extract_list)
+
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
+    _add_common_corpus_arg(p_extract_show)
+    p_extract_show.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_show.set_defaults(func=cmd_extract_show)
+
+    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
+    _add_common_corpus_arg(p_extract_delete)
+    p_extract_delete.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_delete.add_argument(
+        "--confirm",
+        required=True,
+        help="Type the exact extractor_id:run_id to confirm deletion.",
+    )
+    p_extract_delete.set_defaults(func=cmd_extract_delete)
 
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
@@ -545,6 +672,19 @@ def build_parser() -> argparse.ArgumentParser:
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)
 
+    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
+    _add_common_corpus_arg(p_crawl)
+    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--allowed-prefix",
+        required=True,
+        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
+    )
+    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
+    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
+    p_crawl.set_defaults(func=cmd_crawl)
+
     return parser
 
 
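Each subcommand above is wired to its handler through `set_defaults(func=...)`. A minimal sketch of how the new `extract` group dispatches, assuming an existing corpus at the placeholder path `corpora/example`:

```
from biblicus.cli import build_parser

# "extract" is now a command group: "build" replaces the old bare "extract",
# and "list", "show", and "delete" are new subcommands.
parser = build_parser()
arguments = parser.parse_args(
    ["extract", "list", "--corpus", "corpora/example", "--extractor-id", "pipeline"]
)
exit_code = arguments.func(arguments)  # routes to cmd_extract_list
```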
biblicus/corpus.py CHANGED
@@ -27,7 +27,14 @@ from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
 from .ignore import load_corpus_ignore_spec
-from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
+from .models import (
+    CatalogItem,
+    CorpusCatalog,
+    CorpusConfig,
+    ExtractionRunListEntry,
+    IngestResult,
+    RetrievalRun,
+)
 from .sources import load_source
 from .time import utc_now_iso
 from .uris import corpus_ref_to_path, normalize_corpus_uri
@@ -567,6 +574,96 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")
 
+    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
+        """
+        Load an extraction run manifest from the corpus.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Parsed extraction run manifest.
+        :rtype: biblicus.extraction.ExtractionRunManifest
+        :raises FileNotFoundError: If the manifest file does not exist.
+        :raises ValueError: If the manifest data is invalid.
+        """
+        from .extraction import ExtractionRunManifest
+
+        manifest_path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
+        )
+        if not manifest_path.is_file():
+            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        return ExtractionRunManifest.model_validate(data)
+
+    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+        """
+        List extraction runs stored under the corpus.
+
+        :param extractor_id: Optional extractor identifier filter.
+        :type extractor_id: str or None
+        :return: Summary list entries for each run.
+        :rtype: list[biblicus.models.ExtractionRunListEntry]
+        """
+        runs_root = self.extraction_runs_dir
+        if not runs_root.is_dir():
+            return []
+
+        extractor_dirs: List[Path]
+        if extractor_id is None:
+            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
+        else:
+            extractor_path = runs_root / extractor_id
+            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
+
+        entries: List[ExtractionRunListEntry] = []
+        for extractor_dir in extractor_dirs:
+            for run_dir in sorted(extractor_dir.iterdir()):
+                if not run_dir.is_dir():
+                    continue
+                manifest_path = run_dir / "manifest.json"
+                if not manifest_path.is_file():
+                    continue
+                try:
+                    manifest = self.load_extraction_run_manifest(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                    )
+                except (FileNotFoundError, ValueError):
+                    continue
+                entries.append(
+                    ExtractionRunListEntry(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                        recipe_id=manifest.recipe.recipe_id,
+                        recipe_name=manifest.recipe.name,
+                        catalog_generated_at=manifest.catalog_generated_at,
+                        created_at=manifest.created_at,
+                        stats=dict(manifest.stats),
+                    )
+                )
+
+        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        return entries
+
+    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
+        """
+        Delete an extraction run directory and its derived artifacts.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: None.
+        :rtype: None
+        :raises FileNotFoundError: If the extraction run directory does not exist.
+        """
+        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+        if not run_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
+        shutil.rmtree(run_dir)
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
@@ -1185,6 +1282,78 @@ class Corpus:
             raise KeyError(f"Unknown item identifier: {item_id}")
         return item
 
+    def create_crawl_id(self) -> str:
+        """
+        Create a new crawl identifier.
+
+        :return: Crawl identifier.
+        :rtype: str
+        """
+        return str(uuid.uuid4())
+
+    def ingest_crawled_payload(
+        self,
+        *,
+        crawl_id: str,
+        relative_path: str,
+        data: bytes,
+        filename: str,
+        media_type: str,
+        source_uri: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Ingest a crawled payload under a crawl import namespace.
+
+        :param crawl_id: Crawl identifier used to group crawled artifacts.
+        :type crawl_id: str
+        :param relative_path: Relative path within the crawl prefix.
+        :type relative_path: str
+        :param data: Raw payload bytes.
+        :type data: bytes
+        :param filename: Suggested filename from the payload metadata.
+        :type filename: str
+        :param media_type: Internet Assigned Numbers Authority media type.
+        :type media_type: str
+        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
+        :type source_uri: str
+        :param tags: Tags to attach to the stored item.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        """
+        _ = filename
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+        destination_path.write_bytes(data)
+
+        sha256_digest = _sha256_bytes(data)
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata({}, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(data),
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
     def reindex(self) -> Dict[str, int]:
         """
         Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
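The new `Corpus` methods mirror the command-line verbs. A short sketch of the Python-side flow, assuming a corpus directory that already contains at least one pipeline extraction run (`corpora/example` is a placeholder path):

```
from biblicus.corpus import Corpus

corpus = Corpus.open("corpora/example")

# Summaries for every stored run, newest first; optionally filter by extractor.
entries = corpus.list_extraction_runs(extractor_id="pipeline")
for entry in entries:
    print(entry.extractor_id, entry.run_id, entry.created_at)

if entries:
    newest = entries[0]
    # Full manifest for a single run.
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=newest.extractor_id, run_id=newest.run_id
    )
    print(manifest.model_dump_json(indent=2))
    # Remove the run directory and its derived artifacts.
    corpus.delete_extraction_run(
        extractor_id=newest.extractor_id, run_id=newest.run_id
    )
```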
biblicus/crawl.py ADDED
@@ -0,0 +1,186 @@
+"""
+Website crawl utilities for Biblicus corpora.
+"""
+
+from __future__ import annotations
+
+from collections import deque
+from html.parser import HTMLParser
+from typing import Deque, List, Optional, Set
+from urllib.parse import urldefrag, urljoin
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .ignore import load_corpus_ignore_spec
+from .sources import load_source
+
+
+class CrawlRequest(BaseModel):
+    """
+    Request describing a website crawl into a corpus.
+
+    :ivar root_url: Initial uniform resource locator to fetch.
+    :vartype root_url: str
+    :ivar allowed_prefix: Uniform resource locator prefix that limits which links are eligible for crawl.
+    :vartype allowed_prefix: str
+    :ivar max_items: Maximum number of items to store during the crawl.
+    :vartype max_items: int
+    :ivar tags: Tags to apply to stored items.
+    :vartype tags: list[str]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    root_url: str = Field(min_length=1)
+    allowed_prefix: str = Field(min_length=1)
+    max_items: int = Field(default=50, ge=1)
+    tags: List[str] = Field(default_factory=list)
+
+
+class CrawlResult(BaseModel):
+    """
+    Summary result for a crawl execution.
+
+    :ivar crawl_id: Crawl identifier used in the corpus raw import namespace.
+    :vartype crawl_id: str
+    :ivar discovered_items: Total number of distinct uniform resource locators discovered.
+    :vartype discovered_items: int
+    :ivar fetched_items: Number of eligible items fetched over hypertext transfer protocol.
+    :vartype fetched_items: int
+    :ivar stored_items: Number of items stored into the corpus.
+    :vartype stored_items: int
+    :ivar skipped_outside_prefix_items: Number of discovered items outside the allowed prefix.
+    :vartype skipped_outside_prefix_items: int
+    :ivar skipped_ignored_items: Number of eligible items skipped due to corpus ignore rules.
+    :vartype skipped_ignored_items: int
+    :ivar errored_items: Number of eligible items that failed to fetch or store.
+    :vartype errored_items: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    crawl_id: str
+    discovered_items: int = Field(default=0, ge=0)
+    fetched_items: int = Field(default=0, ge=0)
+    stored_items: int = Field(default=0, ge=0)
+    skipped_outside_prefix_items: int = Field(default=0, ge=0)
+    skipped_ignored_items: int = Field(default=0, ge=0)
+    errored_items: int = Field(default=0, ge=0)
+
+
+class _LinkExtractor(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: List[str] = []
+
+    def handle_starttag(self, tag: str, attrs):  # type: ignore[no-untyped-def]
+        _ = tag
+        for key, value in attrs:
+            if key in {"href", "src"} and isinstance(value, str) and value.strip():
+                self.links.append(value.strip())
+
+
+def _normalize_crawl_url(candidate: str, *, base_url: str) -> Optional[str]:
+    joined = urljoin(base_url, candidate)
+    joined, _fragment = urldefrag(joined)
+    joined = joined.strip()
+    if joined.startswith(("mailto:", "javascript:")):
+        return None
+    return joined
+
+
+def _crawl_relative_path(url: str, *, allowed_prefix: str) -> str:
+    relative = url[len(allowed_prefix) :].lstrip("/")
+    if not relative or relative.endswith("/"):
+        relative = relative.rstrip("/") + "/index.html" if relative else "index.html"
+    return relative
+
+
+def _should_parse_links(media_type: str) -> bool:
+    return media_type.startswith("text/html")
+
+
+def _discover_links(html_text: str, *, base_url: str) -> List[str]:
+    parser = _LinkExtractor()
+    parser.feed(html_text)
+    discovered: List[str] = []
+    for raw in parser.links:
+        normalized = _normalize_crawl_url(raw, base_url=base_url)
+        if normalized is not None:
+            discovered.append(normalized)
+    return discovered
+
+
+def crawl_into_corpus(*, corpus, request: CrawlRequest) -> CrawlResult:  # type: ignore[no-untyped-def]
+    """
+    Crawl a website prefix into a corpus.
+
+    :param corpus: Target corpus to receive crawled items.
+    :type corpus: biblicus.corpus.Corpus
+    :param request: Crawl request describing limits and allowed prefix.
+    :type request: CrawlRequest
+    :return: Crawl result summary.
+    :rtype: CrawlResult
+    """
+    ignore_spec = load_corpus_ignore_spec(corpus.root)
+    allowed_prefix = request.allowed_prefix
+    root_url = request.root_url
+
+    crawl_id = corpus.create_crawl_id()
+
+    queue: Deque[str] = deque([root_url])
+    seen: Set[str] = set()
+    stored_count = 0
+    fetched_count = 0
+    skipped_outside_prefix_count = 0
+    skipped_ignored_count = 0
+    errored_count = 0
+    discovered_urls: Set[str] = set()
+
+    while queue and stored_count < request.max_items:
+        url = queue.popleft()
+        if url in seen:
+            continue
+        seen.add(url)
+        discovered_urls.add(url)
+
+        if not url.startswith(allowed_prefix):
+            skipped_outside_prefix_count += 1
+            continue
+
+        relative_path = _crawl_relative_path(url, allowed_prefix=allowed_prefix)
+        if ignore_spec.matches(relative_path):
+            skipped_ignored_count += 1
+            continue
+
+        try:
+            payload = load_source(url)
+            fetched_count += 1
+            corpus.ingest_crawled_payload(
+                crawl_id=crawl_id,
+                relative_path=relative_path,
+                data=payload.data,
+                filename=payload.filename,
+                media_type=payload.media_type,
+                source_uri=payload.source_uri,
+                tags=request.tags,
+            )
+            stored_count += 1
+        except Exception:
+            errored_count += 1
+            continue
+
+        if _should_parse_links(payload.media_type):
+            text = payload.data.decode("utf-8", errors="replace")
+            for discovered in _discover_links(text, base_url=url):
+                queue.append(discovered)
+
+    return CrawlResult(
+        crawl_id=crawl_id,
+        discovered_items=len(discovered_urls),
+        fetched_items=fetched_count,
+        stored_items=stored_count,
+        skipped_outside_prefix_items=skipped_outside_prefix_count,
+        skipped_ignored_items=skipped_ignored_count,
+        errored_items=errored_count,
+    )
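The crawler is also usable directly from Python; the command-line `crawl` handler shown earlier does essentially this. A sketch with placeholder web addresses and a placeholder corpus path:

```
from biblicus.corpus import Corpus
from biblicus.crawl import CrawlRequest, crawl_into_corpus

corpus = Corpus.open("corpora/example")  # placeholder corpus path
request = CrawlRequest(
    root_url="https://example.com/docs/index.html",
    allowed_prefix="https://example.com/docs/",
    max_items=50,
    tags=["crawled"],
)
# Breadth-first fetch: only links under allowed_prefix are stored, and the
# crawl stops once max_items items have been written into the corpus.
result = crawl_into_corpus(corpus=corpus, request=request)
print(result.model_dump_json(indent=2))
```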
biblicus/extraction.py CHANGED
@@ -7,7 +7,6 @@ from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-from uuid import uuid4
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -196,8 +195,9 @@ def create_extraction_run_manifest(
     :rtype: ExtractionRunManifest
     """
     catalog = corpus.load_catalog()
+    run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}")
     return ExtractionRunManifest(
-        run_id=str(uuid4()),
+        run_id=run_id,
         recipe=recipe,
         corpus_uri=corpus.uri,
         catalog_generated_at=catalog.generated_at,
@@ -341,6 +341,8 @@ def build_extraction_run(
     )
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
+    if run_dir.exists():
+        return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
     run_dir.mkdir(parents=True, exist_ok=False)
 
     catalog = corpus.load_catalog()
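The run identifier is now derived from the recipe identifier and the catalog timestamp rather than a random UUID, so rebuilding the same recipe against an unchanged catalog resolves to the same run directory and `build_extraction_run` can return the existing manifest instead of rebuilding. A sketch of that property, assuming an existing corpus and recipe object as passed to `create_extraction_run_manifest`:

```
# Both manifests get run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}"),
# so the identifier only changes when the recipe or the catalog changes.
manifest_a = create_extraction_run_manifest(corpus, recipe=recipe)
manifest_b = create_extraction_run_manifest(corpus, recipe=recipe)
assert manifest_a.run_id == manifest_b.run_id
```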
biblicus/models.py CHANGED
@@ -189,6 +189,37 @@ def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
     return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
 
 
+class ExtractionRunListEntry(BaseModel):
+    """
+    Summary entry for an extraction run stored in a corpus.
+
+    :ivar extractor_id: Extractor plugin identifier.
+    :vartype extractor_id: str
+    :ivar run_id: Extraction run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Deterministic recipe identifier.
+    :vartype recipe_id: str
+    :ivar recipe_name: Human-readable recipe name.
+    :vartype recipe_name: str
+    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :vartype catalog_generated_at: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :vartype created_at: str
+    :ivar stats: Run statistics.
+    :vartype stats: dict[str, object]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    extractor_id: str = Field(min_length=1)
+    run_id: str = Field(min_length=1)
+    recipe_id: str = Field(min_length=1)
+    recipe_name: str = Field(min_length=1)
+    catalog_generated_at: str = Field(min_length=1)
+    created_at: str = Field(min_length=1)
+    stats: Dict[str, object] = Field(default_factory=dict)
+
+
 class QueryBudget(BaseModel):
     """
     Evidence selection budget for retrieval.
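`ExtractionRunListEntry` is the row type that `Corpus.list_extraction_runs` returns and that `biblicus extract list` prints as JSON. A sketch of the shape with placeholder values, alongside the reference parser used by the `show` and `delete` subcommands:

```
from biblicus.models import ExtractionRunListEntry, parse_extraction_run_reference

entry = ExtractionRunListEntry(
    extractor_id="pipeline",
    run_id="<run id>",        # placeholder
    recipe_id="<recipe id>",  # placeholder
    recipe_name="default",
    catalog_generated_at="2024-01-01T00:00:00.000000+00:00",  # placeholder timestamp
    created_at="2024-01-01T00:00:01.000000+00:00",            # placeholder timestamp
    stats={"items": 2},
)
print(entry.model_dump_json(indent=2))

# Splits the "extractor_id:run_id" form accepted by --run and --confirm.
reference = parse_extraction_run_reference("pipeline:<run id>")
```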
biblicus/time.py CHANGED
@@ -14,4 +14,4 @@ def utc_now_iso() -> str:
     :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
     :rtype: str
     """
-    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
+    return datetime.now(timezone.utc).isoformat(timespec="microseconds")
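The timestamp helper now keeps microseconds instead of truncating them, which plausibly lets `created_at` sorting distinguish runs created within the same second. The difference, using only standard-library behavior:

```
from datetime import datetime, timezone

now = datetime.now(timezone.utc)
# Before (0.3.0): microseconds were dropped from the timestamp.
print(now.replace(microsecond=0).isoformat())   # e.g. 2024-01-01T00:00:00+00:00
# After (0.4.0): microseconds are always included, even when zero.
print(now.isoformat(timespec="microseconds"))   # e.g. 2024-01-01T00:00:00.123456+00:00
```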
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.3.0
+Version: 0.4.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -77,10 +77,7 @@ flowchart LR
     direction LR
     LegendArtifact[Stored artifact or evidence]
     LegendStep[Step]
-    LegendStable[Stable region]
-    LegendPluggable[Pluggable region]
     LegendArtifact --- LegendStep
-    LegendStable --- LegendPluggable
   end
 
   subgraph Main[" "]
@@ -93,14 +90,14 @@ flowchart LR
     Raw --> Catalog[Catalog file]
   end
 
-  subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
+  subgraph PluggableExtractionPipeline[Pluggable: extraction pipeline]
     direction TB
     Catalog --> Extract[Extract pipeline]
     Extract --> ExtractedText[Extracted text artifacts]
     ExtractedText --> ExtractionRun[Extraction run manifest]
   end
 
-  subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
+  subgraph PluggableRetrievalBackend[Pluggable: retrieval backend]
     direction LR
 
     subgraph BackendIngestionIndexing[Ingestion and indexing]
@@ -154,8 +151,6 @@ flowchart LR
   style Main fill:#ffffff,stroke:#ffffff,color:#111111
   style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
   style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
-  style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
-  style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
 ```
 
 ## Practical value
@@ -168,6 +163,7 @@ flowchart LR
 
 - Initialize a corpus folder.
 - Ingest items from file paths, web addresses, or text input.
+- Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
 - Run extraction when you want derived text artifacts from non-text sources.
 - Reindex to refresh the catalog after edits.
 - Build a retrieval run with a backend.
@@ -205,11 +201,22 @@ biblicus init corpora/example
 biblicus ingest --corpus corpora/example notes/example.txt
 echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
 biblicus list --corpus corpora/example
-biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
+biblicus extract build --corpus corpora/example --step pass-through-text --step metadata-text
+biblicus extract list --corpus corpora/example
 biblicus build --corpus corpora/example --backend scan
 biblicus query --corpus corpora/example --query "note"
 ```
 
+If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
+
+```
+biblicus crawl --corpus corpora/example \
+  --root-url https://example.com/docs/index.html \
+  --allowed-prefix https://example.com/docs/ \
+  --max-items 50 \
+  --tag crawled
+```
+
 ## Python usage
 
 From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
@@ -233,7 +240,7 @@ In an assistant system, retrieval usually produces context for a model call. Thi
 
 ## Learn more
 
-Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
+Full documentation is published on GitHub Pages: https://anthusai.github.io/Biblicus/
 
 The documents below are written to be read in order.
 
@@ -262,7 +269,16 @@ corpus/
   config.json
   catalog.json
   runs/
-    run-id.json
+    extraction/
+      pipeline/
+        <run id>/
+          manifest.json
+          text/
+            <item id>.txt
+    retrieval/
+      <backend id>/
+        <run id>/
+          manifest.json
 ```
 
 ## Retrieval backends
@@ -313,7 +329,7 @@ python3 -m pip install -e ".[dev]"
 Build the documentation:
 
 ```
-python3 -m sphinx -b html docs docs/_build
+python3 -m sphinx -b html docs docs/_build/html
 ```
 
 ## License
@@ -333,4 +349,4 @@ License terms are in `LICENSE`.
 
 [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
 [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
-[documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
+[documentation-badge]: https://img.shields.io/badge/docs-GitHub%20Pages-blue
@@ -1,20 +1,21 @@
-biblicus/__init__.py,sha256=1vPJokNgr7JcDO9eJ2SRR8VLkFG44ZaSACSaalogvYQ,432
+biblicus/__init__.py,sha256=6TFpzDiMJlFyBfVrHfS6xnGd8P7Zybj6DxpWkCJqyf4,432
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=k09mMToSawDC7TbetwtK0RItTLO84EOJCZQKDRA-b9Y,19229
+biblicus/cli.py,sha256=MtYTkJh0lVOTrbwY3u6V8ti5WZUsb96fLatqh23cUYg,24289
 biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
-biblicus/corpus.py,sha256=oBg5nbDoDBkkXaW180ixtvU9Yh0y9nOiZDEMKomtrVU,47688
+biblicus/corpus.py,sha256=gF1RNl6fdz7wplzpHEIkEBkhYxHgKTKguBR_kD9IgUw,54109
+biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
 biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
 biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
-biblicus/extraction.py,sha256=MYaHkhj0NWBKNcaohnLvNiHLwyps9JyZGaTxX5gHR-A,19281
+biblicus/extraction.py,sha256=VEjBjIpaBboftGgEcpDj7z7um41e5uDZpP_7acQg7fw,19448
 biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
 biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
 biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
 biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
 biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
-biblicus/models.py,sha256=fdpPRtWmtirjEKpOPL_6ZVRY0vpA2WRqMwNrOqPaauM,14204
+biblicus/models.py,sha256=6SWQ2Czg9O3zjuam8a4m8V3LlEgcGLbEctYDB6F1rRs,15317
 biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
-biblicus/time.py,sha256=NEHkJLJ3RH1PdJVAWMYbNCBnCb6UW9DVBLo7Qh1zO88,485
+biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
 biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
 biblicus/user_config.py,sha256=DqO08yLn82DhTiFpmIyyLj_J0nMbrtE8xieTj2Cgd6A,4287
 biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
@@ -36,9 +37,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
 biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.3.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.3.0.dist-info/METADATA,sha256=MHE8tAh9jGiMwk5X9jPSnhRFB6uAZa3T8jo_c1zrIZM,13202
-biblicus-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.3.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.3.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.3.0.dist-info/RECORD,,
+biblicus-0.4.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.4.0.dist-info/METADATA,sha256=JMZjfIOMEmbWFHgzjUHsQDUUg11jdxudtJdRK8Iu29U,13586
+biblicus-0.4.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.4.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.4.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.4.0.dist-info/RECORD,,