biblicus 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/cli.py +147 -7
- biblicus/corpus.py +170 -1
- biblicus/crawl.py +186 -0
- biblicus/extraction.py +4 -2
- biblicus/models.py +31 -0
- biblicus/time.py +1 -1
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +29 -13
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/RECORD +13 -12
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/cli.py
CHANGED
@@ -14,10 +14,11 @@ from pydantic import ValidationError
 
 from .backends import get_backend
 from .corpus import Corpus
+from .crawl import CrawlRequest, crawl_into_corpus
 from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
 from .extraction import build_extraction_run
-from .models import QueryBudget
+from .models import QueryBudget, parse_extraction_run_reference
 from .uris import corpus_ref_to_path
 
 
@@ -327,7 +328,7 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
 
 
-def
+def cmd_extract_build(arguments: argparse.Namespace) -> int:
     """
     Build a text extraction run for the corpus using a pipeline of extractors.
 
@@ -359,6 +360,69 @@ def cmd_extract(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_extract_list(arguments: argparse.Namespace) -> int:
+    """
+    List extraction runs stored under the corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    runs = corpus.list_extraction_runs(extractor_id=arguments.extractor_id)
+    print(json.dumps([entry.model_dump() for entry in runs], indent=2))
+    return 0
+
+
+def cmd_extract_show(arguments: argparse.Namespace) -> int:
+    """
+    Show an extraction run manifest.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    reference = parse_extraction_run_reference(arguments.run)
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=reference.extractor_id, run_id=reference.run_id
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
+
+
+def cmd_extract_delete(arguments: argparse.Namespace) -> int:
+    """
+    Delete an extraction run directory and its derived artifacts.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    if arguments.confirm != arguments.run:
+        raise ValueError("Refusing to delete extraction run without an exact --confirm match.")
+    reference = parse_extraction_run_reference(arguments.run)
+    corpus.delete_extraction_run(extractor_id=reference.extractor_id, run_id=reference.run_id)
+    print(json.dumps({"deleted": True, "run": arguments.run}, indent=2))
+    return 0
+
+
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -414,6 +478,32 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_crawl(arguments: argparse.Namespace) -> int:
+    """
+    Crawl a website prefix into a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    request = CrawlRequest(
+        root_url=arguments.root_url,
+        allowed_prefix=arguments.allowed_prefix,
+        max_items=arguments.max_items,
+        tags=tags,
+    )
+    result = crawl_into_corpus(corpus=corpus, request=request)
+    print(result.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -511,16 +601,53 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)
 
-    p_extract = sub.add_parser("extract", help="
-
-
-
+    p_extract = sub.add_parser("extract", help="Work with text extraction runs for the corpus.")
+    extract_sub = p_extract.add_subparsers(dest="extract_command", required=True)
+
+    p_extract_build = extract_sub.add_parser("build", help="Build a text extraction run.")
+    _add_common_corpus_arg(p_extract_build)
+    p_extract_build.add_argument(
+        "--recipe-name", default="default", help="Human-readable recipe name."
+    )
+    p_extract_build.add_argument(
         "--step",
         action="append",
         default=None,
         help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
     )
-
+    p_extract_build.set_defaults(func=cmd_extract_build)
+
+    p_extract_list = extract_sub.add_parser("list", help="List extraction runs.")
+    _add_common_corpus_arg(p_extract_list)
+    p_extract_list.add_argument(
+        "--extractor-id",
+        default=None,
+        help="Optional extractor identifier filter (for example: pipeline).",
+    )
+    p_extract_list.set_defaults(func=cmd_extract_list)
+
+    p_extract_show = extract_sub.add_parser("show", help="Show an extraction run manifest.")
+    _add_common_corpus_arg(p_extract_show)
+    p_extract_show.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_show.set_defaults(func=cmd_extract_show)
+
+    p_extract_delete = extract_sub.add_parser("delete", help="Delete an extraction run directory.")
+    _add_common_corpus_arg(p_extract_delete)
+    p_extract_delete.add_argument(
+        "--run",
+        required=True,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_extract_delete.add_argument(
+        "--confirm",
+        required=True,
+        help="Type the exact extractor_id:run_id to confirm deletion.",
+    )
+    p_extract_delete.set_defaults(func=cmd_extract_delete)
 
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
@@ -545,6 +672,19 @@ def build_parser() -> argparse.ArgumentParser:
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)
 
+    p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
+    _add_common_corpus_arg(p_crawl)
+    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--allowed-prefix",
+        required=True,
+        help="Uniform resource locator prefix that limits which links are eligible for crawl.",
+    )
+    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
+    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
+    p_crawl.set_defaults(func=cmd_crawl)
+
     return parser
 
 
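Taken together, the cli.py changes turn `extract` into a command group with `build`, `list`, `show`, and `delete` subcommands and add a top-level `crawl` command wired to cmd_crawl. A rough sketch of the new command surface, assuming a corpus at corpora/example; the run identifier is a placeholder you would copy from the `extract list` output:

```
biblicus extract build --corpus corpora/example --step pass-through-text --step metadata-text
biblicus extract list --corpus corpora/example --extractor-id pipeline
biblicus extract show --corpus corpora/example --run pipeline:<run id>
biblicus extract delete --corpus corpora/example --run pipeline:<run id> --confirm pipeline:<run id>
```

The `--confirm` value must repeat the exact extractor_id:run_id reference; cmd_extract_delete refuses to delete on any mismatch.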
biblicus/corpus.py
CHANGED
@@ -27,7 +27,14 @@ from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
 from .ignore import load_corpus_ignore_spec
-from .models import
+from .models import (
+    CatalogItem,
+    CorpusCatalog,
+    CorpusConfig,
+    ExtractionRunListEntry,
+    IngestResult,
+    RetrievalRun,
+)
 from .sources import load_source
 from .time import utc_now_iso
 from .uris import corpus_ref_to_path, normalize_corpus_uri
@@ -567,6 +574,96 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")
 
+    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
+        """
+        Load an extraction run manifest from the corpus.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Parsed extraction run manifest.
+        :rtype: biblicus.extraction.ExtractionRunManifest
+        :raises FileNotFoundError: If the manifest file does not exist.
+        :raises ValueError: If the manifest data is invalid.
+        """
+        from .extraction import ExtractionRunManifest
+
+        manifest_path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
+        )
+        if not manifest_path.is_file():
+            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
+        data = json.loads(manifest_path.read_text(encoding="utf-8"))
+        return ExtractionRunManifest.model_validate(data)
+
+    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+        """
+        List extraction runs stored under the corpus.
+
+        :param extractor_id: Optional extractor identifier filter.
+        :type extractor_id: str or None
+        :return: Summary list entries for each run.
+        :rtype: list[biblicus.models.ExtractionRunListEntry]
+        """
+        runs_root = self.extraction_runs_dir
+        if not runs_root.is_dir():
+            return []
+
+        extractor_dirs: List[Path]
+        if extractor_id is None:
+            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
+        else:
+            extractor_path = runs_root / extractor_id
+            extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
+
+        entries: List[ExtractionRunListEntry] = []
+        for extractor_dir in extractor_dirs:
+            for run_dir in sorted(extractor_dir.iterdir()):
+                if not run_dir.is_dir():
+                    continue
+                manifest_path = run_dir / "manifest.json"
+                if not manifest_path.is_file():
+                    continue
+                try:
+                    manifest = self.load_extraction_run_manifest(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                    )
+                except (FileNotFoundError, ValueError):
+                    continue
+                entries.append(
+                    ExtractionRunListEntry(
+                        extractor_id=extractor_dir.name,
+                        run_id=run_dir.name,
+                        recipe_id=manifest.recipe.recipe_id,
+                        recipe_name=manifest.recipe.name,
+                        catalog_generated_at=manifest.catalog_generated_at,
+                        created_at=manifest.created_at,
+                        stats=dict(manifest.stats),
+                    )
+                )
+
+        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        return entries
+
+    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
+        """
+        Delete an extraction run directory and its derived artifacts.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: None.
+        :rtype: None
+        :raises FileNotFoundError: If the extraction run directory does not exist.
+        """
+        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+        if not run_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
+        shutil.rmtree(run_dir)
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
@@ -1185,6 +1282,78 @@ class Corpus:
             raise KeyError(f"Unknown item identifier: {item_id}")
         return item
 
+    def create_crawl_id(self) -> str:
+        """
+        Create a new crawl identifier.
+
+        :return: Crawl identifier.
+        :rtype: str
+        """
+        return str(uuid.uuid4())
+
+    def ingest_crawled_payload(
+        self,
+        *,
+        crawl_id: str,
+        relative_path: str,
+        data: bytes,
+        filename: str,
+        media_type: str,
+        source_uri: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Ingest a crawled payload under a crawl import namespace.
+
+        :param crawl_id: Crawl identifier used to group crawled artifacts.
+        :type crawl_id: str
+        :param relative_path: Relative path within the crawl prefix.
+        :type relative_path: str
+        :param data: Raw payload bytes.
+        :type data: bytes
+        :param filename: Suggested filename from the payload metadata.
+        :type filename: str
+        :param media_type: Internet Assigned Numbers Authority media type.
+        :type media_type: str
+        :param source_uri: Source uniform resource identifier (typically an http or https uniform resource locator).
+        :type source_uri: str
+        :param tags: Tags to attach to the stored item.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        """
+        _ = filename
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+        destination_path.write_bytes(data)
+
+        sha256_digest = _sha256_bytes(data)
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata({}, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(data),
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
     def reindex(self) -> Dict[str, int]:
         """
         Rebuild/refresh the corpus catalog from the current on-disk corpus contents.
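The new Corpus methods expose the same run management to library callers. A minimal sketch, assuming an existing corpus at a placeholder path with at least one pipeline extraction run; Corpus.open is shown with a plain path string because the command-line interface passes its --corpus value through unchanged:

```
from biblicus.corpus import Corpus

corpus = Corpus.open("corpora/example")  # placeholder corpus path
runs = corpus.list_extraction_runs(extractor_id="pipeline")
for entry in runs:
    # Each entry is an ExtractionRunListEntry; entries come back sorted newest first.
    print(entry.run_id, entry.recipe_name, entry.created_at)

if runs:
    latest = runs[0]
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=latest.extractor_id, run_id=latest.run_id
    )
    print(manifest.model_dump_json(indent=2))
    # Remove the run directory and its derived artifacts once it is no longer needed.
    corpus.delete_extraction_run(extractor_id=latest.extractor_id, run_id=latest.run_id)
```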
biblicus/crawl.py
ADDED
@@ -0,0 +1,186 @@
+"""
+Website crawl utilities for Biblicus corpora.
+"""
+
+from __future__ import annotations
+
+from collections import deque
+from html.parser import HTMLParser
+from typing import Deque, List, Optional, Set
+from urllib.parse import urldefrag, urljoin
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .ignore import load_corpus_ignore_spec
+from .sources import load_source
+
+
+class CrawlRequest(BaseModel):
+    """
+    Request describing a website crawl into a corpus.
+
+    :ivar root_url: Initial uniform resource locator to fetch.
+    :vartype root_url: str
+    :ivar allowed_prefix: Uniform resource locator prefix that limits which links are eligible for crawl.
+    :vartype allowed_prefix: str
+    :ivar max_items: Maximum number of items to store during the crawl.
+    :vartype max_items: int
+    :ivar tags: Tags to apply to stored items.
+    :vartype tags: list[str]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    root_url: str = Field(min_length=1)
+    allowed_prefix: str = Field(min_length=1)
+    max_items: int = Field(default=50, ge=1)
+    tags: List[str] = Field(default_factory=list)
+
+
+class CrawlResult(BaseModel):
+    """
+    Summary result for a crawl execution.
+
+    :ivar crawl_id: Crawl identifier used in the corpus raw import namespace.
+    :vartype crawl_id: str
+    :ivar discovered_items: Total number of distinct uniform resource locators discovered.
+    :vartype discovered_items: int
+    :ivar fetched_items: Number of eligible items fetched over hypertext transfer protocol.
+    :vartype fetched_items: int
+    :ivar stored_items: Number of items stored into the corpus.
+    :vartype stored_items: int
+    :ivar skipped_outside_prefix_items: Number of discovered items outside the allowed prefix.
+    :vartype skipped_outside_prefix_items: int
+    :ivar skipped_ignored_items: Number of eligible items skipped due to corpus ignore rules.
+    :vartype skipped_ignored_items: int
+    :ivar errored_items: Number of eligible items that failed to fetch or store.
+    :vartype errored_items: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    crawl_id: str
+    discovered_items: int = Field(default=0, ge=0)
+    fetched_items: int = Field(default=0, ge=0)
+    stored_items: int = Field(default=0, ge=0)
+    skipped_outside_prefix_items: int = Field(default=0, ge=0)
+    skipped_ignored_items: int = Field(default=0, ge=0)
+    errored_items: int = Field(default=0, ge=0)
+
+
+class _LinkExtractor(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: List[str] = []
+
+    def handle_starttag(self, tag: str, attrs):  # type: ignore[no-untyped-def]
+        _ = tag
+        for key, value in attrs:
+            if key in {"href", "src"} and isinstance(value, str) and value.strip():
+                self.links.append(value.strip())
+
+
+def _normalize_crawl_url(candidate: str, *, base_url: str) -> Optional[str]:
+    joined = urljoin(base_url, candidate)
+    joined, _fragment = urldefrag(joined)
+    joined = joined.strip()
+    if joined.startswith(("mailto:", "javascript:")):
+        return None
+    return joined
+
+
+def _crawl_relative_path(url: str, *, allowed_prefix: str) -> str:
+    relative = url[len(allowed_prefix) :].lstrip("/")
+    if not relative or relative.endswith("/"):
+        relative = relative.rstrip("/") + "/index.html" if relative else "index.html"
+    return relative
+
+
+def _should_parse_links(media_type: str) -> bool:
+    return media_type.startswith("text/html")
+
+
+def _discover_links(html_text: str, *, base_url: str) -> List[str]:
+    parser = _LinkExtractor()
+    parser.feed(html_text)
+    discovered: List[str] = []
+    for raw in parser.links:
+        normalized = _normalize_crawl_url(raw, base_url=base_url)
+        if normalized is not None:
+            discovered.append(normalized)
+    return discovered
+
+
+def crawl_into_corpus(*, corpus, request: CrawlRequest) -> CrawlResult:  # type: ignore[no-untyped-def]
+    """
+    Crawl a website prefix into a corpus.
+
+    :param corpus: Target corpus to receive crawled items.
+    :type corpus: biblicus.corpus.Corpus
+    :param request: Crawl request describing limits and allowed prefix.
+    :type request: CrawlRequest
+    :return: Crawl result summary.
+    :rtype: CrawlResult
+    """
+    ignore_spec = load_corpus_ignore_spec(corpus.root)
+    allowed_prefix = request.allowed_prefix
+    root_url = request.root_url
+
+    crawl_id = corpus.create_crawl_id()
+
+    queue: Deque[str] = deque([root_url])
+    seen: Set[str] = set()
+    stored_count = 0
+    fetched_count = 0
+    skipped_outside_prefix_count = 0
+    skipped_ignored_count = 0
+    errored_count = 0
+    discovered_urls: Set[str] = set()
+
+    while queue and stored_count < request.max_items:
+        url = queue.popleft()
+        if url in seen:
+            continue
+        seen.add(url)
+        discovered_urls.add(url)
+
+        if not url.startswith(allowed_prefix):
+            skipped_outside_prefix_count += 1
+            continue
+
+        relative_path = _crawl_relative_path(url, allowed_prefix=allowed_prefix)
+        if ignore_spec.matches(relative_path):
+            skipped_ignored_count += 1
+            continue
+
+        try:
+            payload = load_source(url)
+            fetched_count += 1
+            corpus.ingest_crawled_payload(
+                crawl_id=crawl_id,
+                relative_path=relative_path,
+                data=payload.data,
+                filename=payload.filename,
+                media_type=payload.media_type,
+                source_uri=payload.source_uri,
+                tags=request.tags,
+            )
+            stored_count += 1
+        except Exception:
+            errored_count += 1
+            continue
+
+        if _should_parse_links(payload.media_type):
+            text = payload.data.decode("utf-8", errors="replace")
+            for discovered in _discover_links(text, base_url=url):
+                queue.append(discovered)
+
+    return CrawlResult(
+        crawl_id=crawl_id,
+        discovered_items=len(discovered_urls),
+        fetched_items=fetched_count,
+        stored_items=stored_count,
+        skipped_outside_prefix_items=skipped_outside_prefix_count,
+        skipped_ignored_items=skipped_ignored_count,
+        errored_items=errored_count,
+    )
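crawl_into_corpus is a breadth-first fetch loop: it dequeues a uniform resource locator, counts and skips anything outside the allowed prefix or matched by corpus ignore rules, stores fetched payloads through Corpus.ingest_crawled_payload, and only follows links parsed out of text/html responses. A minimal sketch of driving it from Python, assuming an existing corpus and placeholder web addresses:

```
from biblicus.corpus import Corpus
from biblicus.crawl import CrawlRequest, crawl_into_corpus

corpus = Corpus.open("corpora/example")  # placeholder corpus path
request = CrawlRequest(
    root_url="https://example.com/docs/index.html",  # placeholder site
    allowed_prefix="https://example.com/docs/",
    max_items=25,
    tags=["crawled"],
)
result = crawl_into_corpus(corpus=corpus, request=request)
# The CrawlResult counters separate discovered, fetched, stored, skipped, and errored items.
print(result.model_dump_json(indent=2))
```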
biblicus/extraction.py
CHANGED
@@ -7,7 +7,6 @@ from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-from uuid import uuid4
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -196,8 +195,9 @@ def create_extraction_run_manifest(
     :rtype: ExtractionRunManifest
     """
     catalog = corpus.load_catalog()
+    run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}")
     return ExtractionRunManifest(
-        run_id=
+        run_id=run_id,
         recipe=recipe,
         corpus_uri=corpus.uri,
         catalog_generated_at=catalog.generated_at,
@@ -341,6 +341,8 @@ def build_extraction_run(
     )
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
+    if run_dir.exists():
+        return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
     run_dir.mkdir(parents=True, exist_ok=False)
 
     catalog = corpus.load_catalog()
biblicus/models.py
CHANGED
@@ -189,6 +189,37 @@ def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
     return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
 
 
+class ExtractionRunListEntry(BaseModel):
+    """
+    Summary entry for an extraction run stored in a corpus.
+
+    :ivar extractor_id: Extractor plugin identifier.
+    :vartype extractor_id: str
+    :ivar run_id: Extraction run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Deterministic recipe identifier.
+    :vartype recipe_id: str
+    :ivar recipe_name: Human-readable recipe name.
+    :vartype recipe_name: str
+    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :vartype catalog_generated_at: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :vartype created_at: str
+    :ivar stats: Run statistics.
+    :vartype stats: dict[str, object]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    extractor_id: str = Field(min_length=1)
+    run_id: str = Field(min_length=1)
+    recipe_id: str = Field(min_length=1)
+    recipe_name: str = Field(min_length=1)
+    catalog_generated_at: str = Field(min_length=1)
+    created_at: str = Field(min_length=1)
+    stats: Dict[str, object] = Field(default_factory=dict)
+
+
 class QueryBudget(BaseModel):
     """
     Evidence selection budget for retrieval.
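ExtractionRunListEntry is the row model behind `biblicus extract list`, and parse_extraction_run_reference turns the extractor_id:run_id strings accepted by --run and --confirm into a structured reference. A small sketch with a placeholder run identifier:

```
from biblicus.models import parse_extraction_run_reference

reference = parse_extraction_run_reference("pipeline:0123abcd")  # placeholder run id
print(reference.extractor_id)  # pipeline
print(reference.run_id)        # 0123abcd
```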
biblicus/time.py
CHANGED
@@ -14,4 +14,4 @@ def utc_now_iso() -> str:
     :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
     :rtype: str
     """
-    return datetime.now(timezone.utc).
+    return datetime.now(timezone.utc).isoformat(timespec="microseconds")
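The time.py change pins the fractional-second precision of corpus timestamps: with timespec="microseconds", the standard library always renders six fractional digits, so values such as created_at and catalog_generated_at keep a fixed width even when a call lands on an exact second. For example:

```
from datetime import datetime, timezone

# Always six fractional digits, e.g. 2024-05-01T12:34:56.000000+00:00
print(datetime.now(timezone.utc).isoformat(timespec="microseconds"))
```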
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.3.0
+Version: 0.4.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -77,10 +77,7 @@ flowchart LR
     direction LR
     LegendArtifact[Stored artifact or evidence]
     LegendStep[Step]
-    LegendStable[Stable region]
-    LegendPluggable[Pluggable region]
     LegendArtifact --- LegendStep
-    LegendStable --- LegendPluggable
   end
 
   subgraph Main[" "]
@@ -93,14 +90,14 @@ flowchart LR
     Raw --> Catalog[Catalog file]
   end
 
-  subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
+  subgraph PluggableExtractionPipeline[Pluggable: extraction pipeline]
     direction TB
     Catalog --> Extract[Extract pipeline]
     Extract --> ExtractedText[Extracted text artifacts]
     ExtractedText --> ExtractionRun[Extraction run manifest]
   end
 
-  subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
+  subgraph PluggableRetrievalBackend[Pluggable: retrieval backend]
     direction LR
 
     subgraph BackendIngestionIndexing[Ingestion and indexing]
@@ -154,8 +151,6 @@ flowchart LR
   style Main fill:#ffffff,stroke:#ffffff,color:#111111
   style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
   style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
-  style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
-  style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
 ```
 
 ## Practical value
@@ -168,6 +163,7 @@ flowchart LR
 
 - Initialize a corpus folder.
 - Ingest items from file paths, web addresses, or text input.
+- Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
 - Run extraction when you want derived text artifacts from non-text sources.
 - Reindex to refresh the catalog after edits.
 - Build a retrieval run with a backend.
@@ -205,11 +201,22 @@ biblicus init corpora/example
 biblicus ingest --corpus corpora/example notes/example.txt
 echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
 biblicus list --corpus corpora/example
-biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
+biblicus extract build --corpus corpora/example --step pass-through-text --step metadata-text
+biblicus extract list --corpus corpora/example
 biblicus build --corpus corpora/example --backend scan
 biblicus query --corpus corpora/example --query "note"
 ```
 
+If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
+
+```
+biblicus crawl --corpus corpora/example \
+  --root-url https://example.com/docs/index.html \
+  --allowed-prefix https://example.com/docs/ \
+  --max-items 50 \
+  --tag crawled
+```
+
 ## Python usage
 
 From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
@@ -233,7 +240,7 @@ In an assistant system, retrieval usually produces context for a model call. Thi
 
 ## Learn more
 
-Full documentation is
+Full documentation is published on GitHub Pages: https://anthusai.github.io/Biblicus/
 
 The documents below are written to be read in order.
 
@@ -262,7 +269,16 @@ corpus/
   config.json
   catalog.json
   runs/
-
+    extraction/
+      pipeline/
+        <run id>/
+          manifest.json
+          text/
+            <item id>.txt
+    retrieval/
+      <backend id>/
+        <run id>/
+          manifest.json
 ```
 
 ## Retrieval backends
@@ -313,7 +329,7 @@ python3 -m pip install -e ".[dev]"
 Build the documentation:
 
 ```
-python3 -m sphinx -b html docs docs/_build
+python3 -m sphinx -b html docs docs/_build/html
 ```
 
 ## License
@@ -333,4 +349,4 @@ License terms are in `LICENSE`.
 
 [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
 [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
-[documentation-badge]: https://
+[documentation-badge]: https://img.shields.io/badge/docs-GitHub%20Pages-blue
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,20 +1,21 @@
-biblicus/__init__.py,sha256=
+biblicus/__init__.py,sha256=6TFpzDiMJlFyBfVrHfS6xnGd8P7Zybj6DxpWkCJqyf4,432
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=
+biblicus/cli.py,sha256=MtYTkJh0lVOTrbwY3u6V8ti5WZUsb96fLatqh23cUYg,24289
 biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
-biblicus/corpus.py,sha256=
+biblicus/corpus.py,sha256=gF1RNl6fdz7wplzpHEIkEBkhYxHgKTKguBR_kD9IgUw,54109
+biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
 biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
 biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
-biblicus/extraction.py,sha256=
+biblicus/extraction.py,sha256=VEjBjIpaBboftGgEcpDj7z7um41e5uDZpP_7acQg7fw,19448
 biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
 biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
 biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
 biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
 biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
-biblicus/models.py,sha256=
+biblicus/models.py,sha256=6SWQ2Czg9O3zjuam8a4m8V3LlEgcGLbEctYDB6F1rRs,15317
 biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
-biblicus/time.py,sha256=
+biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
 biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
 biblicus/user_config.py,sha256=DqO08yLn82DhTiFpmIyyLj_J0nMbrtE8xieTj2Cgd6A,4287
 biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
@@ -36,9 +37,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
 biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
+biblicus-0.4.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.4.0.dist-info/METADATA,sha256=JMZjfIOMEmbWFHgzjUHsQDUUg11jdxudtJdRK8Iu29U,13586
+biblicus-0.4.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.4.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.4.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.4.0.dist-info/RECORD,,
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL
File without changes
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt
File without changes
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE
File without changes
{biblicus-0.3.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt
File without changes