biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 1.0.0
3
+ Version: 1.1.1
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -80,10 +80,10 @@ See [retrieval augmented generation overview] for a short introduction to the id
80
80
  ## Analysis highlights
81
81
 
82
82
  - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
83
- - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
83
+ - YAML configurations support cascading composition plus dotted `--config key=value` overrides.
84
84
  - Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
85
- - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
86
- - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
85
+ - See `docs/markov-analysis.md` for Markov analysis details and runnable demos.
86
+ - See `docs/text-extract.md` for the text extract utility and examples.
87
87
 
88
88
  ## Start with a knowledge base
89
89
 
@@ -167,7 +167,7 @@ sequenceDiagram
167
167
 
168
168
  - You can ingest raw material once, then try many retrieval approaches over time.
169
169
  - You can keep raw files readable and portable, without locking your data inside a database.
170
- - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
170
+ - You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.
171
171
 
172
172
  ## Typical flow
173
173
 
@@ -176,7 +176,7 @@ sequenceDiagram
176
176
  - Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
177
177
  - Run extraction when you want derived text artifacts from non-text sources.
178
178
  - Reindex to refresh the catalog after edits.
179
- - Build a retrieval run with a backend.
179
+ - Build a retrieval snapshot with a backend.
180
180
  - Query the run to collect evidence and evaluate it with datasets.
181
181
 
182
182
  ## Install
@@ -292,7 +292,7 @@ for note_title, note_text in notes:
292
292
  corpus.ingest_note(note_text, title=note_title, tags=["memory"])
293
293
 
294
294
  backend = get_backend("scan")
295
- run = backend.build_run(corpus, recipe_name="Story demo", config={})
295
+ run = backend.build_run(corpus, configuration_name="Story demo", config={})
296
296
  budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
297
297
  result = backend.query(
298
298
  corpus,
@@ -336,8 +336,8 @@ Example output:
336
336
  "maximum_total_characters": 2000,
337
337
  "max_items_per_source": null
338
338
  },
339
- "run_id": "RUN_ID",
340
- "recipe_id": "RECIPE_ID",
339
+ "snapshot_id": "RUN_ID",
340
+ "configuration_id": "RECIPE_ID",
341
341
  "backend_id": "scan",
342
342
  "generated_at": "2026-01-29T00:00:00.000000Z",
343
343
  "evidence": [
@@ -352,8 +352,8 @@ Example output:
352
352
  "span_start": null,
353
353
  "span_end": null,
354
354
  "stage": "scan",
355
- "recipe_id": "RECIPE_ID",
356
- "run_id": "RUN_ID",
355
+ "configuration_id": "RECIPE_ID",
356
+ "snapshot_id": "RUN_ID",
357
357
  "hash": null
358
358
  }
359
359
  ],
@@ -422,7 +422,7 @@ flowchart TB
422
422
 
423
423
  subgraph RowExtraction[Pluggable: extraction pipeline]
424
424
  direction TB
425
- Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction run manifest]
425
+ Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
426
426
  end
427
427
 
428
428
  subgraph RowRetrieval[Pluggable: retrieval backend]
@@ -484,7 +484,7 @@ From Python, the same flow is available through the Corpus class and backend int
484
484
  - Ingest notes with `Corpus.ingest_note`.
485
485
  - Ingest files or web addresses with `Corpus.ingest_source`.
486
486
  - List items with `Corpus.list_items`.
487
- - Build a retrieval run with `get_backend` and `backend.build_run`.
487
+ - Build a retrieval snapshot with `get_backend` and `backend.build_run`.
488
488
  - Query a run with `backend.query`.
489
489
  - Evaluate with `evaluate_run`.
490
490
 
@@ -530,13 +530,13 @@ corpus/
530
530
  runs/
531
531
  extraction/
532
532
  pipeline/
533
- <run id>/
533
+ <snapshot id>/
534
534
  manifest.json
535
535
  text/
536
536
  <item id>.txt
537
537
  retrieval/
538
538
  <backend id>/
539
- <run id>/
539
+ <snapshot id>/
540
540
  manifest.json
541
541
  ```
542
542
 
@@ -552,9 +552,9 @@ For detailed documentation including configuration options, performance characte
552
552
 
553
553
  ## Retrieval documentation
554
554
 
555
- For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
556
- (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
557
- and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
555
+ For the retrieval pipeline overview and snapshot artifacts, see `docs/retrieval.md`. For retrieval quality upgrades
556
+ (tuned lexical baseline, reranking, hybrid retrieval), see `docs/retrieval-quality.md`. For evaluation workflows
557
+ and dataset formats, see `docs/retrieval-evaluation.md`. For a runnable walkthrough, use the retrieval evaluation lab
558
558
  script (`scripts/retrieval_evaluation_lab.py`).
559
559
 
560
560
  ## Extraction backends
@@ -594,7 +594,7 @@ These extractors are built in. Optional ones require extra dependencies. See [te
594
594
  For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
595
595
 
596
596
  For extraction evaluation workflows, dataset formats, and report interpretation, see
597
- `docs/EXTRACTION_EVALUATION.md`.
597
+ `docs/extraction-evaluation.md`.
598
598
 
599
599
  ## Text extract utility
600
600
 
@@ -602,39 +602,39 @@ Text extract is a reusable analysis utility that lets a model insert XML tags in
602
602
  entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
603
603
  analysis.
604
604
 
605
- See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
605
+ See `docs/text-extract.md` for the utility API and examples, and `docs/markov-analysis.md` for the Markov integration.
606
606
 
607
607
  ## Text slice utility
608
608
 
609
609
  Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
610
610
  re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
611
611
 
612
- See `docs/TEXT_SLICE.md` for the utility API and examples.
612
+ See `docs/text-slice.md` for the utility API and examples.
613
613
 
614
614
  ## Topic modeling analysis
615
615
 
616
616
  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
617
617
  are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
618
- an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
618
+ an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
619
619
  optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
620
620
 
621
- See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
622
- `docs/TOPIC_MODELING.md` for topic modeling details.
621
+ See `docs/analysis.md` for the analysis pipeline overview, `docs/profiling.md` for profiling, and
622
+ `docs/topic-modeling.md` for topic modeling details.
623
623
 
624
- Run a topic analysis using a recipe file:
624
+ Run a topic analysis using a configuration file:
625
625
 
626
626
  ```
627
- biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
627
+ biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
628
628
  ```
629
629
 
630
- If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
630
+ If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
631
631
  reproducibility. The analysis output is stored under:
632
632
 
633
633
  ```
634
- .biblicus/runs/analysis/topic-modeling/<run_id>/output.json
634
+ .biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
635
635
  ```
636
636
 
637
- Minimal recipe example:
637
+ Minimal configuration example:
638
638
 
639
639
  ```yaml
640
640
  schema_version: 1
@@ -659,7 +659,7 @@ llm_fine_tuning:
659
659
  ```
660
660
 
661
661
  LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
662
- Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
662
+ Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
663
663
  AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
664
664
 
665
665
  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
@@ -668,7 +668,7 @@ For a repeatable, real-world integration run that downloads AG News and executes
668
668
  python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
669
669
  ```
670
670
 
671
- See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
671
+ See `docs/topic-modeling.md` for parameter examples and per-topic output behavior.
672
672
 
673
673
  ## Integration corpus and evaluation dataset
674
674
 
@@ -712,25 +712,34 @@ Build the documentation:
712
712
  python -m sphinx -b html docs docs/_build/html
713
713
  ```
714
714
 
715
+ Preview the documentation locally:
716
+
717
+ ```
718
+ cd docs/_build/html
719
+ python -m http.server
720
+ ```
721
+
722
+ Open `http://localhost:8000` in your browser.
723
+
715
724
  ## License
716
725
 
717
726
  License terms are in `LICENSE`.
718
727
 
719
728
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
720
- [architecture]: docs/ARCHITECTURE.md
721
- [roadmap]: docs/ROADMAP.md
722
- [feature-index]: docs/FEATURE_INDEX.md
723
- [corpus]: docs/CORPUS.md
724
- [knowledge-base]: docs/KNOWLEDGE_BASE.md
725
- [text-extraction]: docs/EXTRACTION.md
729
+ [architecture]: docs/architecture.md
730
+ [roadmap]: docs/roadmap.md
731
+ [feature-index]: docs/feature-index.md
732
+ [corpus]: docs/corpus.md
733
+ [knowledge-base]: docs/knowledge-base.md
734
+ [text-extraction]: docs/extraction.md
726
735
  [extractor-reference]: docs/extractors/index.md
727
736
  [backend-reference]: docs/backends/index.md
728
- [speech-to-text]: docs/STT.md
729
- [user-configuration]: docs/USER_CONFIGURATION.md
730
- [backends]: docs/BACKENDS.md
731
- [context-packs]: docs/CONTEXT_PACK.md
732
- [demos]: docs/DEMOS.md
733
- [testing]: docs/TESTING.md
737
+ [speech-to-text]: docs/stt.md
738
+ [user-configuration]: docs/user-configuration.md
739
+ [backends]: docs/backends.md
740
+ [context-packs]: docs/context-pack.md
741
+ [demos]: docs/demos.md
742
+ [testing]: docs/testing.md
734
743
 
735
744
  [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
736
745
  [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
@@ -0,0 +1,91 @@
1
+ biblicus/__init__.py,sha256=KGQ2mjQRe9i8OyE25LZHJAG5jg_fDKOiWMTOprp-NPc,1013
2
+ biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
+ biblicus/chunking.py,sha256=GdJr0skAAI0Su99mr7dXqCgR7eJ0sJu8n2XesVGyddY,13206
4
+ biblicus/cli.py,sha256=GN7L0-s0k9tAj_lthvBrJlfo_DG9y53vYc6k_IhSea0,45797
5
+ biblicus/configuration.py,sha256=JzQU-2pzO4hY7pBw8J79Ci0Glc9cvh4KrRvzSMK2d5w,4329
6
+ biblicus/constants.py,sha256=VVjfZvdmoiCNsiQv0JVI-cA6JKXWUsvGL_IjnTxlEI8,386
7
+ biblicus/context.py,sha256=I7L86ag2AbNr_QgiP5YSt1uwwULGx1cH73eR2nE9T3g,10842
8
+ biblicus/corpus.py,sha256=D9O1Z8lQ7yFNXQDkaKR9fSTRDMSwtrYTGavh_GM7Eww,60374
9
+ biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
10
+ biblicus/embedding_providers.py,sha256=phWEsq1vryyTFRRs6uZ0sx9FhrqWIkDsS3I52I64zqM,3839
11
+ biblicus/errors.py,sha256=7fAGJbe_pCD8ygnfbTn6bNRV6pam0Vx3xjIpLrxrucg,1382
12
+ biblicus/evaluation.py,sha256=XnQKPbUcUBnELllh7cNEzvTK8EKU1Ub0q3u_sIhXB5E,8372
13
+ biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
14
+ biblicus/extraction.py,sha256=YiJqLWY3mglYokSJqA8-oIxpFBPW4Hz0TEeeNp0PtWA,20581
15
+ biblicus/extraction_evaluation.py,sha256=kFbyKcHzZK_z0OgCmQ3Olj55zgGoxin0Ir3dUA50TLI,10641
16
+ biblicus/frontmatter.py,sha256=uFC4iIrgpnTDiP1gvAnT_CbFYdNuUVtETX7tZ3a9g-Y,2517
17
+ biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
18
+ biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
19
+ biblicus/hooks.py,sha256=-ZcKZ4scK9ctas_PcseOmJJOLCkwxpnIxrACcz1qUus,7907
20
+ biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
21
+ biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
22
+ biblicus/knowledge_base.py,sha256=pDZQlihjMB7AF61LccVG21rWEAjisgRtkEcn-dymZTM,6915
23
+ biblicus/models.py,sha256=UlaqdvdqPZHd2__4Gcd4pryA_DBVgSiv86uI2AYD8Ag,16990
24
+ biblicus/retrieval.py,sha256=9RA3KGw43dBOD1EFZwt9sqcVf334UtXb1qNHUqYW6As,4646
25
+ biblicus/sources.py,sha256=FNwW1FWts0jxWIL3AHon7D6c5ZatyG9AGFqzn1Id5mE,8504
26
+ biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
27
+ biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
28
+ biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
29
+ biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
30
+ biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
31
+ biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
32
+ biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
33
+ biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
34
+ biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
35
+ biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
36
+ biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
37
+ biblicus/analysis/__init__.py,sha256=d1q11tEx3JkrOPMaiGMNCHhN9tCOTr_QpQP-tI1J2Wk,1389
38
+ biblicus/analysis/base.py,sha256=HErFLn3gv1qf9ckAUxbolHF2k9sJDNZjPjdboCMhyBE,1349
39
+ biblicus/analysis/markov.py,sha256=pLtKvt4gtsqa1CASizh8bBJ4CQW2e0wGaQ-BgdP7Pfg,63766
40
+ biblicus/analysis/models.py,sha256=dYnm5gwUzTk5HvrHZjQx4vug_TZLnXU9qN6CLIRyLng,56495
41
+ biblicus/analysis/profiling.py,sha256=IynvrgcopqFj6lMUPHS1prwd0FxN8FzIa5p3JInDFCc,11185
42
+ biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
43
+ biblicus/analysis/topic_modeling.py,sha256=dsNHuqxcEoCKO_8aDAM9yEOa0kWCjPWS2NvcQayIyXQ,22623
44
+ biblicus/context_engine/__init__.py,sha256=cIJWTUwOewW1x13a2n0YKfr4-XU0IwlVdAH_0pckfKk,1337
45
+ biblicus/context_engine/assembler.py,sha256=E7VPdqUJ9peZUoonM0Ooa1wsaklFOuLCt2IH9nFxAfM,44260
46
+ biblicus/context_engine/compaction.py,sha256=2bLaCpT48d1TL7vt9rrcRCgfdHeWWp9LX85Cgij12o0,2921
47
+ biblicus/context_engine/models.py,sha256=jesVd83ZQcatO-7yNlzwKkactSQ-e1znYuWof4rxVFg,12762
48
+ biblicus/context_engine/retrieval.py,sha256=A0w6C5uPrDY_aeGeirRkSGr6I-gU0U0cY6ElvrLhe0Q,4425
49
+ biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
50
+ biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
51
+ biblicus/extractors/deepgram_stt.py,sha256=xx_zrROGRHotF5aht23Qey9dpCnU3KHZ0unwa933Pto,6358
52
+ biblicus/extractors/docling_granite_text.py,sha256=iDHWZVgqZd86Q3Zu-fcdCq7ia00xTpbFbTU7JbDNZ38,6953
53
+ biblicus/extractors/docling_smol_text.py,sha256=qI7m93Odrjmob0RW-Yvnt5Ck4AgFcgdVjwatLQA5krI,6885
54
+ biblicus/extractors/markitdown_text.py,sha256=kYixZbVxaIyeWtpezocnrSxC3z_9KqWuBzeK8sI4s1o,4567
55
+ biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
56
+ biblicus/extractors/openai_stt.py,sha256=d2CaVhxapfkXaeI_QZcoXwdVm5Bj5YwGLdmTZDcqgTc,7197
57
+ biblicus/extractors/paddleocr_vl_text.py,sha256=2xoHA1Jviw8zzeBvHBI74Lkx4SX_vSarCe3wxvYf6c4,11794
58
+ biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
59
+ biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
60
+ biblicus/extractors/pipeline.py,sha256=qdlBBSUVNdg2V4izHacv_8a2DikCGlVMAdpkZkzNvyY,3288
61
+ biblicus/extractors/rapidocr_text.py,sha256=5adSCiOmyHiCgX3jBMcl1OiQlGzYLxmgJQzo9GHSecs,4791
62
+ biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
63
+ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_mvjehiSec,4014
64
+ biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
65
+ biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
66
+ biblicus/extractors/unstructured_text.py,sha256=WXr_fu4KQ0NODkbb05e4HrAX-trOWRKiDOmznh9_pLI,3579
67
+ biblicus/retrievers/__init__.py,sha256=LOxhUYgph1sPAeY6PmSmXH4Os4bIGGOtw88iOdI9S2k,1704
68
+ biblicus/retrievers/base.py,sha256=DSf5Ve5IFeunIyV9zt7T1vEUvSkJWO4iBj96co5F0Qo,1891
69
+ biblicus/retrievers/embedding_index_common.py,sha256=63_dUds-yIALhq2L9_0oDNpoh-_h7v2j1kexbdVw1-o,11756
70
+ biblicus/retrievers/embedding_index_file.py,sha256=mvtXqRX-_eQpi9bRxQ2yqFxY26YhP8Vn2WGcoWVtMtc,10668
71
+ biblicus/retrievers/embedding_index_inmemory.py,sha256=8csrdjCGkkR7DgvmLZ72oD1gm4duWWUnxJsjw6nLicU,10525
72
+ biblicus/retrievers/hybrid.py,sha256=kaH-kIi4wxYyUWnKNFT7UNBbHFkRtcGlwjjiJpx-TJY,11789
73
+ biblicus/retrievers/scan.py,sha256=ccDGVnqBS9a2ymKeBEXdfJz8XLahsBeYWYyYXQcg2KQ,13147
74
+ biblicus/retrievers/sqlite_full_text_search.py,sha256=7rzYfzpRhPbsKuXjXi8x2-rmq8-z1em3amUF9UPAomI,25392
75
+ biblicus/retrievers/tf_vector.py,sha256=rkcRG1GU5S_3t8GRbQTBThITj-eHT5fs1dyVzXPLg8w,15776
76
+ biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
77
+ biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
78
+ biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
79
+ biblicus/text/link.py,sha256=2IdOi3WgyBKPFau0bpS1eToV1q2v_6wq5RK5_P_qUDg,20448
80
+ biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
81
+ biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
82
+ biblicus/text/prompts.py,sha256=9dx1cWpJb6oBY4AhDHxlkRUYs7DfbySH0gb-uBTNvtk,7567
83
+ biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
84
+ biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
85
+ biblicus/text/tool_loop.py,sha256=dFeIEcCUA-yR8GMqsJ_n4007fHVmn9zK2hhlm6NlWyg,14161
86
+ biblicus-1.1.1.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
87
+ biblicus-1.1.1.dist-info/METADATA,sha256=WtfRn0ON4GjFGAHca6TFcjt-tpdMUGz5uOR4-ePJ_ys,31202
88
+ biblicus-1.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
89
+ biblicus-1.1.1.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
90
+ biblicus-1.1.1.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
91
+ biblicus-1.1.1.dist-info/RECORD,,
biblicus/backends/__init__.py DELETED
@@ -1,50 +0,0 @@
1
- """
2
- Backend registry for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from typing import Dict, Type
8
-
9
- from .base import RetrievalBackend
10
- from .embedding_index_file import EmbeddingIndexFileBackend
11
- from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
12
- from .hybrid import HybridBackend
13
- from .scan import ScanBackend
14
- from .sqlite_full_text_search import SqliteFullTextSearchBackend
15
- from .tf_vector import TfVectorBackend
16
-
17
-
18
- def available_backends() -> Dict[str, Type[RetrievalBackend]]:
19
- """
20
- Return the registered retrieval backends.
21
-
22
- :return: Mapping of backend identifiers to backend classes.
23
- :rtype: dict[str, Type[RetrievalBackend]]
24
- """
25
- return {
26
- EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
27
- EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
28
- HybridBackend.backend_id: HybridBackend,
29
- ScanBackend.backend_id: ScanBackend,
30
- SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
31
- TfVectorBackend.backend_id: TfVectorBackend,
32
- }
33
-
34
-
35
- def get_backend(backend_id: str) -> RetrievalBackend:
36
- """
37
- Instantiate a retrieval backend by identifier.
38
-
39
- :param backend_id: Backend identifier.
40
- :type backend_id: str
41
- :return: Backend instance.
42
- :rtype: RetrievalBackend
43
- :raises KeyError: If the backend identifier is unknown.
44
- """
45
- registry = available_backends()
46
- backend_class = registry.get(backend_id)
47
- if backend_class is None:
48
- known = ", ".join(sorted(registry))
49
- raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
50
- return backend_class()
biblicus/backends/base.py DELETED
@@ -1,65 +0,0 @@
1
- """
2
- Backend interface for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from abc import ABC, abstractmethod
8
- from typing import Dict
9
-
10
- from ..corpus import Corpus
11
- from ..models import QueryBudget, RetrievalResult, RetrievalRun
12
-
13
-
14
- class RetrievalBackend(ABC):
15
- """
16
- Abstract interface for retrieval backends.
17
-
18
- :ivar backend_id: Identifier string for the backend.
19
- :vartype backend_id: str
20
- """
21
-
22
- backend_id: str
23
-
24
- @abstractmethod
25
- def build_run(
26
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
27
- ) -> RetrievalRun:
28
- """
29
- Build or register a retrieval run for the backend.
30
-
31
- :param corpus: Corpus to build against.
32
- :type corpus: Corpus
33
- :param recipe_name: Human name for the recipe.
34
- :type recipe_name: str
35
- :param config: Backend-specific configuration values.
36
- :type config: dict[str, object]
37
- :return: Run manifest describing the build.
38
- :rtype: RetrievalRun
39
- """
40
- raise NotImplementedError
41
-
42
- @abstractmethod
43
- def query(
44
- self,
45
- corpus: Corpus,
46
- *,
47
- run: RetrievalRun,
48
- query_text: str,
49
- budget: QueryBudget,
50
- ) -> RetrievalResult:
51
- """
52
- Run a retrieval query against a backend.
53
-
54
- :param corpus: Corpus associated with the run.
55
- :type corpus: Corpus
56
- :param run: Run manifest to use for querying.
57
- :type run: RetrievalRun
58
- :param query_text: Query text to execute.
59
- :type query_text: str
60
- :param budget: Evidence selection budget.
61
- :type budget: QueryBudget
62
- :return: Retrieval results containing evidence.
63
- :rtype: RetrievalResult
64
- """
65
- raise NotImplementedError