biblicus-1.0.0-py3-none-any.whl → biblicus-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: biblicus
- Version: 1.0.0
+ Version: 1.1.0
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
  License: MIT
  Requires-Python: >=3.9
@@ -80,7 +80,7 @@ See [retrieval augmented generation overview] for a short introduction to the id
  ## Analysis highlights

  - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
- - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
+ - YAML configurations support cascading composition plus dotted `--config key=value` overrides.
  - Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
  - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
  - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
@@ -167,7 +167,7 @@ sequenceDiagram

  - You can ingest raw material once, then try many retrieval approaches over time.
  - You can keep raw files readable and portable, without locking your data inside a database.
- - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
+ - You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.

  ## Typical flow

@@ -176,7 +176,7 @@ sequenceDiagram
  - Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
  - Run extraction when you want derived text artifacts from non-text sources.
  - Reindex to refresh the catalog after edits.
- - Build a retrieval run with a backend.
+ - Build a retrieval snapshot with a backend.
  - Query the run to collect evidence and evaluate it with datasets.

  ## Install
@@ -292,7 +292,7 @@ for note_title, note_text in notes:
  corpus.ingest_note(note_text, title=note_title, tags=["memory"])

  backend = get_backend("scan")
- run = backend.build_run(corpus, recipe_name="Story demo", config={})
+ run = backend.build_run(corpus, configuration_name="Story demo", config={})
  budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
  result = backend.query(
  corpus,
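The hunk above only renames `recipe_name` to `configuration_name` in the README's Python example. For orientation, here is a minimal sketch of the whole flow using the calls visible in this diff; the import paths follow the 1.1.0 module move from `backends/` to `retrievers/`, the `Corpus` construction is not shown here, and the keyword names on `backend.query` beyond `corpus` follow the 1.0.0 interface from the deleted `biblicus/backends/base.py` later in this diff, so they are assumptions.

```python
# Illustrative sketch only; see the lead-in above for which names are assumed.
from biblicus.models import QueryBudget
from biblicus.retrievers import get_backend


def demo_flow(corpus, notes):
    # Ingest plain-text notes (call shown in the README hunk above).
    for note_title, note_text in notes:
        corpus.ingest_note(note_text, title=note_title, tags=["memory"])

    # Build a retrieval snapshot with the scan backend (1.1.0 keyword: configuration_name).
    backend = get_backend("scan")
    run = backend.build_run(corpus, configuration_name="Story demo", config={})

    # Query under an evidence budget; keyword names mirror the removed 1.0.0 interface.
    budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
    return backend.query(corpus, run=run, query_text="What did the fox do?", budget=budget)
```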
@@ -336,8 +336,8 @@ Example output:
  "maximum_total_characters": 2000,
  "max_items_per_source": null
  },
- "run_id": "RUN_ID",
- "recipe_id": "RECIPE_ID",
+ "snapshot_id": "RUN_ID",
+ "configuration_id": "RECIPE_ID",
  "backend_id": "scan",
  "generated_at": "2026-01-29T00:00:00.000000Z",
  "evidence": [
@@ -352,8 +352,8 @@ Example output:
  "span_start": null,
  "span_end": null,
  "stage": "scan",
- "recipe_id": "RECIPE_ID",
- "run_id": "RUN_ID",
+ "configuration_id": "RECIPE_ID",
+ "snapshot_id": "RUN_ID",
  "hash": null
  }
  ],
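The two hunks above show that the rename applies both to the result object and to each evidence entry. A small sketch of walking the in-memory result, assuming the model attributes mirror the JSON keys shown in this example output (the model classes themselves live in `biblicus/models.py` and are not reproduced in this diff):

```python
# Print the renamed identifiers and the ranked evidence of a retrieval result.
def summarize(result):
    print(result.snapshot_id, result.configuration_id, result.backend_id)
    for evidence in result.evidence:
        # item_id, score, and rank are the per-evidence fields used by the backends below.
        print(evidence.rank, evidence.item_id, round(evidence.score, 3), evidence.stage)
```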
@@ -422,7 +422,7 @@ flowchart TB

  subgraph RowExtraction[Pluggable: extraction pipeline]
  direction TB
- Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction run manifest]
+ Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
  end

  subgraph RowRetrieval[Pluggable: retrieval backend]
@@ -484,7 +484,7 @@ From Python, the same flow is available through the Corpus class and backend int
  - Ingest notes with `Corpus.ingest_note`.
  - Ingest files or web addresses with `Corpus.ingest_source`.
  - List items with `Corpus.list_items`.
- - Build a retrieval run with `get_backend` and `backend.build_run`.
+ - Build a retrieval snapshot with `get_backend` and `backend.build_run`.
  - Query a run with `backend.query`.
  - Evaluate with `evaluate_run`.

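The `get_backend` lookup in the list above is a plain registry; the 1.0.0 version is shown in the deleted `biblicus/backends/__init__.py` near the end of this diff. A defensive lookup sketch, assuming `biblicus.retrievers` re-exports the same `available_backends` and `get_backend` helpers in 1.1.0:

```python
# Resolve a retrieval backend by identifier, listing known ids on failure.
# get_backend raises KeyError for unknown ids, per the registry module shown below.
from biblicus.retrievers import available_backends, get_backend


def resolve_backend(backend_id: str):
    try:
        return get_backend(backend_id)
    except KeyError:
        known = ", ".join(sorted(available_backends()))
        raise SystemExit(f"Unknown backend '{backend_id}'. Choose one of: {known}")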
@@ -530,13 +530,13 @@ corpus/
  runs/
  extraction/
  pipeline/
- <run id>/
+ <snapshot id>/
  manifest.json
  text/
  <item id>.txt
  retrieval/
  <backend id>/
- <run id>/
+ <snapshot id>/
  manifest.json
  ```

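Given the on-disk layout in the hunk above, retrieval snapshot manifests can be enumerated with plain path globbing. A sketch, assuming the runs tree sits under the corpus's `.biblicus` directory as the analysis output path later in this README suggests:

```python
from pathlib import Path


def list_retrieval_manifests(corpus_root: str):
    # Assumed root: <corpus>/.biblicus/runs/retrieval/<backend id>/<snapshot id>/manifest.json
    runs_root = Path(corpus_root) / ".biblicus" / "runs" / "retrieval"
    for manifest in sorted(runs_root.glob("*/*/manifest.json")):
        backend_id = manifest.parent.parent.name
        snapshot_id = manifest.parent.name
        print(backend_id, snapshot_id, manifest)
```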
@@ -552,7 +552,7 @@ For detailed documentation including configuration options, performance characte

  ## Retrieval documentation

- For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
+ For the retrieval pipeline overview and snapshot artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
  (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
  and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
  script (`scripts/retrieval_evaluation_lab.py`).
@@ -615,26 +615,26 @@ See `docs/TEXT_SLICE.md` for the utility API and examples.

  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
  are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
- an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
+ an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
  optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.

  See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
  `docs/TOPIC_MODELING.md` for topic modeling details.

- Run a topic analysis using a recipe file:
+ Run a topic analysis using a configuration file:

  ```
- biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
+ biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
  ```

- If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
+ If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
  reproducibility. The analysis output is stored under:

  ```
- .biblicus/runs/analysis/topic-modeling/<run_id>/output.json
+ .biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
  ```

- Minimal recipe example:
+ Minimal configuration example:

  ```yaml
  schema_version: 1
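Once an analysis snapshot exists, its structured output can be read back from the path shown in the hunk above. The schema of `output.json` is not spelled out in this diff, so the sketch below only loads the file and lists its top-level keys; treating the path as relative to the corpus root is an assumption.

```python
import json
from pathlib import Path


def load_topic_output(corpus_root: str, snapshot_id: str) -> dict:
    # Path shown in the README hunk above: .biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
    path = Path(corpus_root) / ".biblicus" / "runs" / "analysis" / "topic-modeling" / snapshot_id / "output.json"
    data = json.loads(path.read_text())
    print(sorted(data))  # top-level keys; the exact schema is documented in docs/TOPIC_MODELING.md
    return data
```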
@@ -659,7 +659,7 @@ llm_fine_tuning:
  ```

  LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
- Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+ Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
  AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.

  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
@@ -712,6 +712,15 @@ Build the documentation:
  python -m sphinx -b html docs docs/_build/html
  ```

+ Preview the documentation locally:
+
+ ```
+ cd docs/_build/html
+ python -m http.server
+ ```
+
+ Open `http://localhost:8000` in your browser.
+
  ## License

  License terms are in `LICENSE`.
biblicus-1.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,91 @@
1
+ biblicus/__init__.py,sha256=O9FlaC1aaafCfDoI3sIsbtUsjNKJpBI6sP-RTp_kCaI,1013
2
+ biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
+ biblicus/chunking.py,sha256=GdJr0skAAI0Su99mr7dXqCgR7eJ0sJu8n2XesVGyddY,13206
4
+ biblicus/cli.py,sha256=GN7L0-s0k9tAj_lthvBrJlfo_DG9y53vYc6k_IhSea0,45797
5
+ biblicus/configuration.py,sha256=JzQU-2pzO4hY7pBw8J79Ci0Glc9cvh4KrRvzSMK2d5w,4329
6
+ biblicus/constants.py,sha256=VVjfZvdmoiCNsiQv0JVI-cA6JKXWUsvGL_IjnTxlEI8,386
7
+ biblicus/context.py,sha256=I7L86ag2AbNr_QgiP5YSt1uwwULGx1cH73eR2nE9T3g,10842
8
+ biblicus/corpus.py,sha256=D9O1Z8lQ7yFNXQDkaKR9fSTRDMSwtrYTGavh_GM7Eww,60374
9
+ biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
10
+ biblicus/embedding_providers.py,sha256=phWEsq1vryyTFRRs6uZ0sx9FhrqWIkDsS3I52I64zqM,3839
11
+ biblicus/errors.py,sha256=7fAGJbe_pCD8ygnfbTn6bNRV6pam0Vx3xjIpLrxrucg,1382
12
+ biblicus/evaluation.py,sha256=XnQKPbUcUBnELllh7cNEzvTK8EKU1Ub0q3u_sIhXB5E,8372
13
+ biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
14
+ biblicus/extraction.py,sha256=YiJqLWY3mglYokSJqA8-oIxpFBPW4Hz0TEeeNp0PtWA,20581
15
+ biblicus/extraction_evaluation.py,sha256=kFbyKcHzZK_z0OgCmQ3Olj55zgGoxin0Ir3dUA50TLI,10641
16
+ biblicus/frontmatter.py,sha256=uFC4iIrgpnTDiP1gvAnT_CbFYdNuUVtETX7tZ3a9g-Y,2517
17
+ biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
18
+ biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
19
+ biblicus/hooks.py,sha256=-ZcKZ4scK9ctas_PcseOmJJOLCkwxpnIxrACcz1qUus,7907
20
+ biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
21
+ biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
22
+ biblicus/knowledge_base.py,sha256=pDZQlihjMB7AF61LccVG21rWEAjisgRtkEcn-dymZTM,6915
23
+ biblicus/models.py,sha256=UlaqdvdqPZHd2__4Gcd4pryA_DBVgSiv86uI2AYD8Ag,16990
24
+ biblicus/retrieval.py,sha256=9RA3KGw43dBOD1EFZwt9sqcVf334UtXb1qNHUqYW6As,4646
25
+ biblicus/sources.py,sha256=FNwW1FWts0jxWIL3AHon7D6c5ZatyG9AGFqzn1Id5mE,8504
26
+ biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
27
+ biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
28
+ biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
29
+ biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
30
+ biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
31
+ biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
32
+ biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
33
+ biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
34
+ biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
35
+ biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
36
+ biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
37
+ biblicus/analysis/__init__.py,sha256=d1q11tEx3JkrOPMaiGMNCHhN9tCOTr_QpQP-tI1J2Wk,1389
38
+ biblicus/analysis/base.py,sha256=HErFLn3gv1qf9ckAUxbolHF2k9sJDNZjPjdboCMhyBE,1349
39
+ biblicus/analysis/markov.py,sha256=pLtKvt4gtsqa1CASizh8bBJ4CQW2e0wGaQ-BgdP7Pfg,63766
40
+ biblicus/analysis/models.py,sha256=dYnm5gwUzTk5HvrHZjQx4vug_TZLnXU9qN6CLIRyLng,56495
41
+ biblicus/analysis/profiling.py,sha256=IynvrgcopqFj6lMUPHS1prwd0FxN8FzIa5p3JInDFCc,11185
42
+ biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
43
+ biblicus/analysis/topic_modeling.py,sha256=dsNHuqxcEoCKO_8aDAM9yEOa0kWCjPWS2NvcQayIyXQ,22623
44
+ biblicus/context_engine/__init__.py,sha256=cIJWTUwOewW1x13a2n0YKfr4-XU0IwlVdAH_0pckfKk,1337
45
+ biblicus/context_engine/assembler.py,sha256=E7VPdqUJ9peZUoonM0Ooa1wsaklFOuLCt2IH9nFxAfM,44260
46
+ biblicus/context_engine/compaction.py,sha256=2bLaCpT48d1TL7vt9rrcRCgfdHeWWp9LX85Cgij12o0,2921
47
+ biblicus/context_engine/models.py,sha256=jesVd83ZQcatO-7yNlzwKkactSQ-e1znYuWof4rxVFg,12762
48
+ biblicus/context_engine/retrieval.py,sha256=A0w6C5uPrDY_aeGeirRkSGr6I-gU0U0cY6ElvrLhe0Q,4425
49
+ biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
50
+ biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
51
+ biblicus/extractors/deepgram_stt.py,sha256=xx_zrROGRHotF5aht23Qey9dpCnU3KHZ0unwa933Pto,6358
52
+ biblicus/extractors/docling_granite_text.py,sha256=iDHWZVgqZd86Q3Zu-fcdCq7ia00xTpbFbTU7JbDNZ38,6953
53
+ biblicus/extractors/docling_smol_text.py,sha256=qI7m93Odrjmob0RW-Yvnt5Ck4AgFcgdVjwatLQA5krI,6885
54
+ biblicus/extractors/markitdown_text.py,sha256=kYixZbVxaIyeWtpezocnrSxC3z_9KqWuBzeK8sI4s1o,4567
55
+ biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
56
+ biblicus/extractors/openai_stt.py,sha256=d2CaVhxapfkXaeI_QZcoXwdVm5Bj5YwGLdmTZDcqgTc,7197
57
+ biblicus/extractors/paddleocr_vl_text.py,sha256=2xoHA1Jviw8zzeBvHBI74Lkx4SX_vSarCe3wxvYf6c4,11794
58
+ biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
59
+ biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
60
+ biblicus/extractors/pipeline.py,sha256=qdlBBSUVNdg2V4izHacv_8a2DikCGlVMAdpkZkzNvyY,3288
61
+ biblicus/extractors/rapidocr_text.py,sha256=5adSCiOmyHiCgX3jBMcl1OiQlGzYLxmgJQzo9GHSecs,4791
62
+ biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
63
+ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_mvjehiSec,4014
64
+ biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
65
+ biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
66
+ biblicus/extractors/unstructured_text.py,sha256=WXr_fu4KQ0NODkbb05e4HrAX-trOWRKiDOmznh9_pLI,3579
67
+ biblicus/retrievers/__init__.py,sha256=LOxhUYgph1sPAeY6PmSmXH4Os4bIGGOtw88iOdI9S2k,1704
68
+ biblicus/retrievers/base.py,sha256=DSf5Ve5IFeunIyV9zt7T1vEUvSkJWO4iBj96co5F0Qo,1891
69
+ biblicus/retrievers/embedding_index_common.py,sha256=63_dUds-yIALhq2L9_0oDNpoh-_h7v2j1kexbdVw1-o,11756
70
+ biblicus/retrievers/embedding_index_file.py,sha256=mvtXqRX-_eQpi9bRxQ2yqFxY26YhP8Vn2WGcoWVtMtc,10668
71
+ biblicus/retrievers/embedding_index_inmemory.py,sha256=8csrdjCGkkR7DgvmLZ72oD1gm4duWWUnxJsjw6nLicU,10525
72
+ biblicus/retrievers/hybrid.py,sha256=kaH-kIi4wxYyUWnKNFT7UNBbHFkRtcGlwjjiJpx-TJY,11789
73
+ biblicus/retrievers/scan.py,sha256=ccDGVnqBS9a2ymKeBEXdfJz8XLahsBeYWYyYXQcg2KQ,13147
74
+ biblicus/retrievers/sqlite_full_text_search.py,sha256=7rzYfzpRhPbsKuXjXi8x2-rmq8-z1em3amUF9UPAomI,25392
75
+ biblicus/retrievers/tf_vector.py,sha256=rkcRG1GU5S_3t8GRbQTBThITj-eHT5fs1dyVzXPLg8w,15776
76
+ biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
77
+ biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
78
+ biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
79
+ biblicus/text/link.py,sha256=2IdOi3WgyBKPFau0bpS1eToV1q2v_6wq5RK5_P_qUDg,20448
80
+ biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
81
+ biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
82
+ biblicus/text/prompts.py,sha256=9dx1cWpJb6oBY4AhDHxlkRUYs7DfbySH0gb-uBTNvtk,7567
83
+ biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
84
+ biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
85
+ biblicus/text/tool_loop.py,sha256=dFeIEcCUA-yR8GMqsJ_n4007fHVmn9zK2hhlm6NlWyg,14161
86
+ biblicus-1.1.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
87
+ biblicus-1.1.0.dist-info/METADATA,sha256=8hRnC6tlf8crtWxf6FPbGANZH9lxL6kiAtOtcxqJ3Ig,31202
88
+ biblicus-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
89
+ biblicus-1.1.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
90
+ biblicus-1.1.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
91
+ biblicus-1.1.0.dist-info/RECORD,,
biblicus/backends/__init__.py DELETED
@@ -1,50 +0,0 @@
1
- """
2
- Backend registry for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from typing import Dict, Type
8
-
9
- from .base import RetrievalBackend
10
- from .embedding_index_file import EmbeddingIndexFileBackend
11
- from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
12
- from .hybrid import HybridBackend
13
- from .scan import ScanBackend
14
- from .sqlite_full_text_search import SqliteFullTextSearchBackend
15
- from .tf_vector import TfVectorBackend
16
-
17
-
18
- def available_backends() -> Dict[str, Type[RetrievalBackend]]:
19
- """
20
- Return the registered retrieval backends.
21
-
22
- :return: Mapping of backend identifiers to backend classes.
23
- :rtype: dict[str, Type[RetrievalBackend]]
24
- """
25
- return {
26
- EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
27
- EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
28
- HybridBackend.backend_id: HybridBackend,
29
- ScanBackend.backend_id: ScanBackend,
30
- SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
31
- TfVectorBackend.backend_id: TfVectorBackend,
32
- }
33
-
34
-
35
- def get_backend(backend_id: str) -> RetrievalBackend:
36
- """
37
- Instantiate a retrieval backend by identifier.
38
-
39
- :param backend_id: Backend identifier.
40
- :type backend_id: str
41
- :return: Backend instance.
42
- :rtype: RetrievalBackend
43
- :raises KeyError: If the backend identifier is unknown.
44
- """
45
- registry = available_backends()
46
- backend_class = registry.get(backend_id)
47
- if backend_class is None:
48
- known = ", ".join(sorted(registry))
49
- raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
50
- return backend_class()
biblicus/backends/base.py DELETED
@@ -1,65 +0,0 @@
1
- """
2
- Backend interface for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from abc import ABC, abstractmethod
8
- from typing import Dict
9
-
10
- from ..corpus import Corpus
11
- from ..models import QueryBudget, RetrievalResult, RetrievalRun
12
-
13
-
14
- class RetrievalBackend(ABC):
15
- """
16
- Abstract interface for retrieval backends.
17
-
18
- :ivar backend_id: Identifier string for the backend.
19
- :vartype backend_id: str
20
- """
21
-
22
- backend_id: str
23
-
24
- @abstractmethod
25
- def build_run(
26
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
27
- ) -> RetrievalRun:
28
- """
29
- Build or register a retrieval run for the backend.
30
-
31
- :param corpus: Corpus to build against.
32
- :type corpus: Corpus
33
- :param recipe_name: Human name for the recipe.
34
- :type recipe_name: str
35
- :param config: Backend-specific configuration values.
36
- :type config: dict[str, object]
37
- :return: Run manifest describing the build.
38
- :rtype: RetrievalRun
39
- """
40
- raise NotImplementedError
41
-
42
- @abstractmethod
43
- def query(
44
- self,
45
- corpus: Corpus,
46
- *,
47
- run: RetrievalRun,
48
- query_text: str,
49
- budget: QueryBudget,
50
- ) -> RetrievalResult:
51
- """
52
- Run a retrieval query against a backend.
53
-
54
- :param corpus: Corpus associated with the run.
55
- :type corpus: Corpus
56
- :param run: Run manifest to use for querying.
57
- :type run: RetrievalRun
58
- :param query_text: Query text to execute.
59
- :type query_text: str
60
- :param budget: Evidence selection budget.
61
- :type budget: QueryBudget
62
- :return: Retrieval results containing evidence.
63
- :rtype: RetrievalResult
64
- """
65
- raise NotImplementedError
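The interface removed above is small enough to implement directly. A skeletal sketch against the 1.0.0 signatures shown here; the manifest helpers and result fields follow the deleted hybrid backend further down in this diff, and the 1.1.0 counterpart lives in `biblicus/retrievers/base.py` with `recipe_name` renamed to `configuration_name`, so porting the keyword names is left to the reader.

```python
# Skeleton of a custom backend written against the removed 1.0.0 interface above.
# Helper imports mirror those used by the deleted backends/hybrid.py below.
from biblicus.backends.base import RetrievalBackend
from biblicus.models import RetrievalResult
from biblicus.retrieval import apply_budget, create_recipe_manifest, create_run_manifest
from biblicus.time import utc_now_iso


class NullBackend(RetrievalBackend):
    """Backend that registers runs but never returns evidence."""

    backend_id = "null"

    def build_run(self, corpus, *, recipe_name, config):
        recipe = create_recipe_manifest(backend_id=self.backend_id, name=recipe_name, config=dict(config))
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
        corpus.write_run(run)
        return run

    def query(self, corpus, *, run, query_text, budget):
        # Field names follow the RetrievalResult construction in the deleted hybrid backend.
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=apply_budget([], budget),
            stats={"candidates": 0, "returned": 0},
        )
```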
biblicus/backends/hybrid.py DELETED
@@ -1,292 +0,0 @@
1
- """
2
- Hybrid retrieval backend combining lexical and vector results.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from typing import Dict, List, Optional
8
-
9
- from pydantic import BaseModel, ConfigDict, Field, model_validator
10
-
11
- from ..corpus import Corpus
12
- from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
13
- from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest
14
- from ..time import utc_now_iso
15
-
16
-
17
- class HybridRecipeConfig(BaseModel):
18
- """
19
- Configuration for hybrid retrieval fusion.
20
-
21
- :ivar lexical_backend: Backend identifier for lexical retrieval.
22
- :vartype lexical_backend: str
23
- :ivar embedding_backend: Backend identifier for embedding retrieval.
24
- :vartype embedding_backend: str
25
- :ivar lexical_weight: Weight for lexical scores.
26
- :vartype lexical_weight: float
27
- :ivar embedding_weight: Weight for embedding scores.
28
- :vartype embedding_weight: float
29
- :ivar lexical_config: Optional lexical backend configuration.
30
- :vartype lexical_config: dict[str, object]
31
- :ivar embedding_config: Optional embedding backend configuration.
32
- :vartype embedding_config: dict[str, object]
33
- """
34
-
35
- model_config = ConfigDict(extra="forbid")
36
-
37
- lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
38
- embedding_backend: str = Field(default="tf-vector", min_length=1)
39
- lexical_weight: float = Field(default=0.5, ge=0, le=1)
40
- embedding_weight: float = Field(default=0.5, ge=0, le=1)
41
- lexical_config: Dict[str, object] = Field(default_factory=dict)
42
- embedding_config: Dict[str, object] = Field(default_factory=dict)
43
-
44
- @model_validator(mode="after")
45
- def _validate_weights(self) -> "HybridRecipeConfig":
46
- if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
47
- raise ValueError("weights must sum to 1")
48
- return self
49
-
50
-
51
- class HybridBackend:
52
- """
53
- Hybrid backend that fuses lexical and embedding retrieval.
54
-
55
- :ivar backend_id: Backend identifier.
56
- :vartype backend_id: str
57
- """
58
-
59
- backend_id = "hybrid"
60
-
61
- def build_run(
62
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
63
- ) -> RetrievalRun:
64
- """
65
- Build or register a hybrid retrieval run.
66
-
67
- :param corpus: Corpus to build against.
68
- :type corpus: Corpus
69
- :param recipe_name: Human-readable recipe name.
70
- :type recipe_name: str
71
- :param config: Backend-specific configuration values.
72
- :type config: dict[str, object]
73
- :return: Run manifest describing the build.
74
- :rtype: RetrievalRun
75
- """
76
- recipe_config = HybridRecipeConfig.model_validate(config)
77
- _ensure_backend_supported(recipe_config)
78
- lexical_backend = _resolve_backend(recipe_config.lexical_backend)
79
- embedding_backend = _resolve_backend(recipe_config.embedding_backend)
80
- lexical_run = lexical_backend.build_run(
81
- corpus, recipe_name=f"{recipe_name}-lexical", config=recipe_config.lexical_config
82
- )
83
- embedding_run = embedding_backend.build_run(
84
- corpus, recipe_name=f"{recipe_name}-embedding", config=recipe_config.embedding_config
85
- )
86
- recipe = create_recipe_manifest(
87
- backend_id=self.backend_id,
88
- name=recipe_name,
89
- config=recipe_config.model_dump(),
90
- )
91
- stats = {
92
- "lexical_run_id": lexical_run.run_id,
93
- "embedding_run_id": embedding_run.run_id,
94
- }
95
- run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
96
- corpus.write_run(run)
97
- return run
98
-
99
- def query(
100
- self,
101
- corpus: Corpus,
102
- *,
103
- run: RetrievalRun,
104
- query_text: str,
105
- budget: QueryBudget,
106
- ) -> RetrievalResult:
107
- """
108
- Query using both lexical and embedding backends and fuse scores.
109
-
110
- :param corpus: Corpus associated with the run.
111
- :type corpus: Corpus
112
- :param run: Run manifest to use for querying.
113
- :type run: RetrievalRun
114
- :param query_text: Query text to execute.
115
- :type query_text: str
116
- :param budget: Evidence selection budget.
117
- :type budget: QueryBudget
118
- :return: Retrieval results containing evidence.
119
- :rtype: RetrievalResult
120
- """
121
- recipe_config = HybridRecipeConfig.model_validate(run.recipe.config)
122
- _ensure_backend_supported(recipe_config)
123
- lexical_backend = _resolve_backend(recipe_config.lexical_backend)
124
- embedding_backend = _resolve_backend(recipe_config.embedding_backend)
125
- lexical_run_id = run.stats.get("lexical_run_id")
126
- embedding_run_id = run.stats.get("embedding_run_id")
127
- if not lexical_run_id or not embedding_run_id:
128
- raise ValueError("Hybrid run missing lexical or embedding run identifiers")
129
- lexical_run = corpus.load_run(str(lexical_run_id))
130
- embedding_run = corpus.load_run(str(embedding_run_id))
131
- component_budget = _expand_component_budget(budget)
132
- lexical_result = lexical_backend.query(
133
- corpus, run=lexical_run, query_text=query_text, budget=component_budget
134
- )
135
- embedding_result = embedding_backend.query(
136
- corpus, run=embedding_run, query_text=query_text, budget=component_budget
137
- )
138
- candidates = _fuse_evidence(
139
- lexical_result.evidence,
140
- embedding_result.evidence,
141
- lexical_weight=recipe_config.lexical_weight,
142
- embedding_weight=recipe_config.embedding_weight,
143
- )
144
- sorted_candidates = sorted(
145
- candidates,
146
- key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
147
- )
148
- ranked = [
149
- evidence_item.model_copy(
150
- update={
151
- "rank": index,
152
- "recipe_id": run.recipe.recipe_id,
153
- "run_id": run.run_id,
154
- }
155
- )
156
- for index, evidence_item in enumerate(sorted_candidates, start=1)
157
- ]
158
- evidence = apply_budget(ranked, budget)
159
- stats = {
160
- "candidates": len(sorted_candidates),
161
- "returned": len(evidence),
162
- "fusion_weights": {
163
- "lexical": recipe_config.lexical_weight,
164
- "embedding": recipe_config.embedding_weight,
165
- },
166
- }
167
- return RetrievalResult(
168
- query_text=query_text,
169
- budget=budget,
170
- run_id=run.run_id,
171
- recipe_id=run.recipe.recipe_id,
172
- backend_id=self.backend_id,
173
- generated_at=utc_now_iso(),
174
- evidence=evidence,
175
- stats=stats,
176
- )
177
-
178
-
179
- def _ensure_backend_supported(recipe_config: HybridRecipeConfig) -> None:
180
- """
181
- Validate that hybrid backends do not reference the hybrid backend itself.
182
-
183
- :param recipe_config: Parsed hybrid recipe configuration.
184
- :type recipe_config: HybridRecipeConfig
185
- :return: None.
186
- :rtype: None
187
- :raises ValueError: If hybrid is used as a component backend.
188
- """
189
- if recipe_config.lexical_backend == HybridBackend.backend_id:
190
- raise ValueError("Hybrid backend cannot use itself as the lexical backend")
191
- if recipe_config.embedding_backend == HybridBackend.backend_id:
192
- raise ValueError("Hybrid backend cannot use itself as the embedding backend")
193
-
194
-
195
- def _resolve_backend(backend_id: str):
196
- """
197
- Resolve a backend by identifier.
198
-
199
- :param backend_id: Backend identifier.
200
- :type backend_id: str
201
- :return: Backend instance.
202
- :rtype: object
203
- """
204
- from . import get_backend
205
-
206
- return get_backend(backend_id)
207
-
208
-
209
- def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
210
- """
211
- Expand a final budget to collect more candidates for fusion.
212
-
213
- :param budget: Final evidence budget.
214
- :type budget: QueryBudget
215
- :param multiplier: Candidate expansion multiplier.
216
- :type multiplier: int
217
- :return: Expanded budget for component backends.
218
- :rtype: QueryBudget
219
- """
220
- maximum_total_characters = budget.maximum_total_characters
221
- expanded_characters = (
222
- maximum_total_characters * multiplier if maximum_total_characters is not None else None
223
- )
224
- expanded_max_items_per_source = (
225
- budget.max_items_per_source * multiplier
226
- if budget.max_items_per_source is not None
227
- else None
228
- )
229
- requested_items = budget.max_total_items + budget.offset
230
- return QueryBudget(
231
- max_total_items=requested_items * multiplier,
232
- offset=0,
233
- maximum_total_characters=expanded_characters,
234
- max_items_per_source=expanded_max_items_per_source,
235
- )
236
-
237
-
238
- def _fuse_evidence(
239
- lexical: List[Evidence],
240
- embedding: List[Evidence],
241
- *,
242
- lexical_weight: float,
243
- embedding_weight: float,
244
- ) -> List[Evidence]:
245
- """
246
- Fuse lexical and embedding evidence lists into hybrid candidates.
247
-
248
- :param lexical: Lexical evidence list.
249
- :type lexical: list[Evidence]
250
- :param embedding: Embedding evidence list.
251
- :type embedding: list[Evidence]
252
- :param lexical_weight: Lexical score weight.
253
- :type lexical_weight: float
254
- :param embedding_weight: Embedding score weight.
255
- :type embedding_weight: float
256
- :return: Hybrid evidence list.
257
- :rtype: list[Evidence]
258
- """
259
- merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
260
- for evidence_item in lexical:
261
- merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
262
- for evidence_item in embedding:
263
- merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
264
-
265
- candidates: List[Evidence] = []
266
- for item_id, sources in merged.items():
267
- lexical_evidence = sources.get("lexical")
268
- embedding_evidence = sources.get("embedding")
269
- lexical_score = lexical_evidence.score if lexical_evidence else 0.0
270
- embedding_score = embedding_evidence.score if embedding_evidence else 0.0
271
- combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
272
- base_evidence = lexical_evidence or embedding_evidence
273
- candidates.append(
274
- Evidence(
275
- item_id=item_id,
276
- source_uri=base_evidence.source_uri,
277
- media_type=base_evidence.media_type,
278
- score=combined_score,
279
- rank=1,
280
- text=base_evidence.text,
281
- content_ref=base_evidence.content_ref,
282
- span_start=base_evidence.span_start,
283
- span_end=base_evidence.span_end,
284
- stage="hybrid",
285
- stage_scores={"lexical": lexical_score, "embedding": embedding_score},
286
- recipe_id="",
287
- run_id="",
288
- metadata=base_evidence.metadata,
289
- hash=base_evidence.hash,
290
- )
291
- )
292
- return candidates