biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/text/tool_loop.py CHANGED
@@ -5,6 +5,7 @@ Shared tool loop for virtual file edit workflows.
5
5
  from __future__ import annotations
6
6
 
7
7
  import json
8
+ import re
8
9
  from dataclasses import dataclass
9
10
  from typing import Any, Callable, Dict, List, Optional, Sequence
10
11
 
@@ -182,6 +183,18 @@ def run_tool_loop(
182
183
  last_error = "Tool loop requires non-empty old_str and new_str"
183
184
  tool_result = f"Error: {last_error}"
184
185
  else:
186
+ if old_str == new_str:
187
+ last_error = "Tool loop requires str_replace to make a change"
188
+ tool_result = f"Error: {last_error}"
189
+ had_tool_error = True
190
+ messages.append(
191
+ {
192
+ "role": "tool",
193
+ "tool_call_id": tool_call.get("id", ""),
194
+ "content": tool_result,
195
+ }
196
+ )
197
+ continue
185
198
  try:
186
199
  current_text = apply_str_replace(current_text, old_str, new_str)
187
200
  tool_result = (
@@ -214,6 +227,7 @@ def run_tool_loop(
214
227
  "content": _build_tool_error_message(
215
228
  error_message=last_error,
216
229
  current_text=current_text,
230
+ old_str=old_str if "old_str" in locals() else "",
217
231
  ),
218
232
  }
219
233
  )
@@ -260,19 +274,26 @@ def _build_retry_message(
260
274
  )
261
275
 
262
276
 
263
- def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
264
- if "not unique" in error_message:
277
+ def _build_tool_error_message(*, error_message: str, current_text: str, old_str: str) -> str:
278
+ if "found 0 matches" in error_message or "not found" in error_message:
279
+ guidance = (
280
+ "Copy the exact old_str from the current text (including punctuation/case) "
281
+ "or call view to inspect the latest text."
282
+ )
283
+ elif "found " in error_message and "matches" in error_message:
265
284
  guidance = (
266
285
  "Use a longer unique old_str by including surrounding words or punctuation "
267
286
  "so it matches exactly once."
268
287
  )
269
- elif "not found" in error_message:
288
+ elif "not unique" in error_message:
270
289
  guidance = (
271
- "Copy the exact old_str from the current text (including punctuation/case) "
272
- "or call view to inspect the latest text."
290
+ "Use a longer unique old_str by including surrounding words or punctuation "
291
+ "so it matches exactly once."
273
292
  )
274
293
  else:
275
294
  guidance = "Fix the tool call and try again."
295
+ if old_str and len(old_str) <= 3:
296
+ guidance = f"{guidance} If unsure, call view to pick a longer unique substring."
276
297
  return (
277
298
  "Your last tool call failed.\n"
278
299
  f"Error: {error_message}\n"
@@ -282,6 +303,43 @@ def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
282
303
  )
283
304
 
284
305
 
306
+ _SPAN_OPEN_PATTERN = re.compile(r"<span\b[^>]*>")
307
+ _SPAN_CLOSE_PATTERN = re.compile(r"</span>")
308
+ _SLICE_PATTERN = re.compile(r"<slice\s*/>")
309
+
310
+
311
+ def _strip_markup(text: str) -> str:
312
+ without_spans = _SPAN_CLOSE_PATTERN.sub("", _SPAN_OPEN_PATTERN.sub("", text))
313
+ return _SLICE_PATTERN.sub("", without_spans)
314
+
315
+
316
+ def apply_unique_str_replace(text: str, old_str: str, new_str: str) -> str:
317
+ """
318
+ Apply a single replacement only when old_str matches exactly once.
319
+
320
+ :param text: Current text content.
321
+ :type text: str
322
+ :param old_str: Substring to replace.
323
+ :type old_str: str
324
+ :param new_str: Replacement string.
325
+ :type new_str: str
326
+ :return: Updated text.
327
+ :rtype: str
328
+ :raises ValueError: If old_str matches zero or multiple times.
329
+ """
330
+ matches = text.count(old_str)
331
+ if matches != 1:
332
+ raise ValueError(
333
+ "Tool loop requires old_str to match exactly once " f"(found {matches} matches)"
334
+ )
335
+ if _strip_markup(old_str) != _strip_markup(new_str):
336
+ raise ValueError(
337
+ "Tool loop replacements may only insert markup tags; "
338
+ "the underlying text must stay the same"
339
+ )
340
+ return text.replace(old_str, new_str, 1)
341
+
342
+
285
343
  def _build_no_tool_calls_message(*, assistant_message: str, current_text: str) -> str:
286
344
  guidance = (
287
345
  "Use the tools to edit the text. "
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.16.0
3
+ Version: 1.1.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -80,7 +80,7 @@ See [retrieval augmented generation overview] for a short introduction to the id
80
80
  ## Analysis highlights
81
81
 
82
82
  - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
83
- - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
83
+ - YAML configurations support cascading composition plus dotted `--config key=value` overrides.
84
84
  - Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
85
85
  - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
86
86
  - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
@@ -167,7 +167,7 @@ sequenceDiagram
167
167
 
168
168
  - You can ingest raw material once, then try many retrieval approaches over time.
169
169
  - You can keep raw files readable and portable, without locking your data inside a database.
170
- - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
170
+ - You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.
171
171
 
172
172
  ## Typical flow
173
173
 
@@ -176,7 +176,7 @@ sequenceDiagram
176
176
  - Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
177
177
  - Run extraction when you want derived text artifacts from non-text sources.
178
178
  - Reindex to refresh the catalog after edits.
179
- - Build a retrieval run with a backend.
179
+ - Build a retrieval snapshot with a backend.
180
180
  - Query the run to collect evidence and evaluate it with datasets.
181
181
 
182
182
  ## Install
@@ -292,8 +292,8 @@ for note_title, note_text in notes:
292
292
  corpus.ingest_note(note_text, title=note_title, tags=["memory"])
293
293
 
294
294
  backend = get_backend("scan")
295
- run = backend.build_run(corpus, recipe_name="Story demo", config={})
296
- budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=None)
295
+ run = backend.build_run(corpus, configuration_name="Story demo", config={})
296
+ budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
297
297
  result = backend.query(
298
298
  corpus,
299
299
  run=run,
@@ -333,11 +333,11 @@ Example output:
333
333
  "query_text": "Primary button style preference",
334
334
  "budget": {
335
335
  "max_total_items": 5,
336
- "max_total_characters": 2000,
336
+ "maximum_total_characters": 2000,
337
337
  "max_items_per_source": null
338
338
  },
339
- "run_id": "RUN_ID",
340
- "recipe_id": "RECIPE_ID",
339
+ "snapshot_id": "RUN_ID",
340
+ "configuration_id": "RECIPE_ID",
341
341
  "backend_id": "scan",
342
342
  "generated_at": "2026-01-29T00:00:00.000000Z",
343
343
  "evidence": [
@@ -352,8 +352,8 @@ Example output:
352
352
  "span_start": null,
353
353
  "span_end": null,
354
354
  "stage": "scan",
355
- "recipe_id": "RECIPE_ID",
356
- "run_id": "RUN_ID",
355
+ "configuration_id": "RECIPE_ID",
356
+ "snapshot_id": "RUN_ID",
357
357
  "hash": null
358
358
  }
359
359
  ],
@@ -422,7 +422,7 @@ flowchart TB
422
422
 
423
423
  subgraph RowExtraction[Pluggable: extraction pipeline]
424
424
  direction TB
425
- Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction run manifest]
425
+ Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
426
426
  end
427
427
 
428
428
  subgraph RowRetrieval[Pluggable: retrieval backend]
@@ -484,7 +484,7 @@ From Python, the same flow is available through the Corpus class and backend int
484
484
  - Ingest notes with `Corpus.ingest_note`.
485
485
  - Ingest files or web addresses with `Corpus.ingest_source`.
486
486
  - List items with `Corpus.list_items`.
487
- - Build a retrieval run with `get_backend` and `backend.build_run`.
487
+ - Build a retrieval snapshot with `get_backend` and `backend.build_run`.
488
488
  - Query a run with `backend.query`.
489
489
  - Evaluate with `evaluate_run`.
490
490
 
@@ -530,13 +530,13 @@ corpus/
530
530
  runs/
531
531
  extraction/
532
532
  pipeline/
533
- <run id>/
533
+ <snapshot id>/
534
534
  manifest.json
535
535
  text/
536
536
  <item id>.txt
537
537
  retrieval/
538
538
  <backend id>/
539
- <run id>/
539
+ <snapshot id>/
540
540
  manifest.json
541
541
  ```
542
542
 
@@ -552,7 +552,7 @@ For detailed documentation including configuration options, performance characte
552
552
 
553
553
  ## Retrieval documentation
554
554
 
555
- For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
555
+ For the retrieval pipeline overview and snapshot artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
556
556
  (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
557
557
  and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
558
558
  script (`scripts/retrieval_evaluation_lab.py`).
@@ -615,26 +615,26 @@ See `docs/TEXT_SLICE.md` for the utility API and examples.
615
615
 
616
616
  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
617
617
  are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
618
- an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
618
+ an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
619
619
  optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
620
620
 
621
621
  See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
622
622
  `docs/TOPIC_MODELING.md` for topic modeling details.
623
623
 
624
- Run a topic analysis using a recipe file:
624
+ Run a topic analysis using a configuration file:
625
625
 
626
626
  ```
627
- biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
627
+ biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
628
628
  ```
629
629
 
630
- If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
630
+ If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
631
631
  reproducibility. The analysis output is stored under:
632
632
 
633
633
  ```
634
- .biblicus/runs/analysis/topic-modeling/<run_id>/output.json
634
+ .biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
635
635
  ```
636
636
 
637
- Minimal recipe example:
637
+ Minimal configuration example:
638
638
 
639
639
  ```yaml
640
640
  schema_version: 1
@@ -659,7 +659,7 @@ llm_fine_tuning:
659
659
  ```
660
660
 
661
661
  LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
662
- Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
662
+ Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
663
663
  AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
664
664
 
665
665
  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
@@ -712,6 +712,15 @@ Build the documentation:
712
712
  python -m sphinx -b html docs docs/_build/html
713
713
  ```
714
714
 
715
+ Preview the documentation locally:
716
+
717
+ ```
718
+ cd docs/_build/html
719
+ python -m http.server
720
+ ```
721
+
722
+ Open `http://localhost:8000` in your browser.
723
+
715
724
  ## License
716
725
 
717
726
  License terms are in `LICENSE`.
biblicus-1.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,91 @@
1
+ biblicus/__init__.py,sha256=O9FlaC1aaafCfDoI3sIsbtUsjNKJpBI6sP-RTp_kCaI,1013
2
+ biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
+ biblicus/chunking.py,sha256=GdJr0skAAI0Su99mr7dXqCgR7eJ0sJu8n2XesVGyddY,13206
4
+ biblicus/cli.py,sha256=GN7L0-s0k9tAj_lthvBrJlfo_DG9y53vYc6k_IhSea0,45797
5
+ biblicus/configuration.py,sha256=JzQU-2pzO4hY7pBw8J79Ci0Glc9cvh4KrRvzSMK2d5w,4329
6
+ biblicus/constants.py,sha256=VVjfZvdmoiCNsiQv0JVI-cA6JKXWUsvGL_IjnTxlEI8,386
7
+ biblicus/context.py,sha256=I7L86ag2AbNr_QgiP5YSt1uwwULGx1cH73eR2nE9T3g,10842
8
+ biblicus/corpus.py,sha256=D9O1Z8lQ7yFNXQDkaKR9fSTRDMSwtrYTGavh_GM7Eww,60374
9
+ biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
10
+ biblicus/embedding_providers.py,sha256=phWEsq1vryyTFRRs6uZ0sx9FhrqWIkDsS3I52I64zqM,3839
11
+ biblicus/errors.py,sha256=7fAGJbe_pCD8ygnfbTn6bNRV6pam0Vx3xjIpLrxrucg,1382
12
+ biblicus/evaluation.py,sha256=XnQKPbUcUBnELllh7cNEzvTK8EKU1Ub0q3u_sIhXB5E,8372
13
+ biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
14
+ biblicus/extraction.py,sha256=YiJqLWY3mglYokSJqA8-oIxpFBPW4Hz0TEeeNp0PtWA,20581
15
+ biblicus/extraction_evaluation.py,sha256=kFbyKcHzZK_z0OgCmQ3Olj55zgGoxin0Ir3dUA50TLI,10641
16
+ biblicus/frontmatter.py,sha256=uFC4iIrgpnTDiP1gvAnT_CbFYdNuUVtETX7tZ3a9g-Y,2517
17
+ biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
18
+ biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
19
+ biblicus/hooks.py,sha256=-ZcKZ4scK9ctas_PcseOmJJOLCkwxpnIxrACcz1qUus,7907
20
+ biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
21
+ biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
22
+ biblicus/knowledge_base.py,sha256=pDZQlihjMB7AF61LccVG21rWEAjisgRtkEcn-dymZTM,6915
23
+ biblicus/models.py,sha256=UlaqdvdqPZHd2__4Gcd4pryA_DBVgSiv86uI2AYD8Ag,16990
24
+ biblicus/retrieval.py,sha256=9RA3KGw43dBOD1EFZwt9sqcVf334UtXb1qNHUqYW6As,4646
25
+ biblicus/sources.py,sha256=FNwW1FWts0jxWIL3AHon7D6c5ZatyG9AGFqzn1Id5mE,8504
26
+ biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
27
+ biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
28
+ biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
29
+ biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
30
+ biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
31
+ biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
32
+ biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
33
+ biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
34
+ biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
35
+ biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
36
+ biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
37
+ biblicus/analysis/__init__.py,sha256=d1q11tEx3JkrOPMaiGMNCHhN9tCOTr_QpQP-tI1J2Wk,1389
38
+ biblicus/analysis/base.py,sha256=HErFLn3gv1qf9ckAUxbolHF2k9sJDNZjPjdboCMhyBE,1349
39
+ biblicus/analysis/markov.py,sha256=pLtKvt4gtsqa1CASizh8bBJ4CQW2e0wGaQ-BgdP7Pfg,63766
40
+ biblicus/analysis/models.py,sha256=dYnm5gwUzTk5HvrHZjQx4vug_TZLnXU9qN6CLIRyLng,56495
41
+ biblicus/analysis/profiling.py,sha256=IynvrgcopqFj6lMUPHS1prwd0FxN8FzIa5p3JInDFCc,11185
42
+ biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
43
+ biblicus/analysis/topic_modeling.py,sha256=dsNHuqxcEoCKO_8aDAM9yEOa0kWCjPWS2NvcQayIyXQ,22623
44
+ biblicus/context_engine/__init__.py,sha256=cIJWTUwOewW1x13a2n0YKfr4-XU0IwlVdAH_0pckfKk,1337
45
+ biblicus/context_engine/assembler.py,sha256=E7VPdqUJ9peZUoonM0Ooa1wsaklFOuLCt2IH9nFxAfM,44260
46
+ biblicus/context_engine/compaction.py,sha256=2bLaCpT48d1TL7vt9rrcRCgfdHeWWp9LX85Cgij12o0,2921
47
+ biblicus/context_engine/models.py,sha256=jesVd83ZQcatO-7yNlzwKkactSQ-e1znYuWof4rxVFg,12762
48
+ biblicus/context_engine/retrieval.py,sha256=A0w6C5uPrDY_aeGeirRkSGr6I-gU0U0cY6ElvrLhe0Q,4425
49
+ biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
50
+ biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
51
+ biblicus/extractors/deepgram_stt.py,sha256=xx_zrROGRHotF5aht23Qey9dpCnU3KHZ0unwa933Pto,6358
52
+ biblicus/extractors/docling_granite_text.py,sha256=iDHWZVgqZd86Q3Zu-fcdCq7ia00xTpbFbTU7JbDNZ38,6953
53
+ biblicus/extractors/docling_smol_text.py,sha256=qI7m93Odrjmob0RW-Yvnt5Ck4AgFcgdVjwatLQA5krI,6885
54
+ biblicus/extractors/markitdown_text.py,sha256=kYixZbVxaIyeWtpezocnrSxC3z_9KqWuBzeK8sI4s1o,4567
55
+ biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
56
+ biblicus/extractors/openai_stt.py,sha256=d2CaVhxapfkXaeI_QZcoXwdVm5Bj5YwGLdmTZDcqgTc,7197
57
+ biblicus/extractors/paddleocr_vl_text.py,sha256=2xoHA1Jviw8zzeBvHBI74Lkx4SX_vSarCe3wxvYf6c4,11794
58
+ biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
59
+ biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
60
+ biblicus/extractors/pipeline.py,sha256=qdlBBSUVNdg2V4izHacv_8a2DikCGlVMAdpkZkzNvyY,3288
61
+ biblicus/extractors/rapidocr_text.py,sha256=5adSCiOmyHiCgX3jBMcl1OiQlGzYLxmgJQzo9GHSecs,4791
62
+ biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
63
+ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_mvjehiSec,4014
64
+ biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
65
+ biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
66
+ biblicus/extractors/unstructured_text.py,sha256=WXr_fu4KQ0NODkbb05e4HrAX-trOWRKiDOmznh9_pLI,3579
67
+ biblicus/retrievers/__init__.py,sha256=LOxhUYgph1sPAeY6PmSmXH4Os4bIGGOtw88iOdI9S2k,1704
68
+ biblicus/retrievers/base.py,sha256=DSf5Ve5IFeunIyV9zt7T1vEUvSkJWO4iBj96co5F0Qo,1891
69
+ biblicus/retrievers/embedding_index_common.py,sha256=63_dUds-yIALhq2L9_0oDNpoh-_h7v2j1kexbdVw1-o,11756
70
+ biblicus/retrievers/embedding_index_file.py,sha256=mvtXqRX-_eQpi9bRxQ2yqFxY26YhP8Vn2WGcoWVtMtc,10668
71
+ biblicus/retrievers/embedding_index_inmemory.py,sha256=8csrdjCGkkR7DgvmLZ72oD1gm4duWWUnxJsjw6nLicU,10525
72
+ biblicus/retrievers/hybrid.py,sha256=kaH-kIi4wxYyUWnKNFT7UNBbHFkRtcGlwjjiJpx-TJY,11789
73
+ biblicus/retrievers/scan.py,sha256=ccDGVnqBS9a2ymKeBEXdfJz8XLahsBeYWYyYXQcg2KQ,13147
74
+ biblicus/retrievers/sqlite_full_text_search.py,sha256=7rzYfzpRhPbsKuXjXi8x2-rmq8-z1em3amUF9UPAomI,25392
75
+ biblicus/retrievers/tf_vector.py,sha256=rkcRG1GU5S_3t8GRbQTBThITj-eHT5fs1dyVzXPLg8w,15776
76
+ biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
77
+ biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
78
+ biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
79
+ biblicus/text/link.py,sha256=2IdOi3WgyBKPFau0bpS1eToV1q2v_6wq5RK5_P_qUDg,20448
80
+ biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
81
+ biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
82
+ biblicus/text/prompts.py,sha256=9dx1cWpJb6oBY4AhDHxlkRUYs7DfbySH0gb-uBTNvtk,7567
83
+ biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
84
+ biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
85
+ biblicus/text/tool_loop.py,sha256=dFeIEcCUA-yR8GMqsJ_n4007fHVmn9zK2hhlm6NlWyg,14161
86
+ biblicus-1.1.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
87
+ biblicus-1.1.0.dist-info/METADATA,sha256=8hRnC6tlf8crtWxf6FPbGANZH9lxL6kiAtOtcxqJ3Ig,31202
88
+ biblicus-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
89
+ biblicus-1.1.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
90
+ biblicus-1.1.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
91
+ biblicus-1.1.0.dist-info/RECORD,,
biblicus/backends/__init__.py DELETED
@@ -1,50 +0,0 @@
1
- """
2
- Backend registry for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from typing import Dict, Type
8
-
9
- from .base import RetrievalBackend
10
- from .embedding_index_file import EmbeddingIndexFileBackend
11
- from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
12
- from .hybrid import HybridBackend
13
- from .scan import ScanBackend
14
- from .sqlite_full_text_search import SqliteFullTextSearchBackend
15
- from .tf_vector import TfVectorBackend
16
-
17
-
18
- def available_backends() -> Dict[str, Type[RetrievalBackend]]:
19
- """
20
- Return the registered retrieval backends.
21
-
22
- :return: Mapping of backend identifiers to backend classes.
23
- :rtype: dict[str, Type[RetrievalBackend]]
24
- """
25
- return {
26
- EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
27
- EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
28
- HybridBackend.backend_id: HybridBackend,
29
- ScanBackend.backend_id: ScanBackend,
30
- SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
31
- TfVectorBackend.backend_id: TfVectorBackend,
32
- }
33
-
34
-
35
- def get_backend(backend_id: str) -> RetrievalBackend:
36
- """
37
- Instantiate a retrieval backend by identifier.
38
-
39
- :param backend_id: Backend identifier.
40
- :type backend_id: str
41
- :return: Backend instance.
42
- :rtype: RetrievalBackend
43
- :raises KeyError: If the backend identifier is unknown.
44
- """
45
- registry = available_backends()
46
- backend_class = registry.get(backend_id)
47
- if backend_class is None:
48
- known = ", ".join(sorted(registry))
49
- raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
50
- return backend_class()
biblicus/backends/base.py DELETED
@@ -1,65 +0,0 @@
1
- """
2
- Backend interface for Biblicus retrieval engines.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from abc import ABC, abstractmethod
8
- from typing import Dict
9
-
10
- from ..corpus import Corpus
11
- from ..models import QueryBudget, RetrievalResult, RetrievalRun
12
-
13
-
14
- class RetrievalBackend(ABC):
15
- """
16
- Abstract interface for retrieval backends.
17
-
18
- :ivar backend_id: Identifier string for the backend.
19
- :vartype backend_id: str
20
- """
21
-
22
- backend_id: str
23
-
24
- @abstractmethod
25
- def build_run(
26
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
27
- ) -> RetrievalRun:
28
- """
29
- Build or register a retrieval run for the backend.
30
-
31
- :param corpus: Corpus to build against.
32
- :type corpus: Corpus
33
- :param recipe_name: Human name for the recipe.
34
- :type recipe_name: str
35
- :param config: Backend-specific configuration values.
36
- :type config: dict[str, object]
37
- :return: Run manifest describing the build.
38
- :rtype: RetrievalRun
39
- """
40
- raise NotImplementedError
41
-
42
- @abstractmethod
43
- def query(
44
- self,
45
- corpus: Corpus,
46
- *,
47
- run: RetrievalRun,
48
- query_text: str,
49
- budget: QueryBudget,
50
- ) -> RetrievalResult:
51
- """
52
- Run a retrieval query against a backend.
53
-
54
- :param corpus: Corpus associated with the run.
55
- :type corpus: Corpus
56
- :param run: Run manifest to use for querying.
57
- :type run: RetrievalRun
58
- :param query_text: Query text to execute.
59
- :type query_text: str
60
- :param budget: Evidence selection budget.
61
- :type budget: QueryBudget
62
- :return: Retrieval results containing evidence.
63
- :rtype: RetrievalResult
64
- """
65
- raise NotImplementedError