biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/text/tool_loop.py
CHANGED
|
@@ -5,6 +5,7 @@ Shared tool loop for virtual file edit workflows.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
|
+
import re
|
|
8
9
|
from dataclasses import dataclass
|
|
9
10
|
from typing import Any, Callable, Dict, List, Optional, Sequence
|
|
10
11
|
|
|
@@ -182,6 +183,18 @@ def run_tool_loop(
|
|
|
182
183
|
last_error = "Tool loop requires non-empty old_str and new_str"
|
|
183
184
|
tool_result = f"Error: {last_error}"
|
|
184
185
|
else:
|
|
186
|
+
if old_str == new_str:
|
|
187
|
+
last_error = "Tool loop requires str_replace to make a change"
|
|
188
|
+
tool_result = f"Error: {last_error}"
|
|
189
|
+
had_tool_error = True
|
|
190
|
+
messages.append(
|
|
191
|
+
{
|
|
192
|
+
"role": "tool",
|
|
193
|
+
"tool_call_id": tool_call.get("id", ""),
|
|
194
|
+
"content": tool_result,
|
|
195
|
+
}
|
|
196
|
+
)
|
|
197
|
+
continue
|
|
185
198
|
try:
|
|
186
199
|
current_text = apply_str_replace(current_text, old_str, new_str)
|
|
187
200
|
tool_result = (
|
|
@@ -214,6 +227,7 @@ def run_tool_loop(
|
|
|
214
227
|
"content": _build_tool_error_message(
|
|
215
228
|
error_message=last_error,
|
|
216
229
|
current_text=current_text,
|
|
230
|
+
old_str=old_str if "old_str" in locals() else "",
|
|
217
231
|
),
|
|
218
232
|
}
|
|
219
233
|
)
|
|
@@ -260,19 +274,26 @@ def _build_retry_message(
|
|
|
260
274
|
)
|
|
261
275
|
|
|
262
276
|
|
|
263
|
-
def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
|
|
264
|
-
if "not
|
|
277
|
+
def _build_tool_error_message(*, error_message: str, current_text: str, old_str: str) -> str:
|
|
278
|
+
if "found 0 matches" in error_message or "not found" in error_message:
|
|
279
|
+
guidance = (
|
|
280
|
+
"Copy the exact old_str from the current text (including punctuation/case) "
|
|
281
|
+
"or call view to inspect the latest text."
|
|
282
|
+
)
|
|
283
|
+
elif "found " in error_message and "matches" in error_message:
|
|
265
284
|
guidance = (
|
|
266
285
|
"Use a longer unique old_str by including surrounding words or punctuation "
|
|
267
286
|
"so it matches exactly once."
|
|
268
287
|
)
|
|
269
|
-
elif "not
|
|
288
|
+
elif "not unique" in error_message:
|
|
270
289
|
guidance = (
|
|
271
|
-
"
|
|
272
|
-
"
|
|
290
|
+
"Use a longer unique old_str by including surrounding words or punctuation "
|
|
291
|
+
"so it matches exactly once."
|
|
273
292
|
)
|
|
274
293
|
else:
|
|
275
294
|
guidance = "Fix the tool call and try again."
|
|
295
|
+
if old_str and len(old_str) <= 3:
|
|
296
|
+
guidance = f"{guidance} If unsure, call view to pick a longer unique substring."
|
|
276
297
|
return (
|
|
277
298
|
"Your last tool call failed.\n"
|
|
278
299
|
f"Error: {error_message}\n"
|
|
@@ -282,6 +303,43 @@ def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
|
|
|
282
303
|
)
|
|
283
304
|
|
|
284
305
|
|
|
306
|
+
_SPAN_OPEN_PATTERN = re.compile(r"<span\b[^>]*>")
|
|
307
|
+
_SPAN_CLOSE_PATTERN = re.compile(r"</span>")
|
|
308
|
+
_SLICE_PATTERN = re.compile(r"<slice\s*/>")
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _strip_markup(text: str) -> str:
|
|
312
|
+
without_spans = _SPAN_CLOSE_PATTERN.sub("", _SPAN_OPEN_PATTERN.sub("", text))
|
|
313
|
+
return _SLICE_PATTERN.sub("", without_spans)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def apply_unique_str_replace(text: str, old_str: str, new_str: str) -> str:
|
|
317
|
+
"""
|
|
318
|
+
Apply a single replacement only when old_str matches exactly once.
|
|
319
|
+
|
|
320
|
+
:param text: Current text content.
|
|
321
|
+
:type text: str
|
|
322
|
+
:param old_str: Substring to replace.
|
|
323
|
+
:type old_str: str
|
|
324
|
+
:param new_str: Replacement string.
|
|
325
|
+
:type new_str: str
|
|
326
|
+
:return: Updated text.
|
|
327
|
+
:rtype: str
|
|
328
|
+
:raises ValueError: If old_str matches zero or multiple times.
|
|
329
|
+
"""
|
|
330
|
+
matches = text.count(old_str)
|
|
331
|
+
if matches != 1:
|
|
332
|
+
raise ValueError(
|
|
333
|
+
"Tool loop requires old_str to match exactly once " f"(found {matches} matches)"
|
|
334
|
+
)
|
|
335
|
+
if _strip_markup(old_str) != _strip_markup(new_str):
|
|
336
|
+
raise ValueError(
|
|
337
|
+
"Tool loop replacements may only insert markup tags; "
|
|
338
|
+
"the underlying text must stay the same"
|
|
339
|
+
)
|
|
340
|
+
return text.replace(old_str, new_str, 1)
|
|
341
|
+
|
|
342
|
+
|
|
285
343
|
def _build_no_tool_calls_message(*, assistant_message: str, current_text: str) -> str:
|
|
286
344
|
guidance = (
|
|
287
345
|
"Use the tools to edit the text. "
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -80,7 +80,7 @@ See [retrieval augmented generation overview] for a short introduction to the id
|
|
|
80
80
|
## Analysis highlights
|
|
81
81
|
|
|
82
82
|
- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
|
|
83
|
-
- YAML
|
|
83
|
+
- YAML configurations support cascading composition plus dotted `--config key=value` overrides.
|
|
84
84
|
- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
|
|
85
85
|
- See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
|
|
86
86
|
- See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
|
|
@@ -167,7 +167,7 @@ sequenceDiagram
|
|
|
167
167
|
|
|
168
168
|
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
169
169
|
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
170
|
-
- You can evaluate retrieval
|
|
170
|
+
- You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.
|
|
171
171
|
|
|
172
172
|
## Typical flow
|
|
173
173
|
|
|
@@ -176,7 +176,7 @@ sequenceDiagram
|
|
|
176
176
|
- Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
|
|
177
177
|
- Run extraction when you want derived text artifacts from non-text sources.
|
|
178
178
|
- Reindex to refresh the catalog after edits.
|
|
179
|
-
- Build a retrieval
|
|
179
|
+
- Build a retrieval snapshot with a backend.
|
|
180
180
|
- Query the run to collect evidence and evaluate it with datasets.
|
|
181
181
|
|
|
182
182
|
## Install
|
|
@@ -292,8 +292,8 @@ for note_title, note_text in notes:
|
|
|
292
292
|
corpus.ingest_note(note_text, title=note_title, tags=["memory"])
|
|
293
293
|
|
|
294
294
|
backend = get_backend("scan")
|
|
295
|
-
run = backend.build_run(corpus,
|
|
296
|
-
budget = QueryBudget(max_total_items=5,
|
|
295
|
+
run = backend.build_run(corpus, configuration_name="Story demo", config={})
|
|
296
|
+
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
297
297
|
result = backend.query(
|
|
298
298
|
corpus,
|
|
299
299
|
run=run,
|
|
@@ -333,11 +333,11 @@ Example output:
|
|
|
333
333
|
"query_text": "Primary button style preference",
|
|
334
334
|
"budget": {
|
|
335
335
|
"max_total_items": 5,
|
|
336
|
-
"
|
|
336
|
+
"maximum_total_characters": 2000,
|
|
337
337
|
"max_items_per_source": null
|
|
338
338
|
},
|
|
339
|
-
"
|
|
340
|
-
"
|
|
339
|
+
"snapshot_id": "RUN_ID",
|
|
340
|
+
"configuration_id": "RECIPE_ID",
|
|
341
341
|
"backend_id": "scan",
|
|
342
342
|
"generated_at": "2026-01-29T00:00:00.000000Z",
|
|
343
343
|
"evidence": [
|
|
@@ -352,8 +352,8 @@ Example output:
|
|
|
352
352
|
"span_start": null,
|
|
353
353
|
"span_end": null,
|
|
354
354
|
"stage": "scan",
|
|
355
|
-
"
|
|
356
|
-
"
|
|
355
|
+
"configuration_id": "RECIPE_ID",
|
|
356
|
+
"snapshot_id": "RUN_ID",
|
|
357
357
|
"hash": null
|
|
358
358
|
}
|
|
359
359
|
],
|
|
@@ -422,7 +422,7 @@ flowchart TB
|
|
|
422
422
|
|
|
423
423
|
subgraph RowExtraction[Pluggable: extraction pipeline]
|
|
424
424
|
direction TB
|
|
425
|
-
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction
|
|
425
|
+
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
|
|
426
426
|
end
|
|
427
427
|
|
|
428
428
|
subgraph RowRetrieval[Pluggable: retrieval backend]
|
|
@@ -484,7 +484,7 @@ From Python, the same flow is available through the Corpus class and backend int
|
|
|
484
484
|
- Ingest notes with `Corpus.ingest_note`.
|
|
485
485
|
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
486
486
|
- List items with `Corpus.list_items`.
|
|
487
|
-
- Build a retrieval
|
|
487
|
+
- Build a retrieval snapshot with `get_backend` and `backend.build_run`.
|
|
488
488
|
- Query a run with `backend.query`.
|
|
489
489
|
- Evaluate with `evaluate_run`.
|
|
490
490
|
|
|
@@ -530,13 +530,13 @@ corpus/
|
|
|
530
530
|
runs/
|
|
531
531
|
extraction/
|
|
532
532
|
pipeline/
|
|
533
|
-
<
|
|
533
|
+
<snapshot id>/
|
|
534
534
|
manifest.json
|
|
535
535
|
text/
|
|
536
536
|
<item id>.txt
|
|
537
537
|
retrieval/
|
|
538
538
|
<backend id>/
|
|
539
|
-
<
|
|
539
|
+
<snapshot id>/
|
|
540
540
|
manifest.json
|
|
541
541
|
```
|
|
542
542
|
|
|
@@ -552,7 +552,7 @@ For detailed documentation including configuration options, performance characte
|
|
|
552
552
|
|
|
553
553
|
## Retrieval documentation
|
|
554
554
|
|
|
555
|
-
For the retrieval pipeline overview and
|
|
555
|
+
For the retrieval pipeline overview and snapshot artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
|
|
556
556
|
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
|
|
557
557
|
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
|
|
558
558
|
script (`scripts/retrieval_evaluation_lab.py`).
|
|
@@ -615,26 +615,26 @@ See `docs/TEXT_SLICE.md` for the utility API and examples.
|
|
|
615
615
|
|
|
616
616
|
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
617
617
|
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
618
|
-
an extraction
|
|
618
|
+
an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
619
619
|
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
620
620
|
|
|
621
621
|
See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
|
|
622
622
|
`docs/TOPIC_MODELING.md` for topic modeling details.
|
|
623
623
|
|
|
624
|
-
Run a topic analysis using a
|
|
624
|
+
Run a topic analysis using a configuration file:
|
|
625
625
|
|
|
626
626
|
```
|
|
627
|
-
biblicus analyze topics --corpus corpora/example --
|
|
627
|
+
biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
|
|
628
628
|
```
|
|
629
629
|
|
|
630
|
-
If `--extraction-run` is omitted, Biblicus uses the most recent extraction
|
|
630
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
|
|
631
631
|
reproducibility. The analysis output is stored under:
|
|
632
632
|
|
|
633
633
|
```
|
|
634
|
-
.biblicus/runs/analysis/topic-modeling/<
|
|
634
|
+
.biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
|
|
635
635
|
```
|
|
636
636
|
|
|
637
|
-
Minimal
|
|
637
|
+
Minimal configuration example:
|
|
638
638
|
|
|
639
639
|
```yaml
|
|
640
640
|
schema_version: 1
|
|
@@ -659,7 +659,7 @@ llm_fine_tuning:
|
|
|
659
659
|
```
|
|
660
660
|
|
|
661
661
|
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
662
|
-
|
|
662
|
+
Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
663
663
|
AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
|
|
664
664
|
|
|
665
665
|
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
@@ -712,6 +712,15 @@ Build the documentation:
|
|
|
712
712
|
python -m sphinx -b html docs docs/_build/html
|
|
713
713
|
```
|
|
714
714
|
|
|
715
|
+
Preview the documentation locally:
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
cd docs/_build/html
|
|
719
|
+
python -m http.server
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
Open `http://localhost:8000` in your browser.
|
|
723
|
+
|
|
715
724
|
## License
|
|
716
725
|
|
|
717
726
|
License terms are in `LICENSE`.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
biblicus/__init__.py,sha256=O9FlaC1aaafCfDoI3sIsbtUsjNKJpBI6sP-RTp_kCaI,1013
|
|
2
|
+
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
|
+
biblicus/chunking.py,sha256=GdJr0skAAI0Su99mr7dXqCgR7eJ0sJu8n2XesVGyddY,13206
|
|
4
|
+
biblicus/cli.py,sha256=GN7L0-s0k9tAj_lthvBrJlfo_DG9y53vYc6k_IhSea0,45797
|
|
5
|
+
biblicus/configuration.py,sha256=JzQU-2pzO4hY7pBw8J79Ci0Glc9cvh4KrRvzSMK2d5w,4329
|
|
6
|
+
biblicus/constants.py,sha256=VVjfZvdmoiCNsiQv0JVI-cA6JKXWUsvGL_IjnTxlEI8,386
|
|
7
|
+
biblicus/context.py,sha256=I7L86ag2AbNr_QgiP5YSt1uwwULGx1cH73eR2nE9T3g,10842
|
|
8
|
+
biblicus/corpus.py,sha256=D9O1Z8lQ7yFNXQDkaKR9fSTRDMSwtrYTGavh_GM7Eww,60374
|
|
9
|
+
biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
|
|
10
|
+
biblicus/embedding_providers.py,sha256=phWEsq1vryyTFRRs6uZ0sx9FhrqWIkDsS3I52I64zqM,3839
|
|
11
|
+
biblicus/errors.py,sha256=7fAGJbe_pCD8ygnfbTn6bNRV6pam0Vx3xjIpLrxrucg,1382
|
|
12
|
+
biblicus/evaluation.py,sha256=XnQKPbUcUBnELllh7cNEzvTK8EKU1Ub0q3u_sIhXB5E,8372
|
|
13
|
+
biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
|
|
14
|
+
biblicus/extraction.py,sha256=YiJqLWY3mglYokSJqA8-oIxpFBPW4Hz0TEeeNp0PtWA,20581
|
|
15
|
+
biblicus/extraction_evaluation.py,sha256=kFbyKcHzZK_z0OgCmQ3Olj55zgGoxin0Ir3dUA50TLI,10641
|
|
16
|
+
biblicus/frontmatter.py,sha256=uFC4iIrgpnTDiP1gvAnT_CbFYdNuUVtETX7tZ3a9g-Y,2517
|
|
17
|
+
biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
|
|
18
|
+
biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
|
|
19
|
+
biblicus/hooks.py,sha256=-ZcKZ4scK9ctas_PcseOmJJOLCkwxpnIxrACcz1qUus,7907
|
|
20
|
+
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
21
|
+
biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
|
|
22
|
+
biblicus/knowledge_base.py,sha256=pDZQlihjMB7AF61LccVG21rWEAjisgRtkEcn-dymZTM,6915
|
|
23
|
+
biblicus/models.py,sha256=UlaqdvdqPZHd2__4Gcd4pryA_DBVgSiv86uI2AYD8Ag,16990
|
|
24
|
+
biblicus/retrieval.py,sha256=9RA3KGw43dBOD1EFZwt9sqcVf334UtXb1qNHUqYW6As,4646
|
|
25
|
+
biblicus/sources.py,sha256=FNwW1FWts0jxWIL3AHon7D6c5ZatyG9AGFqzn1Id5mE,8504
|
|
26
|
+
biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
|
|
27
|
+
biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
|
|
28
|
+
biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
|
|
29
|
+
biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
|
|
30
|
+
biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
|
|
31
|
+
biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
|
|
32
|
+
biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
|
|
33
|
+
biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
|
|
34
|
+
biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
|
|
35
|
+
biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
|
|
36
|
+
biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
|
|
37
|
+
biblicus/analysis/__init__.py,sha256=d1q11tEx3JkrOPMaiGMNCHhN9tCOTr_QpQP-tI1J2Wk,1389
|
|
38
|
+
biblicus/analysis/base.py,sha256=HErFLn3gv1qf9ckAUxbolHF2k9sJDNZjPjdboCMhyBE,1349
|
|
39
|
+
biblicus/analysis/markov.py,sha256=pLtKvt4gtsqa1CASizh8bBJ4CQW2e0wGaQ-BgdP7Pfg,63766
|
|
40
|
+
biblicus/analysis/models.py,sha256=dYnm5gwUzTk5HvrHZjQx4vug_TZLnXU9qN6CLIRyLng,56495
|
|
41
|
+
biblicus/analysis/profiling.py,sha256=IynvrgcopqFj6lMUPHS1prwd0FxN8FzIa5p3JInDFCc,11185
|
|
42
|
+
biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
|
|
43
|
+
biblicus/analysis/topic_modeling.py,sha256=dsNHuqxcEoCKO_8aDAM9yEOa0kWCjPWS2NvcQayIyXQ,22623
|
|
44
|
+
biblicus/context_engine/__init__.py,sha256=cIJWTUwOewW1x13a2n0YKfr4-XU0IwlVdAH_0pckfKk,1337
|
|
45
|
+
biblicus/context_engine/assembler.py,sha256=E7VPdqUJ9peZUoonM0Ooa1wsaklFOuLCt2IH9nFxAfM,44260
|
|
46
|
+
biblicus/context_engine/compaction.py,sha256=2bLaCpT48d1TL7vt9rrcRCgfdHeWWp9LX85Cgij12o0,2921
|
|
47
|
+
biblicus/context_engine/models.py,sha256=jesVd83ZQcatO-7yNlzwKkactSQ-e1znYuWof4rxVFg,12762
|
|
48
|
+
biblicus/context_engine/retrieval.py,sha256=A0w6C5uPrDY_aeGeirRkSGr6I-gU0U0cY6ElvrLhe0Q,4425
|
|
49
|
+
biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
|
|
50
|
+
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
51
|
+
biblicus/extractors/deepgram_stt.py,sha256=xx_zrROGRHotF5aht23Qey9dpCnU3KHZ0unwa933Pto,6358
|
|
52
|
+
biblicus/extractors/docling_granite_text.py,sha256=iDHWZVgqZd86Q3Zu-fcdCq7ia00xTpbFbTU7JbDNZ38,6953
|
|
53
|
+
biblicus/extractors/docling_smol_text.py,sha256=qI7m93Odrjmob0RW-Yvnt5Ck4AgFcgdVjwatLQA5krI,6885
|
|
54
|
+
biblicus/extractors/markitdown_text.py,sha256=kYixZbVxaIyeWtpezocnrSxC3z_9KqWuBzeK8sI4s1o,4567
|
|
55
|
+
biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
|
|
56
|
+
biblicus/extractors/openai_stt.py,sha256=d2CaVhxapfkXaeI_QZcoXwdVm5Bj5YwGLdmTZDcqgTc,7197
|
|
57
|
+
biblicus/extractors/paddleocr_vl_text.py,sha256=2xoHA1Jviw8zzeBvHBI74Lkx4SX_vSarCe3wxvYf6c4,11794
|
|
58
|
+
biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
|
|
59
|
+
biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
|
|
60
|
+
biblicus/extractors/pipeline.py,sha256=qdlBBSUVNdg2V4izHacv_8a2DikCGlVMAdpkZkzNvyY,3288
|
|
61
|
+
biblicus/extractors/rapidocr_text.py,sha256=5adSCiOmyHiCgX3jBMcl1OiQlGzYLxmgJQzo9GHSecs,4791
|
|
62
|
+
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
63
|
+
biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_mvjehiSec,4014
|
|
64
|
+
biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
|
|
65
|
+
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
66
|
+
biblicus/extractors/unstructured_text.py,sha256=WXr_fu4KQ0NODkbb05e4HrAX-trOWRKiDOmznh9_pLI,3579
|
|
67
|
+
biblicus/retrievers/__init__.py,sha256=LOxhUYgph1sPAeY6PmSmXH4Os4bIGGOtw88iOdI9S2k,1704
|
|
68
|
+
biblicus/retrievers/base.py,sha256=DSf5Ve5IFeunIyV9zt7T1vEUvSkJWO4iBj96co5F0Qo,1891
|
|
69
|
+
biblicus/retrievers/embedding_index_common.py,sha256=63_dUds-yIALhq2L9_0oDNpoh-_h7v2j1kexbdVw1-o,11756
|
|
70
|
+
biblicus/retrievers/embedding_index_file.py,sha256=mvtXqRX-_eQpi9bRxQ2yqFxY26YhP8Vn2WGcoWVtMtc,10668
|
|
71
|
+
biblicus/retrievers/embedding_index_inmemory.py,sha256=8csrdjCGkkR7DgvmLZ72oD1gm4duWWUnxJsjw6nLicU,10525
|
|
72
|
+
biblicus/retrievers/hybrid.py,sha256=kaH-kIi4wxYyUWnKNFT7UNBbHFkRtcGlwjjiJpx-TJY,11789
|
|
73
|
+
biblicus/retrievers/scan.py,sha256=ccDGVnqBS9a2ymKeBEXdfJz8XLahsBeYWYyYXQcg2KQ,13147
|
|
74
|
+
biblicus/retrievers/sqlite_full_text_search.py,sha256=7rzYfzpRhPbsKuXjXi8x2-rmq8-z1em3amUF9UPAomI,25392
|
|
75
|
+
biblicus/retrievers/tf_vector.py,sha256=rkcRG1GU5S_3t8GRbQTBThITj-eHT5fs1dyVzXPLg8w,15776
|
|
76
|
+
biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
|
|
77
|
+
biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
|
|
78
|
+
biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
|
|
79
|
+
biblicus/text/link.py,sha256=2IdOi3WgyBKPFau0bpS1eToV1q2v_6wq5RK5_P_qUDg,20448
|
|
80
|
+
biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
|
|
81
|
+
biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
|
|
82
|
+
biblicus/text/prompts.py,sha256=9dx1cWpJb6oBY4AhDHxlkRUYs7DfbySH0gb-uBTNvtk,7567
|
|
83
|
+
biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
|
|
84
|
+
biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
|
|
85
|
+
biblicus/text/tool_loop.py,sha256=dFeIEcCUA-yR8GMqsJ_n4007fHVmn9zK2hhlm6NlWyg,14161
|
|
86
|
+
biblicus-1.1.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
87
|
+
biblicus-1.1.0.dist-info/METADATA,sha256=8hRnC6tlf8crtWxf6FPbGANZH9lxL6kiAtOtcxqJ3Ig,31202
|
|
88
|
+
biblicus-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
89
|
+
biblicus-1.1.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
90
|
+
biblicus-1.1.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
91
|
+
biblicus-1.1.0.dist-info/RECORD,,
|
biblicus/backends/__init__.py
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Backend registry for Biblicus retrieval engines.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
from typing import Dict, Type
|
|
8
|
-
|
|
9
|
-
from .base import RetrievalBackend
|
|
10
|
-
from .embedding_index_file import EmbeddingIndexFileBackend
|
|
11
|
-
from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
|
|
12
|
-
from .hybrid import HybridBackend
|
|
13
|
-
from .scan import ScanBackend
|
|
14
|
-
from .sqlite_full_text_search import SqliteFullTextSearchBackend
|
|
15
|
-
from .tf_vector import TfVectorBackend
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
19
|
-
"""
|
|
20
|
-
Return the registered retrieval backends.
|
|
21
|
-
|
|
22
|
-
:return: Mapping of backend identifiers to backend classes.
|
|
23
|
-
:rtype: dict[str, Type[RetrievalBackend]]
|
|
24
|
-
"""
|
|
25
|
-
return {
|
|
26
|
-
EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
|
|
27
|
-
EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
|
|
28
|
-
HybridBackend.backend_id: HybridBackend,
|
|
29
|
-
ScanBackend.backend_id: ScanBackend,
|
|
30
|
-
SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
|
|
31
|
-
TfVectorBackend.backend_id: TfVectorBackend,
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def get_backend(backend_id: str) -> RetrievalBackend:
|
|
36
|
-
"""
|
|
37
|
-
Instantiate a retrieval backend by identifier.
|
|
38
|
-
|
|
39
|
-
:param backend_id: Backend identifier.
|
|
40
|
-
:type backend_id: str
|
|
41
|
-
:return: Backend instance.
|
|
42
|
-
:rtype: RetrievalBackend
|
|
43
|
-
:raises KeyError: If the backend identifier is unknown.
|
|
44
|
-
"""
|
|
45
|
-
registry = available_backends()
|
|
46
|
-
backend_class = registry.get(backend_id)
|
|
47
|
-
if backend_class is None:
|
|
48
|
-
known = ", ".join(sorted(registry))
|
|
49
|
-
raise KeyError(f"Unknown backend '{backend_id}'. Known backends: {known}")
|
|
50
|
-
return backend_class()
|
biblicus/backends/base.py
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Backend interface for Biblicus retrieval engines.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import Dict
|
|
9
|
-
|
|
10
|
-
from ..corpus import Corpus
|
|
11
|
-
from ..models import QueryBudget, RetrievalResult, RetrievalRun
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class RetrievalBackend(ABC):
|
|
15
|
-
"""
|
|
16
|
-
Abstract interface for retrieval backends.
|
|
17
|
-
|
|
18
|
-
:ivar backend_id: Identifier string for the backend.
|
|
19
|
-
:vartype backend_id: str
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
backend_id: str
|
|
23
|
-
|
|
24
|
-
@abstractmethod
|
|
25
|
-
def build_run(
|
|
26
|
-
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
27
|
-
) -> RetrievalRun:
|
|
28
|
-
"""
|
|
29
|
-
Build or register a retrieval run for the backend.
|
|
30
|
-
|
|
31
|
-
:param corpus: Corpus to build against.
|
|
32
|
-
:type corpus: Corpus
|
|
33
|
-
:param recipe_name: Human name for the recipe.
|
|
34
|
-
:type recipe_name: str
|
|
35
|
-
:param config: Backend-specific configuration values.
|
|
36
|
-
:type config: dict[str, object]
|
|
37
|
-
:return: Run manifest describing the build.
|
|
38
|
-
:rtype: RetrievalRun
|
|
39
|
-
"""
|
|
40
|
-
raise NotImplementedError
|
|
41
|
-
|
|
42
|
-
@abstractmethod
|
|
43
|
-
def query(
|
|
44
|
-
self,
|
|
45
|
-
corpus: Corpus,
|
|
46
|
-
*,
|
|
47
|
-
run: RetrievalRun,
|
|
48
|
-
query_text: str,
|
|
49
|
-
budget: QueryBudget,
|
|
50
|
-
) -> RetrievalResult:
|
|
51
|
-
"""
|
|
52
|
-
Run a retrieval query against a backend.
|
|
53
|
-
|
|
54
|
-
:param corpus: Corpus associated with the run.
|
|
55
|
-
:type corpus: Corpus
|
|
56
|
-
:param run: Run manifest to use for querying.
|
|
57
|
-
:type run: RetrievalRun
|
|
58
|
-
:param query_text: Query text to execute.
|
|
59
|
-
:type query_text: str
|
|
60
|
-
:param budget: Evidence selection budget.
|
|
61
|
-
:type budget: QueryBudget
|
|
62
|
-
:return: Retrieval results containing evidence.
|
|
63
|
-
:rtype: RetrievalResult
|
|
64
|
-
"""
|
|
65
|
-
raise NotImplementedError
|