biblicus 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.5.0/src/biblicus.egg-info → biblicus-0.6.0}/PKG-INFO +39 -3
- {biblicus-0.5.0 → biblicus-0.6.0}/README.md +38 -2
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/FEATURE_INDEX.md +15 -0
- biblicus-0.6.0/docs/KNOWLEDGE_BASE.md +68 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/ROADMAP.md +59 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/api.rst +4 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/index.rst +1 -0
- biblicus-0.6.0/features/knowledge_base.feature +55 -0
- biblicus-0.6.0/features/steps/knowledge_base_steps.py +90 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/pyproject.toml +1 -1
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/__init__.py +3 -1
- biblicus-0.6.0/src/biblicus/knowledge_base.py +191 -0
- {biblicus-0.5.0 → biblicus-0.6.0/src/biblicus.egg-info}/PKG-INFO +39 -3
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/SOURCES.txt +4 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/LICENSE +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/MANIFEST.in +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/BACKENDS.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/CORPUS.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/DEMOS.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/EXTRACTION.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/TESTING.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/USER_CONFIGURATION.md +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/docs/conf.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/backend_validation.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/context_pack.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/crawl.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/environment.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/error_cases.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/evaluation.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/frontmatter.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/import_tree.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/model_validation.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/python_api.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/query_processing.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/source_loading.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/cli_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/crawl_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extraction_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extractor_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/openai_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/python_api_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/rapidocr_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/user_config_steps.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/token_budget.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/features/user_config.feature +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_audio_samples.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_image_samples.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_mixed_samples.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_pdf_samples.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_wikipedia.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/readme_end_to_end_demo.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/scripts/test.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/setup.cfg +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/sqlite_full_text_search.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/cli.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/context.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/corpus.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extraction.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/__init__.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/models.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/time.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/user_config.py +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/requires.txt +0 -0
- {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -45,6 +45,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
|
|
|
45
45
|
|
|
46
46
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
47
47
|
|
|
48
|
+
## Start with a knowledge base
|
|
49
|
+
|
|
50
|
+
If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
|
|
51
|
+
|
|
52
|
+
This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
kb = KnowledgeBase.from_folder("notes")
|
|
59
|
+
result = kb.query("Primary button style preference")
|
|
60
|
+
context_pack = kb.context_pack(result, max_tokens=800)
|
|
61
|
+
|
|
62
|
+
print(context_pack.text)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
|
|
66
|
+
|
|
67
|
+
This simplified sequence diagram shows the same idea at a high level.
|
|
68
|
+
|
|
69
|
+
```mermaid
|
|
70
|
+
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
|
+
sequenceDiagram
|
|
72
|
+
participant App as Your assistant code
|
|
73
|
+
participant KB as Knowledge base
|
|
74
|
+
participant LLM as Large language model
|
|
75
|
+
|
|
76
|
+
App->>KB: query
|
|
77
|
+
KB-->>App: evidence and context
|
|
78
|
+
App->>LLM: context plus prompt
|
|
79
|
+
LLM-->>App: response draft
|
|
80
|
+
```
|
|
81
|
+
|
|
48
82
|
## A simple mental model
|
|
49
83
|
|
|
50
84
|
Think in three stages.
|
|
@@ -153,11 +187,11 @@ biblicus crawl --corpus corpora/example \\
|
|
|
153
187
|
--tag crawled
|
|
154
188
|
```
|
|
155
189
|
|
|
156
|
-
## End-to-end example:
|
|
190
|
+
## End-to-end example: lower-level control
|
|
157
191
|
|
|
158
192
|
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
159
193
|
|
|
160
|
-
|
|
194
|
+
This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
|
|
161
195
|
|
|
162
196
|
```python
|
|
163
197
|
from biblicus.backends import get_backend
|
|
@@ -383,6 +417,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
383
417
|
|
|
384
418
|
- [Corpus][corpus]
|
|
385
419
|
- [Text extraction][text-extraction]
|
|
420
|
+
- [Knowledge base][knowledge-base]
|
|
386
421
|
- [Backends][backends]
|
|
387
422
|
- [Context packs][context-packs]
|
|
388
423
|
- [Testing and evaluation][testing]
|
|
@@ -485,6 +520,7 @@ License terms are in `LICENSE`.
|
|
|
485
520
|
[roadmap]: docs/ROADMAP.md
|
|
486
521
|
[feature-index]: docs/FEATURE_INDEX.md
|
|
487
522
|
[corpus]: docs/CORPUS.md
|
|
523
|
+
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
488
524
|
[text-extraction]: docs/EXTRACTION.md
|
|
489
525
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
490
526
|
[backends]: docs/BACKENDS.md
|
|
@@ -16,6 +16,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
|
|
|
16
16
|
|
|
17
17
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
18
18
|
|
|
19
|
+
## Start with a knowledge base
|
|
20
|
+
|
|
21
|
+
If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
|
|
22
|
+
|
|
23
|
+
This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
kb = KnowledgeBase.from_folder("notes")
|
|
30
|
+
result = kb.query("Primary button style preference")
|
|
31
|
+
context_pack = kb.context_pack(result, max_tokens=800)
|
|
32
|
+
|
|
33
|
+
print(context_pack.text)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
|
|
37
|
+
|
|
38
|
+
This simplified sequence diagram shows the same idea at a high level.
|
|
39
|
+
|
|
40
|
+
```mermaid
|
|
41
|
+
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
42
|
+
sequenceDiagram
|
|
43
|
+
participant App as Your assistant code
|
|
44
|
+
participant KB as Knowledge base
|
|
45
|
+
participant LLM as Large language model
|
|
46
|
+
|
|
47
|
+
App->>KB: query
|
|
48
|
+
KB-->>App: evidence and context
|
|
49
|
+
App->>LLM: context plus prompt
|
|
50
|
+
LLM-->>App: response draft
|
|
51
|
+
```
|
|
52
|
+
|
|
19
53
|
## A simple mental model
|
|
20
54
|
|
|
21
55
|
Think in three stages.
|
|
@@ -124,11 +158,11 @@ biblicus crawl --corpus corpora/example \\
|
|
|
124
158
|
--tag crawled
|
|
125
159
|
```
|
|
126
160
|
|
|
127
|
-
## End-to-end example:
|
|
161
|
+
## End-to-end example: lower-level control
|
|
128
162
|
|
|
129
163
|
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
130
164
|
|
|
131
|
-
|
|
165
|
+
This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
|
|
132
166
|
|
|
133
167
|
```python
|
|
134
168
|
from biblicus.backends import get_backend
|
|
@@ -354,6 +388,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
354
388
|
|
|
355
389
|
- [Corpus][corpus]
|
|
356
390
|
- [Text extraction][text-extraction]
|
|
391
|
+
- [Knowledge base][knowledge-base]
|
|
357
392
|
- [Backends][backends]
|
|
358
393
|
- [Context packs][context-packs]
|
|
359
394
|
- [Testing and evaluation][testing]
|
|
@@ -456,6 +491,7 @@ License terms are in `LICENSE`.
|
|
|
456
491
|
[roadmap]: docs/ROADMAP.md
|
|
457
492
|
[feature-index]: docs/FEATURE_INDEX.md
|
|
458
493
|
[corpus]: docs/CORPUS.md
|
|
494
|
+
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
459
495
|
[text-extraction]: docs/EXTRACTION.md
|
|
460
496
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
461
497
|
[backends]: docs/BACKENDS.md
|
|
@@ -208,6 +208,21 @@ Primary implementation:
|
|
|
208
208
|
|
|
209
209
|
- `src/biblicus/context.py`
|
|
210
210
|
|
|
211
|
+
## Knowledge base
|
|
212
|
+
|
|
213
|
+
What it does:
|
|
214
|
+
|
|
215
|
+
- Provides a turnkey interface that accepts a folder and returns a ready-to-query workflow.
|
|
216
|
+
- Applies sensible defaults for import, retrieval, and context pack shaping.
|
|
217
|
+
|
|
218
|
+
Behavior specifications:
|
|
219
|
+
|
|
220
|
+
- `features/knowledge_base.feature`
|
|
221
|
+
|
|
222
|
+
Primary implementation:
|
|
223
|
+
|
|
224
|
+
- `src/biblicus/knowledge_base.py`
|
|
225
|
+
|
|
211
226
|
## Testing, coverage, and documentation build
|
|
212
227
|
|
|
213
228
|
What it does:
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Knowledge base
|
|
2
|
+
|
|
3
|
+
The knowledge base is the high‑level, turnkey workflow that makes Biblicus feel effortless. You hand it a folder. It chooses sensible defaults, builds a retrieval run, and gives you evidence you can turn into context.
|
|
4
|
+
|
|
5
|
+
This is the right layer when you want to use Biblicus without spending time on setup. You can still override the defaults later when you want fine‑grained control.
|
|
6
|
+
|
|
7
|
+
## What it does
|
|
8
|
+
|
|
9
|
+
- Creates or opens a corpus at a chosen location (or a temporary location if you do not provide one).
|
|
10
|
+
- Imports a folder tree into that corpus.
|
|
11
|
+
- Builds a retrieval run using a default backend.
|
|
12
|
+
- Exposes a simple `query` method that returns evidence.
|
|
13
|
+
- Exposes a `context_pack` helper to shape evidence into model context.
|
|
14
|
+
|
|
15
|
+
## Minimal use
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
kb = KnowledgeBase.from_folder("notes")
|
|
22
|
+
result = kb.query("Primary button style preference")
|
|
23
|
+
context_pack = kb.context_pack(result, max_tokens=800)
|
|
24
|
+
|
|
25
|
+
print(context_pack.text)
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Default behavior
|
|
29
|
+
|
|
30
|
+
The knowledge base wraps existing primitives. Defaults are explicit and deterministic.
|
|
31
|
+
|
|
32
|
+
- **Corpus**: stored on disk and fully inspectable.
|
|
33
|
+
- **Import**: uses the folder tree import, preserving relative paths.
|
|
34
|
+
- **Backend**: defaults to the `scan` backend.
|
|
35
|
+
- **Query budget**: defaults to a small, conservative evidence budget.
|
|
36
|
+
|
|
37
|
+
## Overrides
|
|
38
|
+
|
|
39
|
+
You can override the defaults when needed.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
43
|
+
from biblicus.models import QueryBudget
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
kb = KnowledgeBase.from_folder(
|
|
47
|
+
"notes",
|
|
48
|
+
backend_id="scan",
|
|
49
|
+
recipe_name="Knowledge base demo",
|
|
50
|
+
query_budget=QueryBudget(max_total_items=10, max_total_characters=4000, max_items_per_source=None),
|
|
51
|
+
tags=["memory"],
|
|
52
|
+
corpus_root="corpora/knowledge-base",
|
|
53
|
+
)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## How it relates to lower‑level control
|
|
57
|
+
|
|
58
|
+
The knowledge base is a convenience layer. It uses the same underlying parts that the lower‑level examples use.
|
|
59
|
+
|
|
60
|
+
- `Corpus` for ingestion and storage
|
|
61
|
+
- `import_tree` for folder ingestion
|
|
62
|
+
- A backend run (`scan` by default)
|
|
63
|
+
- `QueryBudget` for evidence limits
|
|
64
|
+
- `ContextPackPolicy` and token fitting for context shaping
|
|
65
|
+
|
|
66
|
+
You can always drop down to those lower‑level primitives when you need more control.
|
|
67
|
+
|
|
68
|
+
If the high‑level workflow is not enough, switch to `Corpus`, `get_backend`, and `ContextPackPolicy` directly.
|
|
@@ -46,6 +46,65 @@ Acceptance checks:
|
|
|
46
46
|
- Behavior specifications cover policy selection and budgeting behaviors.
|
|
47
47
|
- Example outputs show how context packs differ across policies.
|
|
48
48
|
|
|
49
|
+
## Next: extraction backends (OCR and document understanding)
|
|
50
|
+
|
|
51
|
+
Goal: treat optical character recognition and document understanding as pluggable extractors with consistent inputs and outputs.
|
|
52
|
+
|
|
53
|
+
Deliverables:
|
|
54
|
+
|
|
55
|
+
- A baseline OCR extractor that is fast and local for smoke tests.
|
|
56
|
+
- A higher quality OCR extractor candidate (for example: Paddle OCR or Docling OCR).
|
|
57
|
+
- A general document understanding extractor candidate (for example: Docling or Unstructured).
|
|
58
|
+
- A consistent output contract that captures text plus optional confidence and per-page metadata.
|
|
59
|
+
- A selector policy for choosing between multiple extractor outputs in a pipeline.
|
|
60
|
+
- A shared evaluation harness for extraction backends using the same corpus and dataset.
|
|
61
|
+
|
|
62
|
+
Acceptance checks:
|
|
63
|
+
|
|
64
|
+
- Behavior specifications cover extractor selection and output provenance.
|
|
65
|
+
- Evaluation reports compare accuracy, processable fraction, latency, and cost.
|
|
66
|
+
|
|
67
|
+
## Next: corpus analysis tools
|
|
68
|
+
|
|
69
|
+
Goal: provide lightweight analysis utilities that summarize corpus themes and guide curation.
|
|
70
|
+
|
|
71
|
+
Deliverables:
|
|
72
|
+
|
|
73
|
+
- A topic modeling workflow for corpus analysis (for example: BERTopic).
|
|
74
|
+
- A report that highlights dominant themes and outliers.
|
|
75
|
+
- A way to compare topic distributions across corpora or corpus snapshots.
|
|
76
|
+
|
|
77
|
+
Acceptance checks:
|
|
78
|
+
|
|
79
|
+
- Analysis is reproducible for the same corpus state.
|
|
80
|
+
- Reports are exportable and readable without custom tooling.
|
|
81
|
+
|
|
82
|
+
### Candidate backend ecosystem (for planning and evaluation)
|
|
83
|
+
|
|
84
|
+
Document understanding and OCR blur together at the interface level in Biblicus, so the roadmap treats them as extractor candidates with the same input/output contract.
|
|
85
|
+
|
|
86
|
+
Docling family candidates:
|
|
87
|
+
|
|
88
|
+
- Docling (document understanding with structured outputs)
|
|
89
|
+
- docling-ocr (OCR component in the Docling ecosystem)
|
|
90
|
+
|
|
91
|
+
General-purpose extraction candidates:
|
|
92
|
+
|
|
93
|
+
- Unstructured (element-oriented extraction for many formats)
|
|
94
|
+
- MarkItDown (lightweight conversion to Markdown)
|
|
95
|
+
- Kreuzberg (speed-focused extraction for bulk workflows)
|
|
96
|
+
- ExtractThinker (schema-driven extraction using Pydantic contracts)
|
|
97
|
+
|
|
98
|
+
Ecosystem adapters:
|
|
99
|
+
|
|
100
|
+
- LangChain document loaders (uniform loader interface across many sources)
|
|
101
|
+
|
|
102
|
+
### Guidance for choosing early targets
|
|
103
|
+
|
|
104
|
+
- If you need layout and table understanding, prioritize Docling and docling-ocr.
|
|
105
|
+
- If you need speed and simplicity, prioritize MarkItDown or Kreuzberg.
|
|
106
|
+
- If you need schema-first extraction, prioritize ExtractThinker layered on an OCR or document extractor.
|
|
107
|
+
|
|
49
108
|
## Later: alternate backends and hosting modes
|
|
50
109
|
|
|
51
110
|
Goal: broaden the backend surface while keeping the core predictable.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
Feature: Knowledge base (turnkey workflow)
|
|
2
|
+
A knowledge base is a high-level workflow that hides the plumbing while keeping behavior explicit.
|
|
3
|
+
It should accept a folder, ingest files, build defaults, and allow retrieval with minimal configuration.
|
|
4
|
+
|
|
5
|
+
Scenario: Build a knowledge base from a folder and query it
|
|
6
|
+
Given a folder "notes" exists with text files:
|
|
7
|
+
| filename | contents |
|
|
8
|
+
| note1.txt | The user's name is Tactus Maximus. |
|
|
9
|
+
| note2.txt | Primary button style preference: the user's favorite color is magenta. |
|
|
10
|
+
When I create a knowledge base from folder "notes" only
|
|
11
|
+
And I query the knowledge base for "Primary button style preference"
|
|
12
|
+
Then the knowledge base returns evidence that includes "favorite color is magenta"
|
|
13
|
+
|
|
14
|
+
Scenario: Knowledge base context pack is shaped with a token budget
|
|
15
|
+
Given a folder "notes" exists with text files:
|
|
16
|
+
| filename | contents |
|
|
17
|
+
| note1.txt | one two three |
|
|
18
|
+
| note2.txt | four five six |
|
|
19
|
+
When I create a knowledge base from folder "notes" only
|
|
20
|
+
And I query the knowledge base for "one"
|
|
21
|
+
And I build a context pack from the knowledge base query with token budget 3
|
|
22
|
+
Then the context pack text equals:
|
|
23
|
+
"""
|
|
24
|
+
one two three
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
Scenario: Knowledge base context pack defaults to no token budget
|
|
28
|
+
Given a folder "notes" exists with text files:
|
|
29
|
+
| filename | contents |
|
|
30
|
+
| note1.txt | alpha beta |
|
|
31
|
+
When I create a knowledge base from folder "notes" only
|
|
32
|
+
And I query the knowledge base for "alpha"
|
|
33
|
+
And I build a context pack from the knowledge base query without a token budget
|
|
34
|
+
Then the context pack text equals:
|
|
35
|
+
"""
|
|
36
|
+
alpha beta
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
Scenario: Knowledge base rejects missing folder
|
|
40
|
+
When I attempt to create a knowledge base from folder "missing"
|
|
41
|
+
Then the knowledge base error includes "does not exist"
|
|
42
|
+
|
|
43
|
+
Scenario: Knowledge base rejects non-folder path
|
|
44
|
+
Given a file "not-a-folder.txt" exists with contents "hello"
|
|
45
|
+
When I attempt to create a knowledge base from folder "not-a-folder.txt"
|
|
46
|
+
Then the knowledge base error includes "not a directory"
|
|
47
|
+
|
|
48
|
+
Scenario: Knowledge base can use an explicit corpus root
|
|
49
|
+
Given a folder "notes" exists with text files:
|
|
50
|
+
| filename | contents |
|
|
51
|
+
| note1.txt | alpha |
|
|
52
|
+
And a folder "kb-root" exists
|
|
53
|
+
When I create a knowledge base from folder "notes" using corpus root "kb-root"
|
|
54
|
+
And I query the knowledge base for "alpha"
|
|
55
|
+
Then the knowledge base returns evidence that includes "alpha"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from behave import given, then, when
|
|
6
|
+
|
|
7
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@given('a folder "{folder}" exists')
|
|
11
|
+
def given_folder_exists(context, folder: str) -> None:
|
|
12
|
+
root = Path(context.workdir) / folder
|
|
13
|
+
root.mkdir(parents=True, exist_ok=True)
|
|
14
|
+
context.knowledge_base_folder = root
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@given('a folder "{folder}" exists with text files:')
|
|
18
|
+
def given_folder_exists_with_text_files(context, folder: str) -> None:
|
|
19
|
+
root = Path(context.workdir) / folder
|
|
20
|
+
root.mkdir(parents=True, exist_ok=True)
|
|
21
|
+
for row in context.table:
|
|
22
|
+
filename = row["filename"]
|
|
23
|
+
contents = row["contents"]
|
|
24
|
+
path = root / filename
|
|
25
|
+
path.write_text(contents, encoding="utf-8")
|
|
26
|
+
context.knowledge_base_folder = root
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@given('a file "{filename}" exists with contents "{contents}"')
|
|
30
|
+
def given_file_exists_with_contents(context, filename: str, contents: str) -> None:
|
|
31
|
+
path = Path(context.workdir) / filename
|
|
32
|
+
path.write_text(contents, encoding="utf-8")
|
|
33
|
+
context.knowledge_base_file = path
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@when('I create a knowledge base from folder "{folder}" only')
|
|
37
|
+
def when_create_knowledge_base_from_folder(context, folder: str) -> None:
|
|
38
|
+
root = Path(context.workdir) / folder
|
|
39
|
+
context.knowledge_base = KnowledgeBase.from_folder(root)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@when('I create a knowledge base from folder "{folder}" using corpus root "{corpus_root}"')
|
|
43
|
+
def when_create_knowledge_base_from_folder_with_corpus_root(
|
|
44
|
+
context, folder: str, corpus_root: str
|
|
45
|
+
) -> None:
|
|
46
|
+
root = Path(context.workdir) / folder
|
|
47
|
+
corpus_root_path = Path(context.workdir) / corpus_root
|
|
48
|
+
context.knowledge_base = KnowledgeBase.from_folder(root, corpus_root=corpus_root_path)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@when('I attempt to create a knowledge base from folder "{folder}"')
|
|
52
|
+
def when_attempt_create_knowledge_base_from_folder(context, folder: str) -> None:
|
|
53
|
+
root = Path(context.workdir) / folder
|
|
54
|
+
try:
|
|
55
|
+
KnowledgeBase.from_folder(root)
|
|
56
|
+
except (FileNotFoundError, NotADirectoryError) as exc:
|
|
57
|
+
context.knowledge_base_error = exc
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@then('the knowledge base error includes "{text}"')
|
|
61
|
+
def then_knowledge_base_error_includes(context, text: str) -> None:
|
|
62
|
+
error = context.knowledge_base_error
|
|
63
|
+
assert text in str(error)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@when('I query the knowledge base for "{query_text}"')
|
|
67
|
+
def when_query_knowledge_base(context, query_text: str) -> None:
|
|
68
|
+
context.knowledge_base_result = context.knowledge_base.query(query_text)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@when("I build a context pack from the knowledge base query with token budget {max_tokens:d}")
|
|
72
|
+
def when_build_context_pack_from_knowledge_base_query(context, max_tokens: int) -> None:
|
|
73
|
+
context.context_pack = context.knowledge_base.context_pack(
|
|
74
|
+
context.knowledge_base_result,
|
|
75
|
+
max_tokens=max_tokens,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@when("I build a context pack from the knowledge base query without a token budget")
|
|
80
|
+
def when_build_context_pack_from_knowledge_base_query_without_budget(context) -> None:
|
|
81
|
+
context.context_pack = context.knowledge_base.context_pack(
|
|
82
|
+
context.knowledge_base_result,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@then('the knowledge base returns evidence that includes "{text}"')
|
|
87
|
+
def then_knowledge_base_returns_evidence_that_includes(context, text: str) -> None:
|
|
88
|
+
evidence_items = context.knowledge_base_result.evidence
|
|
89
|
+
evidence_texts = [item.text or "" for item in evidence_items]
|
|
90
|
+
assert any(text in evidence_text for evidence_text in evidence_texts)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "biblicus"
|
|
7
|
-
version = "0.5.0"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "Command line interface and Python library for corpus ingestion, retrieval, and evaluation."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -3,6 +3,7 @@ Biblicus public package interface.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from .corpus import Corpus
|
|
6
|
+
from .knowledge_base import KnowledgeBase
|
|
6
7
|
from .models import (
|
|
7
8
|
CorpusConfig,
|
|
8
9
|
Evidence,
|
|
@@ -19,10 +20,11 @@ __all__ = [
|
|
|
19
20
|
"CorpusConfig",
|
|
20
21
|
"Evidence",
|
|
21
22
|
"IngestResult",
|
|
23
|
+
"KnowledgeBase",
|
|
22
24
|
"QueryBudget",
|
|
23
25
|
"RecipeManifest",
|
|
24
26
|
"RetrievalResult",
|
|
25
27
|
"RetrievalRun",
|
|
26
28
|
]
|
|
27
29
|
|
|
28
|
-
__version__ = "0.5.0"
|
|
30
|
+
__version__ = "0.6.0"
|