biblicus 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus-0.1.1/LICENSE +21 -0
- biblicus-0.1.1/MANIFEST.in +19 -0
- biblicus-0.1.1/PKG-INFO +174 -0
- biblicus-0.1.1/README.md +154 -0
- biblicus-0.1.1/datasets/wikipedia_mini.json +37 -0
- biblicus-0.1.1/docs/ARCHITECTURE.md +179 -0
- biblicus-0.1.1/docs/BACKENDS.md +36 -0
- biblicus-0.1.1/docs/api.rst +32 -0
- biblicus-0.1.1/docs/conf.py +31 -0
- biblicus-0.1.1/docs/index.rst +14 -0
- biblicus-0.1.1/features/backend_validation.feature +14 -0
- biblicus-0.1.1/features/biblicus_corpus.feature +99 -0
- biblicus-0.1.1/features/cli_entrypoint.feature +6 -0
- biblicus-0.1.1/features/cli_parsing.feature +21 -0
- biblicus-0.1.1/features/corpus_edge_cases.feature +133 -0
- biblicus-0.1.1/features/corpus_identity.feature +14 -0
- biblicus-0.1.1/features/corpus_purge.feature +31 -0
- biblicus-0.1.1/features/environment.py +154 -0
- biblicus-0.1.1/features/error_cases.feature +170 -0
- biblicus-0.1.1/features/evaluation.feature +80 -0
- biblicus-0.1.1/features/frontmatter.feature +16 -0
- biblicus-0.1.1/features/ingest_sources.feature +24 -0
- biblicus-0.1.1/features/integration_wikipedia.feature +7 -0
- biblicus-0.1.1/features/model_validation.feature +6 -0
- biblicus-0.1.1/features/python_api.feature +57 -0
- biblicus-0.1.1/features/retrieval_budget.feature +7 -0
- biblicus-0.1.1/features/retrieval_scan.feature +77 -0
- biblicus-0.1.1/features/retrieval_sqlite_full_text_search.feature +59 -0
- biblicus-0.1.1/features/retrieval_utilities.feature +43 -0
- biblicus-0.1.1/features/steps/backend_steps.py +124 -0
- biblicus-0.1.1/features/steps/cli_parsing_steps.py +57 -0
- biblicus-0.1.1/features/steps/cli_steps.py +661 -0
- biblicus-0.1.1/features/steps/frontmatter_steps.py +50 -0
- biblicus-0.1.1/features/steps/model_steps.py +34 -0
- biblicus-0.1.1/features/steps/python_api_steps.py +196 -0
- biblicus-0.1.1/features/steps/retrieval_steps.py +485 -0
- biblicus-0.1.1/pyproject.toml +71 -0
- biblicus-0.1.1/scripts/download_wikipedia.py +138 -0
- biblicus-0.1.1/scripts/test.py +78 -0
- biblicus-0.1.1/setup.cfg +4 -0
- biblicus-0.1.1/src/biblicus/__init__.py +28 -0
- biblicus-0.1.1/src/biblicus/__main__.py +8 -0
- biblicus-0.1.1/src/biblicus/backends/__init__.py +44 -0
- biblicus-0.1.1/src/biblicus/backends/base.py +65 -0
- biblicus-0.1.1/src/biblicus/backends/scan.py +292 -0
- biblicus-0.1.1/src/biblicus/backends/sqlite_full_text_search.py +427 -0
- biblicus-0.1.1/src/biblicus/cli.py +468 -0
- biblicus-0.1.1/src/biblicus/constants.py +10 -0
- biblicus-0.1.1/src/biblicus/corpus.py +952 -0
- biblicus-0.1.1/src/biblicus/evaluation.py +261 -0
- biblicus-0.1.1/src/biblicus/frontmatter.py +92 -0
- biblicus-0.1.1/src/biblicus/models.py +307 -0
- biblicus-0.1.1/src/biblicus/retrieval.py +137 -0
- biblicus-0.1.1/src/biblicus/sources.py +132 -0
- biblicus-0.1.1/src/biblicus/time.py +18 -0
- biblicus-0.1.1/src/biblicus/uris.py +64 -0
- biblicus-0.1.1/src/biblicus.egg-info/PKG-INFO +174 -0
- biblicus-0.1.1/src/biblicus.egg-info/SOURCES.txt +60 -0
- biblicus-0.1.1/src/biblicus.egg-info/dependency_links.txt +1 -0
- biblicus-0.1.1/src/biblicus.egg-info/entry_points.txt +2 -0
- biblicus-0.1.1/src/biblicus.egg-info/requires.txt +11 -0
- biblicus-0.1.1/src/biblicus.egg-info/top_level.txt +1 -0
biblicus-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Biblicus Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
|
|
5
|
+
recursive-include src *.py
|
|
6
|
+
recursive-include docs *.rst *.md *.py
|
|
7
|
+
recursive-include features *.feature *.py
|
|
8
|
+
recursive-include scripts *.py
|
|
9
|
+
recursive-include datasets *.json
|
|
10
|
+
|
|
11
|
+
prune corpora
|
|
12
|
+
prune reports
|
|
13
|
+
prune docs/_build
|
|
14
|
+
|
|
15
|
+
global-exclude *.pyc
|
|
16
|
+
global-exclude *.pyo
|
|
17
|
+
global-exclude __pycache__/*
|
|
18
|
+
global-exclude .DS_Store
|
|
19
|
+
global-exclude .coverage
|
biblicus-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: biblicus
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: PyYAML>=6.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
13
|
+
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
14
|
+
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
15
|
+
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
16
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
17
|
+
Requires-Dist: black>=24.0; extra == "dev"
|
|
18
|
+
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# Biblicus
|
|
22
|
+
|
|
23
|
+
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
24
|
+
|
|
25
|
+
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
26
|
+
|
|
27
|
+
The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
|
|
28
|
+
|
|
29
|
+
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
30
|
+
|
|
31
|
+
It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
|
|
32
|
+
|
|
33
|
+
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
34
|
+
|
|
35
|
+
## The framework
|
|
36
|
+
|
|
37
|
+
The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
|
|
38
|
+
|
|
39
|
+
- Corpus is the folder that holds raw items and their metadata.
|
|
40
|
+
- Item is the raw bytes of a document or other artifact, plus its source.
|
|
41
|
+
- Catalog is the rebuildable index of the corpus.
|
|
42
|
+
- Evidence is what retrieval returns, ready to be turned into context for a large language model.
|
|
43
|
+
- Run is a recorded retrieval build for a corpus.
|
|
44
|
+
- Backend is a pluggable retrieval implementation.
|
|
45
|
+
- Recipe is a named configuration for a backend.
|
|
46
|
+
- Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
|
|
47
|
+
|
|
48
|
+
## Practical value
|
|
49
|
+
|
|
50
|
+
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
51
|
+
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
52
|
+
- You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
|
|
53
|
+
|
|
54
|
+
## Typical flow
|
|
55
|
+
|
|
56
|
+
- Initialize a corpus folder.
|
|
57
|
+
- Ingest items from file paths, web addresses, or text input.
|
|
58
|
+
- Reindex to refresh the catalog after edits.
|
|
59
|
+
- Build a retrieval run with a backend.
|
|
60
|
+
- Query the run to collect evidence and evaluate it with datasets.
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
This repository is a working Python package. Install it into a virtual environment from the repository root.
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
python3 -m pip install -e .
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
After the first release, you can install it from the Python Package Index.
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
python3 -m pip install biblicus
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Quick start
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
biblicus init corpora/example
|
|
80
|
+
biblicus ingest --corpus corpora/example notes/example.txt
|
|
81
|
+
echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
|
|
82
|
+
biblicus list --corpus corpora/example
|
|
83
|
+
biblicus build --corpus corpora/example --backend scan
|
|
84
|
+
biblicus query --corpus corpora/example --query "note"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Python usage
|
|
88
|
+
|
|
89
|
+
From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
|
|
90
|
+
|
|
91
|
+
- Create a corpus with `Corpus.init` or open one with `Corpus.open`.
|
|
92
|
+
- Ingest notes with `Corpus.ingest_note`.
|
|
93
|
+
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
94
|
+
- List items with `Corpus.list_items`.
|
|
95
|
+
- Build a retrieval run with `get_backend` and `backend.build_run`.
|
|
96
|
+
- Query a run with `backend.query`.
|
|
97
|
+
- Evaluate with `evaluate_run`.
|
|
98
|
+
|
|
99
|
+
## How it fits into an assistant
|
|
100
|
+
|
|
101
|
+
In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
|
|
102
|
+
|
|
103
|
+
- Use a corpus as the source of truth for raw items.
|
|
104
|
+
- Use a backend run to build any derived artifacts needed for retrieval.
|
|
105
|
+
- Use queries to obtain evidence objects.
|
|
106
|
+
- Convert evidence into the format your framework expects, such as message content, tool output, or citations.
|
|
107
|
+
|
|
108
|
+
## Learn more
|
|
109
|
+
|
|
110
|
+
The documents below are written to be read in order.
|
|
111
|
+
|
|
112
|
+
- [Architecture][architecture]
|
|
113
|
+
- [Backends][backends]
|
|
114
|
+
|
|
115
|
+
## Metadata and catalog
|
|
116
|
+
|
|
117
|
+
Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
|
|
118
|
+
|
|
119
|
+
## Corpus layout
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
corpus/
|
|
123
|
+
raw/
|
|
124
|
+
item.bin
|
|
125
|
+
item.bin.biblicus.yml
|
|
126
|
+
.biblicus/
|
|
127
|
+
config.json
|
|
128
|
+
catalog.json
|
|
129
|
+
runs/
|
|
130
|
+
run-id.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Retrieval backends
|
|
134
|
+
|
|
135
|
+
Two backends are included.
|
|
136
|
+
|
|
137
|
+
- `scan` is a minimal baseline that scans raw items directly.
|
|
138
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
139
|
+
|
|
140
|
+
## Integration corpus and evaluation dataset
|
|
141
|
+
|
|
142
|
+
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
143
|
+
|
|
144
|
+
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
145
|
+
|
|
146
|
+
## Tests and coverage
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
python3 scripts/test.py
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Releases
|
|
153
|
+
|
|
154
|
+
Releases are automated from the main branch using semantic versioning and conventional commit messages.
|
|
155
|
+
|
|
156
|
+
The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
|
|
157
|
+
|
|
158
|
+
Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
|
|
159
|
+
|
|
160
|
+
## Documentation
|
|
161
|
+
|
|
162
|
+
Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
sphinx-build -b html docs docs/_build
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
License terms are in `LICENSE`.
|
|
171
|
+
|
|
172
|
+
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
173
|
+
[architecture]: docs/ARCHITECTURE.md
|
|
174
|
+
[backends]: docs/BACKENDS.md
|
biblicus-0.1.1/README.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Biblicus
|
|
2
|
+
|
|
3
|
+
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
4
|
+
|
|
5
|
+
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
6
|
+
|
|
7
|
+
The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
|
|
8
|
+
|
|
9
|
+
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
10
|
+
|
|
11
|
+
It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
|
|
12
|
+
|
|
13
|
+
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
14
|
+
|
|
15
|
+
## The framework
|
|
16
|
+
|
|
17
|
+
The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
|
|
18
|
+
|
|
19
|
+
- Corpus is the folder that holds raw items and their metadata.
|
|
20
|
+
- Item is the raw bytes of a document or other artifact, plus its source.
|
|
21
|
+
- Catalog is the rebuildable index of the corpus.
|
|
22
|
+
- Evidence is what retrieval returns, ready to be turned into context for a large language model.
|
|
23
|
+
- Run is a recorded retrieval build for a corpus.
|
|
24
|
+
- Backend is a pluggable retrieval implementation.
|
|
25
|
+
- Recipe is a named configuration for a backend.
|
|
26
|
+
- Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
|
|
27
|
+
|
|
28
|
+
## Practical value
|
|
29
|
+
|
|
30
|
+
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
31
|
+
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
32
|
+
- You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
|
|
33
|
+
|
|
34
|
+
## Typical flow
|
|
35
|
+
|
|
36
|
+
- Initialize a corpus folder.
|
|
37
|
+
- Ingest items from file paths, web addresses, or text input.
|
|
38
|
+
- Reindex to refresh the catalog after edits.
|
|
39
|
+
- Build a retrieval run with a backend.
|
|
40
|
+
- Query the run to collect evidence and evaluate it with datasets.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
This repository is a working Python package. Install it into a virtual environment from the repository root.
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
python3 -m pip install -e .
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
After the first release, you can install it from the Python Package Index.
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
python3 -m pip install biblicus
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quick start
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
biblicus init corpora/example
|
|
60
|
+
biblicus ingest --corpus corpora/example notes/example.txt
|
|
61
|
+
echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
|
|
62
|
+
biblicus list --corpus corpora/example
|
|
63
|
+
biblicus build --corpus corpora/example --backend scan
|
|
64
|
+
biblicus query --corpus corpora/example --query "note"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Python usage
|
|
68
|
+
|
|
69
|
+
From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
|
|
70
|
+
|
|
71
|
+
- Create a corpus with `Corpus.init` or open one with `Corpus.open`.
|
|
72
|
+
- Ingest notes with `Corpus.ingest_note`.
|
|
73
|
+
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
74
|
+
- List items with `Corpus.list_items`.
|
|
75
|
+
- Build a retrieval run with `get_backend` and `backend.build_run`.
|
|
76
|
+
- Query a run with `backend.query`.
|
|
77
|
+
- Evaluate with `evaluate_run`.
|
|
78
|
+
|
|
79
|
+
## How it fits into an assistant
|
|
80
|
+
|
|
81
|
+
In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
|
|
82
|
+
|
|
83
|
+
- Use a corpus as the source of truth for raw items.
|
|
84
|
+
- Use a backend run to build any derived artifacts needed for retrieval.
|
|
85
|
+
- Use queries to obtain evidence objects.
|
|
86
|
+
- Convert evidence into the format your framework expects, such as message content, tool output, or citations.
|
|
87
|
+
|
|
88
|
+
## Learn more
|
|
89
|
+
|
|
90
|
+
The documents below are written to be read in order.
|
|
91
|
+
|
|
92
|
+
- [Architecture][architecture]
|
|
93
|
+
- [Backends][backends]
|
|
94
|
+
|
|
95
|
+
## Metadata and catalog
|
|
96
|
+
|
|
97
|
+
Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
|
|
98
|
+
|
|
99
|
+
## Corpus layout
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
corpus/
|
|
103
|
+
raw/
|
|
104
|
+
item.bin
|
|
105
|
+
item.bin.biblicus.yml
|
|
106
|
+
.biblicus/
|
|
107
|
+
config.json
|
|
108
|
+
catalog.json
|
|
109
|
+
runs/
|
|
110
|
+
run-id.json
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Retrieval backends
|
|
114
|
+
|
|
115
|
+
Two backends are included.
|
|
116
|
+
|
|
117
|
+
- `scan` is a minimal baseline that scans raw items directly.
|
|
118
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
119
|
+
|
|
120
|
+
## Integration corpus and evaluation dataset
|
|
121
|
+
|
|
122
|
+
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
123
|
+
|
|
124
|
+
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
125
|
+
|
|
126
|
+
## Tests and coverage
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
python3 scripts/test.py
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Releases
|
|
133
|
+
|
|
134
|
+
Releases are automated from the main branch using semantic versioning and conventional commit messages.
|
|
135
|
+
|
|
136
|
+
The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
|
|
137
|
+
|
|
138
|
+
Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
|
|
139
|
+
|
|
140
|
+
## Documentation
|
|
141
|
+
|
|
142
|
+
Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
sphinx-build -b html docs docs/_build
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
License terms are in `LICENSE`.
|
|
151
|
+
|
|
152
|
+
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
153
|
+
[architecture]: docs/ARCHITECTURE.md
|
|
154
|
+
[backends]: docs/BACKENDS.md
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": 1,
|
|
3
|
+
"name": "wikipedia-mini",
|
|
4
|
+
"description": "Small evaluation set aligned with the Wikipedia integration corpus.",
|
|
5
|
+
"queries": [
|
|
6
|
+
{
|
|
7
|
+
"query_id": "q1",
|
|
8
|
+
"query_text": "mathematician who worked on the Analytical Engine",
|
|
9
|
+
"expected_source_uri": "https://en.wikipedia.org/wiki/Ada_Lovelace",
|
|
10
|
+
"kind": "gold"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"query_id": "q2",
|
|
14
|
+
"query_text": "pioneer of computer programming in the US Navy",
|
|
15
|
+
"expected_source_uri": "https://en.wikipedia.org/wiki/Grace_Hopper",
|
|
16
|
+
"kind": "gold"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"query_id": "q3",
|
|
20
|
+
"query_text": "proposed the Church-Turing thesis",
|
|
21
|
+
"expected_source_uri": "https://en.wikipedia.org/wiki/Alan_Turing",
|
|
22
|
+
"kind": "gold"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"query_id": "q4",
|
|
26
|
+
"query_text": "invented information theory",
|
|
27
|
+
"expected_source_uri": "https://en.wikipedia.org/wiki/Claude_Shannon",
|
|
28
|
+
"kind": "gold"
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"query_id": "q5",
|
|
32
|
+
"query_text": "coined the term artificial intelligence",
|
|
33
|
+
"expected_source_uri": "https://en.wikipedia.org/wiki/John_McCarthy_(computer_scientist)",
|
|
34
|
+
"kind": "synthetic"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Biblicus Architecture
|
|
2
|
+
|
|
3
|
+
Biblicus is a command-line-interface-first **corpus** manager for ingesting, curating, and evaluating corpora used by assistant systems. The early goal is to make it easy to add raw, unstructured content, while keeping the system structured enough to support reproducible experiments.
|
|
4
|
+
|
|
5
|
+
## How we design
|
|
6
|
+
|
|
7
|
+
Design starts from strict behavior-driven development:
|
|
8
|
+
- The authoritative description of behavior lives in `features/*.feature`.
|
|
9
|
+
- All changes should follow specification-first behavior-driven development: failing scenario, implementation, passing scenario, then refactor.
|
|
10
|
+
- Behavior-driven development scenarios are not an afterthought: they are how we keep the domain vocabulary consistent and the platform comparable across backends and recipes.
|
|
11
|
+
- **Specification completeness** is mandatory: if behavior exists, it must be specified. Ambiguous or untestable behavior should be removed or turned into an explicit error.
|
|
12
|
+
|
|
13
|
+
## Domain-specific cognitive framework
|
|
14
|
+
|
|
15
|
+
The domain-specific cognitive framework is the set of **stable nouns**, **verbs**, and **invariants** that make Biblicus pleasant to use over time.
|
|
16
|
+
We prefer a small set of universal concepts with strict semantics over a large set of ad-hoc flags.
|
|
17
|
+
|
|
18
|
+
We also treat **Pydantic models** as the canonical way to codify and validate these constructs at boundaries.
|
|
19
|
+
|
|
20
|
+
## The Python developer mental model
|
|
21
|
+
|
|
22
|
+
If this system is pleasant to use, a Python developer should be able to describe intent with the core nouns:
|
|
23
|
+
|
|
24
|
+
- I have a **corpus** at this path or uniform resource identifier.
|
|
25
|
+
- I ingest an **item** with optional **metadata**.
|
|
26
|
+
- I rebuild the derived **index** after edits.
|
|
27
|
+
- I run a **recipe** against the same corpus.
|
|
28
|
+
- I query and receive **evidence**.
|
|
29
|
+
|
|
30
|
+
Anything that does not map cleanly to these nouns is either a derived helper or a backend-specific implementation detail that should not leak.
|
|
31
|
+
|
|
32
|
+
## Relationship to agent frameworks
|
|
33
|
+
|
|
34
|
+
Biblicus is designed to integrate with agent frameworks through explicit tools and clear application programming interfaces. Tactus is one target environment with strong isolation requirements.
|
|
35
|
+
|
|
36
|
+
- **Tools and toolsets**, including the Model Context Protocol, are the primary capability boundary.
|
|
37
|
+
- **Sandboxing and brokered or secretless execution** are primary deployment modes.
|
|
38
|
+
- **Durability and evaluations** are central: invariants via specifications, quality via evaluations.
|
|
39
|
+
|
|
40
|
+
## Core concepts
|
|
41
|
+
|
|
42
|
+
### Concepts
|
|
43
|
+
|
|
44
|
+
- **Corpus**: a named, mutable collection rooted at a path or uniform resource identifier. In version zero it is typically a local folder containing raw files plus a `.biblicus/` directory for minimal metadata.
|
|
45
|
+
- **Item**: the unit of ingestion in a corpus: raw bytes of any modality, including text, images, Portable Document Format documents, audio, and video, plus optional metadata and provenance.
|
|
46
|
+
- **Knowledge base backend**: an implementation that can ingest and retrieve from a corpus, such as scan, full text search, vector retrieval, or hybrid retrieval, exposed to procedures through retrieval primitives.
|
|
47
|
+
- **Retrieval recipe**: a named configuration bundle for a backend, such as chunking rules, embedding model and version, hybrid weights, reranker choice, and filters. This is what we benchmark and compare.
|
|
48
|
+
- **Recipe manifest**: a reproducibility record describing the backend and recipe parameters, plus any referenced materializations and build runs.
|
|
49
|
+
- **Materialization**: an optional, persisted representation derived from raw content for a given recipe and backend, such as chunks, embeddings, or indexes. Some backends intentionally have none and operate on demand.
|
|
50
|
+
- **Evidence**: structured retrieval output from backend queries. Evidence includes spans, scores, and provenance used by downstream retrieval augmented generation procedures.
|
|
51
|
+
- **Pipeline stage / editorial layer**: a structured step that transforms, filters, extracts, or curates content. Examples include the raw, curated, and published editorial layers, or a stage that extracts text from Portable Document Format documents.
|
|
52
|
+
|
|
53
|
+
## Design principles
|
|
54
|
+
|
|
55
|
+
- **Primitives + derived constructs**: keep the protocol surface small and composable; ship higher-level helpers and example procedures on top.
|
|
56
|
+
- **Minimal opinion raw store**: raw ingestion should work for a folder of files with optional lightweight tagging.
|
|
57
|
+
- **Reproducibility by default**: comparisons require manifests (even when there are no persisted materializations).
|
|
58
|
+
- **Mutability is real**: corpora are edited, pruned, and reorganized; re-indexing must be a core workflow.
|
|
59
|
+
- **Separation of concerns**: retrieval returns evidence; retrieval-augmented generation patterns live in Tactus procedures (not inside the knowledge base backend).
|
|
60
|
+
- **Deployment flexibility**: same interface across local/offline, brokered external services, and hybrid environments.
|
|
61
|
+
- **Evidence is the primary output**: every retrieval returns structured evidence; everything else is a derived helper.
|
|
62
|
+
|
|
63
|
+
## Locked decisions (version zero)
|
|
64
|
+
|
|
65
|
+
These are explicit, opinionated policies encoded into the project:
|
|
66
|
+
|
|
67
|
+
- **Evidence schema strictness**: moderate-to-strong schema. Evidence must include stable identifiers, provenance, and retrieval scores; richer fields (spans, stage, recipe and run identifiers) are expected.
|
|
68
|
+
- **Retrieval stages**: multi-stage is explicit (retrieve, rerank, then filter). Pipelines are expressed through evidence metadata rather than hard-coded backends.
|
|
69
|
+
- **Corpus versioning**: snapshot or reindex runs are versioned; full directed acyclic graph lineage is deferred.
|
|
70
|
+
- **Evaluation datasets**: mixed human-labeled and synthetic questions; human-labeled for truth, synthetic for scale.
|
|
71
|
+
- **Baseline retriever**: hybrid is the strategic target, but the first reference backend is deterministic lexical.
|
|
72
|
+
- **Context budgeting**: evidence selection is governed by budgets (token, unit, and per-source limits), not a fixed count.
|
|
73
|
+
|
|
74
|
+
## Evidence schema (version zero)
|
|
75
|
+
|
|
76
|
+
Evidence is the canonical output of retrieval. Required fields:
|
|
77
|
+
|
|
78
|
+
- `item_id`, `source_uri`, `media_type`
|
|
79
|
+
- `score` and `rank`
|
|
80
|
+
- `text` (or `content_ref` when non-text)
|
|
81
|
+
- `stage` (for example, `scan`, `full-text-search`, `rerank`)
|
|
82
|
+
- `recipe_id` / `run_id` (for reproducibility)
|
|
83
|
+
- Optional: `span_start`, `span_end`, `hash`
|
|
84
|
+
|
|
85
|
+
## Architectural policies (version zero)
|
|
86
|
+
|
|
87
|
+
### Integration boundary
|
|
88
|
+
|
|
89
|
+
- Biblicus can integrate with Tactus as a **Model Context Protocol toolset**, for example with tool names such as `knowledge_base_ingest`, `knowledge_base_query`, and `knowledge_base_stats`.
|
|
90
|
+
- We will **not** add a knowledge base or retrieval augmented generation language primitive in version zero. Revisit only if we need semantics that tools cannot express cleanly, such as enforceable policy boundaries, runtime managed durability, caching hooks, or guaranteed instrumentation.
|
|
91
|
+
|
|
92
|
+
### Interface packaging
|
|
93
|
+
|
|
94
|
+
- The knowledge base interface is a **small protocol and reference implementation**, including tool schemas and a reference Model Context Protocol server. We will not build a full managed service in version zero.
|
|
95
|
+
|
|
96
|
+
### Corpus identity and layout
|
|
97
|
+
|
|
98
|
+
- Corpora are identified by a **uniform resource identifier**; simple strings and paths normalize to canonical `file://...`.
|
|
99
|
+
- The raw corpus is the source of truth and must support:
|
|
100
|
+
- a plain folder of arbitrary files
|
|
101
|
+
- optional Markdown + YAML (YAML Ain't Markup Language) front matter for lightweight tagging
|
|
102
|
+
- sidecar metadata for any file type (for example, `file.pdf.biblicus.yml`)
|
|
103
|
+
- Raw items are written with **usable file extensions** whenever possible (based on `media_type`) so the corpus remains easy to browse and recover with ordinary operating system tools.
|
|
104
|
+
|
|
105
|
+
### Mutability and editorial workflow
|
|
106
|
+
|
|
107
|
+
- Corpora are **mutable**. Re-indexing and refresh are primary operations.
|
|
108
|
+
- Filtering, pruning, and curation are primary needs; we may model this as a **multi-layer editorial pipeline** such as raw, curated, then published.
|
|
109
|
+
|
|
110
|
+
### Pipeline stages
|
|
111
|
+
|
|
112
|
+
- Text extraction (Portable Document Format, office documents, or image optical character recognition) is a **pipeline stage**, not part of raw ingestion.
|
|
113
|
+
|
|
114
|
+
### Backend hosting modes (all supported)
|
|
115
|
+
|
|
116
|
+
Biblicus must support all three backend hosting modes behind the same interface, and ship at least one reference example of each:
|
|
117
|
+
|
|
118
|
+
- **In-process plugin**: simplest local minimum viable product and deterministic testing.
|
|
119
|
+
- **Out-of-process local daemon**: isolates dependencies and supports warm indexes for heavier systems.
|
|
120
|
+
- **Remote service**: production deployments, multi-tenant separation, and managed infrastructure.
|
|
121
|
+
|
|
122
|
+
Backend hosting mode is a primary benchmark dimension, compared on cold start, warm start, latency, throughput, cost, and operational complexity.
|
|
123
|
+
|
|
124
|
+
### Security / sandbox topology (all supported)
|
|
125
|
+
|
|
126
|
+
Biblicus must support all three deployment topologies, selected as appropriate per environment and backend:
|
|
127
|
+
|
|
128
|
+
- **In-sandbox**: the knowledge base runs inside the Tactus sandbox container (local, offline, simplest wiring).
|
|
129
|
+
- **Brokered or external**: the knowledge base runs outside the sandbox and is accessed via tools (aligns with secretless or brokered execution).
|
|
130
|
+
- **Hybrid**: mix modes across environments (for example, local development in-sandbox; production external).
|
|
131
|
+
|
|
132
|
+
The interface stays the same; topology is configuration.
|
|
133
|
+
|
|
134
|
+
### Query semantics
|
|
135
|
+
|
|
136
|
+
- `knowledge_base_query` returns **evidence objects** as the low-level, composable building block.
|
|
137
|
+
- Biblicus may ship higher-level convenience helpers built on top of evidence (for example, a prompt-ready context pack formatter), but those helpers remain derived and swappable.
|
|
138
|
+
|
|
139
|
+
### Reproducibility
|
|
140
|
+
|
|
141
|
+
- Biblicus always records a **recipe manifest** for reproducibility.
|
|
142
|
+
- When a backend produces persisted materializations, Biblicus treats them as **versioned build runs** identified by `run_id` (rather than overwriting in place by default).
|
|
143
|
+
- Manifests exist even for just-in-time backends (materializations may be empty).
|
|
144
|
+
- Full directed acyclic graph lineage is not included in version zero; revisit only if needed.
|
|
145
|
+
- Future (optional): define **shared materialization formats** (canonical chunk and embedding stores) so multiple backends can reuse intermediates when it makes sense; keep it opt-in.
|
|
146
|
+
|
|
147
|
+
### Evaluation
|
|
148
|
+
|
|
149
|
+
- Evaluate **both** knowledge-base-level behavior and end-to-end procedure behavior using **shared datasets**:
|
|
150
|
+
- **Knowledge-base-level**: retrieval metrics (for example, recall and mean reciprocal rank) and system properties (for example, latency, index size, and cost).
|
|
151
|
+
- **Procedure-level (Tactus)**: end-to-end success, policy compliance, and quality metrics across real inputs.
|
|
152
|
+
|
|
153
|
+
### Catalog stance
|
|
154
|
+
|
|
155
|
+
- The corpus catalog is **file-based** (committable, portable, backend-agnostic) so any backend/tool can consume it without requiring a database engine.
|
|
156
|
+
- Canonical version zero format is a single JavaScript Object Notation file at `.biblicus/catalog.json`, written atomically (temporary file and rename) on updates.
|
|
157
|
+
- The catalog includes `latest_run_id`, and run manifests are stored at `.biblicus/runs/<run_id>.json`.
|
|
158
|
+
- If this ever becomes a bottleneck at very large scales, we will **change the specification** (bump `schema_version`) rather than introduce multiple “supported” catalog storage modes.
|
|
159
|
+
|
|
160
|
+
## Near-term deliverables
|
|
161
|
+
|
|
162
|
+
1. Define Biblicus version zero knowledge base tool schemas (Model Context Protocol) for:
|
|
163
|
+
- `knowledge_base_ingest` (upsert documents)
|
|
164
|
+
- `knowledge_base_query` (retrieve evidence)
|
|
165
|
+
- `knowledge_base_get` and `knowledge_base_list` (basic management)
|
|
166
|
+
- `knowledge_base_stats` (latency, counts, sizes)
|
|
167
|
+
2. Implement reference backend examples for each hosting mode:
|
|
168
|
+
- **In-process plugin**: a naive local backend (for example, metadata registry and lexical baseline) for determinism and tests
|
|
169
|
+
- **Local daemon**: a vector backend (Qdrant or Postgres with pgvector) for real use
|
|
170
|
+
- **Remote service**: the same vector backend configured against a remote endpoint
|
|
171
|
+
3. Implement one reference Tactus procedure showing a basic retrieval-augmented generation pattern using the toolset.
|
|
172
|
+
4. Add a small evaluation dataset and run `tactus eval` against multiple retrieval configs.
|
|
173
|
+
|
|
174
|
+
## Open questions
|
|
175
|
+
|
|
176
|
+
- **Editorial pipeline model**: do layers live as directory views, metadata flags, or both?
|
|
177
|
+
- **Chunking strategy**: semantic versus fixed-size, and how to compare fairly across corpora.
|
|
178
|
+
- **Re-ranking tradeoffs**: quality versus cost and latency, and when to use cross-encoders.
|
|
179
|
+
- **Context synthesis**: raw snippets versus summary-based packs, and how to evaluate hallucination risk.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Adding a Retrieval Backend
|
|
2
|
+
|
|
3
|
+
Backends are pluggable engines that implement a small, stable interface.
|
|
4
|
+
The goal is to make new retrieval ideas easy to test without reshaping the corpus.
|
|
5
|
+
|
|
6
|
+
## Backend contract
|
|
7
|
+
|
|
8
|
+
Backends implement two operations:
|
|
9
|
+
|
|
10
|
+
- **Build run**: create a `RetrievalRun` manifest (and optional artifacts).
|
|
11
|
+
- **Query**: return structured `Evidence` objects under a `QueryBudget`.
|
|
12
|
+
|
|
13
|
+
## Implementation checklist
|
|
14
|
+
|
|
15
|
+
1. **Define a Pydantic configuration model** for your backend recipe.
|
|
16
|
+
2. **Implement `RetrievalBackend`**:
|
|
17
|
+
- `build_run(corpus, recipe_name, config)`
|
|
18
|
+
- `query(corpus, run, query_text, budget)`
|
|
19
|
+
3. **Emit `Evidence`** with required fields:
|
|
20
|
+
- `item_id`, `source_uri`, `media_type`, `score`, `rank`, `stage`, `recipe_id`, `run_id`
|
|
21
|
+
- `text` **or** `content_ref`
|
|
22
|
+
4. **Register the backend** in `biblicus.backends.available_backends`.
|
|
23
|
+
5. **Add behavior-driven development specifications** before implementation and make them pass with 100% coverage.
|
|
24
|
+
|
|
25
|
+
## Design notes
|
|
26
|
+
|
|
27
|
+
- Treat **runs** as immutable manifests with reproducible parameters.
|
|
28
|
+
- If your backend needs artifacts, store them under `.biblicus/runs/` and record paths in `artifact_paths`.
|
|
29
|
+
- Keep **text extraction** in explicit pipeline stages, not in backend ingestion.
|
|
30
|
+
|
|
31
|
+
## Examples
|
|
32
|
+
|
|
33
|
+
See:
|
|
34
|
+
|
|
35
|
+
- `biblicus.backends.scan.ScanBackend` (minimal baseline)
|
|
36
|
+
- `biblicus.backends.sqlite_full_text_search.SqliteFullTextSearchBackend` (practical local backend)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Application Programming Interface Reference
|
|
2
|
+
===========================================
|
|
3
|
+
|
|
4
|
+
Core
|
|
5
|
+
----
|
|
6
|
+
|
|
7
|
+
.. automodule:: biblicus.corpus
|
|
8
|
+
   :members:
|
|
9
|
+
   :undoc-members:
|
|
10
|
+
|
|
11
|
+
.. automodule:: biblicus.models
|
|
12
|
+
   :members:
|
|
13
|
+
   :undoc-members:
|
|
14
|
+
|
|
15
|
+
.. automodule:: biblicus.retrieval
|
|
16
|
+
   :members:
|
|
17
|
+
   :undoc-members:
|
|
18
|
+
|
|
19
|
+
.. automodule:: biblicus.evaluation
|
|
20
|
+
   :members:
|
|
21
|
+
   :undoc-members:
|
|
22
|
+
|
|
23
|
+
Backends
|
|
24
|
+
--------
|
|
25
|
+
|
|
26
|
+
.. automodule:: biblicus.backends.scan
|
|
27
|
+
   :members:
|
|
28
|
+
   :undoc-members:
|
|
29
|
+
|
|
30
|
+
.. automodule:: biblicus.backends.sqlite_full_text_search
|
|
31
|
+
   :members:
|
|
32
|
+
   :undoc-members:
|