biblicus 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. biblicus-0.1.1/LICENSE +21 -0
  2. biblicus-0.1.1/MANIFEST.in +19 -0
  3. biblicus-0.1.1/PKG-INFO +174 -0
  4. biblicus-0.1.1/README.md +154 -0
  5. biblicus-0.1.1/datasets/wikipedia_mini.json +37 -0
  6. biblicus-0.1.1/docs/ARCHITECTURE.md +179 -0
  7. biblicus-0.1.1/docs/BACKENDS.md +36 -0
  8. biblicus-0.1.1/docs/api.rst +32 -0
  9. biblicus-0.1.1/docs/conf.py +31 -0
  10. biblicus-0.1.1/docs/index.rst +14 -0
  11. biblicus-0.1.1/features/backend_validation.feature +14 -0
  12. biblicus-0.1.1/features/biblicus_corpus.feature +99 -0
  13. biblicus-0.1.1/features/cli_entrypoint.feature +6 -0
  14. biblicus-0.1.1/features/cli_parsing.feature +21 -0
  15. biblicus-0.1.1/features/corpus_edge_cases.feature +133 -0
  16. biblicus-0.1.1/features/corpus_identity.feature +14 -0
  17. biblicus-0.1.1/features/corpus_purge.feature +31 -0
  18. biblicus-0.1.1/features/environment.py +154 -0
  19. biblicus-0.1.1/features/error_cases.feature +170 -0
  20. biblicus-0.1.1/features/evaluation.feature +80 -0
  21. biblicus-0.1.1/features/frontmatter.feature +16 -0
  22. biblicus-0.1.1/features/ingest_sources.feature +24 -0
  23. biblicus-0.1.1/features/integration_wikipedia.feature +7 -0
  24. biblicus-0.1.1/features/model_validation.feature +6 -0
  25. biblicus-0.1.1/features/python_api.feature +57 -0
  26. biblicus-0.1.1/features/retrieval_budget.feature +7 -0
  27. biblicus-0.1.1/features/retrieval_scan.feature +77 -0
  28. biblicus-0.1.1/features/retrieval_sqlite_full_text_search.feature +59 -0
  29. biblicus-0.1.1/features/retrieval_utilities.feature +43 -0
  30. biblicus-0.1.1/features/steps/backend_steps.py +124 -0
  31. biblicus-0.1.1/features/steps/cli_parsing_steps.py +57 -0
  32. biblicus-0.1.1/features/steps/cli_steps.py +661 -0
  33. biblicus-0.1.1/features/steps/frontmatter_steps.py +50 -0
  34. biblicus-0.1.1/features/steps/model_steps.py +34 -0
  35. biblicus-0.1.1/features/steps/python_api_steps.py +196 -0
  36. biblicus-0.1.1/features/steps/retrieval_steps.py +485 -0
  37. biblicus-0.1.1/pyproject.toml +71 -0
  38. biblicus-0.1.1/scripts/download_wikipedia.py +138 -0
  39. biblicus-0.1.1/scripts/test.py +78 -0
  40. biblicus-0.1.1/setup.cfg +4 -0
  41. biblicus-0.1.1/src/biblicus/__init__.py +28 -0
  42. biblicus-0.1.1/src/biblicus/__main__.py +8 -0
  43. biblicus-0.1.1/src/biblicus/backends/__init__.py +44 -0
  44. biblicus-0.1.1/src/biblicus/backends/base.py +65 -0
  45. biblicus-0.1.1/src/biblicus/backends/scan.py +292 -0
  46. biblicus-0.1.1/src/biblicus/backends/sqlite_full_text_search.py +427 -0
  47. biblicus-0.1.1/src/biblicus/cli.py +468 -0
  48. biblicus-0.1.1/src/biblicus/constants.py +10 -0
  49. biblicus-0.1.1/src/biblicus/corpus.py +952 -0
  50. biblicus-0.1.1/src/biblicus/evaluation.py +261 -0
  51. biblicus-0.1.1/src/biblicus/frontmatter.py +92 -0
  52. biblicus-0.1.1/src/biblicus/models.py +307 -0
  53. biblicus-0.1.1/src/biblicus/retrieval.py +137 -0
  54. biblicus-0.1.1/src/biblicus/sources.py +132 -0
  55. biblicus-0.1.1/src/biblicus/time.py +18 -0
  56. biblicus-0.1.1/src/biblicus/uris.py +64 -0
  57. biblicus-0.1.1/src/biblicus.egg-info/PKG-INFO +174 -0
  58. biblicus-0.1.1/src/biblicus.egg-info/SOURCES.txt +60 -0
  59. biblicus-0.1.1/src/biblicus.egg-info/dependency_links.txt +1 -0
  60. biblicus-0.1.1/src/biblicus.egg-info/entry_points.txt +2 -0
  61. biblicus-0.1.1/src/biblicus.egg-info/requires.txt +11 -0
  62. biblicus-0.1.1/src/biblicus.egg-info/top_level.txt +1 -0
biblicus-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Biblicus Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,19 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+
5
+ recursive-include src *.py
6
+ recursive-include docs *.rst *.md *.py
7
+ recursive-include features *.feature *.py
8
+ recursive-include scripts *.py
9
+ recursive-include datasets *.json
10
+
11
+ prune corpora
12
+ prune reports
13
+ prune docs/_build
14
+
15
+ global-exclude *.pyc
16
+ global-exclude *.pyo
17
+ global-exclude __pycache__/*
18
+ global-exclude .DS_Store
19
+ global-exclude .coverage
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: biblicus
3
+ Version: 0.1.1
4
+ Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
+ License: MIT
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: PyYAML>=6.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: behave>=1.2.6; extra == "dev"
13
+ Requires-Dist: coverage[toml]>=7.0; extra == "dev"
14
+ Requires-Dist: sphinx>=7.0; extra == "dev"
15
+ Requires-Dist: myst-parser>=2.0; extra == "dev"
16
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
17
+ Requires-Dist: black>=24.0; extra == "dev"
18
+ Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
19
+ Dynamic: license-file
20
+
21
+ # Biblicus
22
+
23
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
24
+
25
+ If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
26
+
27
+ The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
28
+
29
+ This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
30
+
31
+ It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
32
+
33
+ See [retrieval augmented generation overview] for a short introduction to the idea.
34
+
35
+ ## The framework
36
+
37
+ The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
38
+
39
+ - Corpus is the folder that holds raw items and their metadata.
40
+ - Item is the raw bytes of a document or other artifact, plus its source.
41
+ - Catalog is the rebuildable index of the corpus.
42
+ - Evidence is what retrieval returns, ready to be turned into context for a large language model.
43
+ - Run is a recorded retrieval build for a corpus.
44
+ - Backend is a pluggable retrieval implementation.
45
+ - Recipe is a named configuration for a backend.
46
+ - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
47
+
48
+ ## Practical value
49
+
50
+ - You can ingest raw material once, then try many retrieval approaches over time.
51
+ - You can keep raw files readable and portable, without locking your data inside a database.
52
+ - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
53
+
54
+ ## Typical flow
55
+
56
+ - Initialize a corpus folder.
57
+ - Ingest items from file paths, web addresses, or text input.
58
+ - Reindex to refresh the catalog after edits.
59
+ - Build a retrieval run with a backend.
60
+ - Query the run to collect evidence and evaluate it with datasets.
61
+
62
+ ## Install
63
+
64
+ This repository is a working Python package. Install it into a virtual environment from the repository root.
65
+
66
+ ```
67
+ python3 -m pip install -e .
68
+ ```
69
+
70
+ After the first release, you can install it from the Python Package Index.
71
+
72
+ ```
73
+ python3 -m pip install biblicus
74
+ ```
75
+
76
+ ## Quick start
77
+
78
+ ```
79
+ biblicus init corpora/example
80
+ biblicus ingest --corpus corpora/example notes/example.txt
81
+ echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
82
+ biblicus list --corpus corpora/example
83
+ biblicus build --corpus corpora/example --backend scan
84
+ biblicus query --corpus corpora/example --query "note"
85
+ ```
86
+
87
+ ## Python usage
88
+
89
+ From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
90
+
91
+ - Create a corpus with `Corpus.init` or open one with `Corpus.open`.
92
+ - Ingest notes with `Corpus.ingest_note`.
93
+ - Ingest files or web addresses with `Corpus.ingest_source`.
94
+ - List items with `Corpus.list_items`.
95
+ - Build a retrieval run with `get_backend` and `backend.build_run`.
96
+ - Query a run with `backend.query`.
97
+ - Evaluate with `evaluate_run`.
98
+
99
+ ## How it fits into an assistant
100
+
101
+ In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
102
+
103
+ - Use a corpus as the source of truth for raw items.
104
+ - Use a backend run to build any derived artifacts needed for retrieval.
105
+ - Use queries to obtain evidence objects.
106
+ - Convert evidence into the format your framework expects, such as message content, tool output, or citations.
107
+
108
+ ## Learn more
109
+
110
+ The documents below are written to be read in order.
111
+
112
+ - [Architecture][architecture]
113
+ - [Backends][backends]
114
+
115
+ ## Metadata and catalog
116
+
117
+ Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
118
+
119
+ ## Corpus layout
120
+
121
+ ```
122
+ corpus/
123
+ raw/
124
+ item.bin
125
+ item.bin.biblicus.yml
126
+ .biblicus/
127
+ config.json
128
+ catalog.json
129
+ runs/
130
+ run-id.json
131
+ ```
132
+
133
+ ## Retrieval backends
134
+
135
+ Two backends are included.
136
+
137
+ - `scan` is a minimal baseline that scans raw items directly.
138
+ - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
139
+
140
+ ## Integration corpus and evaluation dataset
141
+
142
+ Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
143
+
144
+ The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
145
+
146
+ ## Tests and coverage
147
+
148
+ ```
149
+ python3 scripts/test.py
150
+ ```
151
+
152
+ ## Releases
153
+
154
+ Releases are automated from the main branch using semantic versioning and conventional commit messages.
155
+
156
+ The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
157
+
158
+ Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
159
+
160
+ ## Documentation
161
+
162
+ Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
163
+
164
+ ```
165
+ sphinx-build -b html docs docs/_build
166
+ ```
167
+
168
+ ## License
169
+
170
+ License terms are in `LICENSE`.
171
+
172
+ [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
173
+ [architecture]: docs/ARCHITECTURE.md
174
+ [backends]: docs/BACKENDS.md
@@ -0,0 +1,154 @@
1
+ # Biblicus
2
+
3
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
4
+
5
+ If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
6
+
7
+ The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
8
+
9
+ This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
10
+
11
+ It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
12
+
13
+ See [retrieval augmented generation overview] for a short introduction to the idea.
14
+
15
+ ## The framework
16
+
17
+ The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
18
+
19
+ - Corpus is the folder that holds raw items and their metadata.
20
+ - Item is the raw bytes of a document or other artifact, plus its source.
21
+ - Catalog is the rebuildable index of the corpus.
22
+ - Evidence is what retrieval returns, ready to be turned into context for a large language model.
23
+ - Run is a recorded retrieval build for a corpus.
24
+ - Backend is a pluggable retrieval implementation.
25
+ - Recipe is a named configuration for a backend.
26
+ - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
27
+
28
+ ## Practical value
29
+
30
+ - You can ingest raw material once, then try many retrieval approaches over time.
31
+ - You can keep raw files readable and portable, without locking your data inside a database.
32
+ - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
33
+
34
+ ## Typical flow
35
+
36
+ - Initialize a corpus folder.
37
+ - Ingest items from file paths, web addresses, or text input.
38
+ - Reindex to refresh the catalog after edits.
39
+ - Build a retrieval run with a backend.
40
+ - Query the run to collect evidence and evaluate it with datasets.
41
+
42
+ ## Install
43
+
44
+ This repository is a working Python package. Install it into a virtual environment from the repository root.
45
+
46
+ ```
47
+ python3 -m pip install -e .
48
+ ```
49
+
50
+ After the first release, you can install it from the Python Package Index.
51
+
52
+ ```
53
+ python3 -m pip install biblicus
54
+ ```
55
+
56
+ ## Quick start
57
+
58
+ ```
59
+ biblicus init corpora/example
60
+ biblicus ingest --corpus corpora/example notes/example.txt
61
+ echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
62
+ biblicus list --corpus corpora/example
63
+ biblicus build --corpus corpora/example --backend scan
64
+ biblicus query --corpus corpora/example --query "note"
65
+ ```
66
+
67
+ ## Python usage
68
+
69
+ From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
70
+
71
+ - Create a corpus with `Corpus.init` or open one with `Corpus.open`.
72
+ - Ingest notes with `Corpus.ingest_note`.
73
+ - Ingest files or web addresses with `Corpus.ingest_source`.
74
+ - List items with `Corpus.list_items`.
75
+ - Build a retrieval run with `get_backend` and `backend.build_run`.
76
+ - Query a run with `backend.query`.
77
+ - Evaluate with `evaluate_run`.
78
+
79
+ ## How it fits into an assistant
80
+
81
+ In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
82
+
83
+ - Use a corpus as the source of truth for raw items.
84
+ - Use a backend run to build any derived artifacts needed for retrieval.
85
+ - Use queries to obtain evidence objects.
86
+ - Convert evidence into the format your framework expects, such as message content, tool output, or citations.
87
+
88
+ ## Learn more
89
+
90
+ The documents below are written to be read in order.
91
+
92
+ - [Architecture][architecture]
93
+ - [Backends][backends]
94
+
95
+ ## Metadata and catalog
96
+
97
+ Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
98
+
99
+ ## Corpus layout
100
+
101
+ ```
102
+ corpus/
103
+ raw/
104
+ item.bin
105
+ item.bin.biblicus.yml
106
+ .biblicus/
107
+ config.json
108
+ catalog.json
109
+ runs/
110
+ run-id.json
111
+ ```
112
+
113
+ ## Retrieval backends
114
+
115
+ Two backends are included.
116
+
117
+ - `scan` is a minimal baseline that scans raw items directly.
118
+ - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
119
+
120
+ ## Integration corpus and evaluation dataset
121
+
122
+ Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
123
+
124
+ The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
125
+
126
+ ## Tests and coverage
127
+
128
+ ```
129
+ python3 scripts/test.py
130
+ ```
131
+
132
+ ## Releases
133
+
134
+ Releases are automated from the main branch using semantic versioning and conventional commit messages.
135
+
136
+ The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
137
+
138
+ Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
139
+
140
+ ## Documentation
141
+
142
+ Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
143
+
144
+ ```
145
+ sphinx-build -b html docs docs/_build
146
+ ```
147
+
148
+ ## License
149
+
150
+ License terms are in `LICENSE`.
151
+
152
+ [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
153
+ [architecture]: docs/ARCHITECTURE.md
154
+ [backends]: docs/BACKENDS.md
@@ -0,0 +1,37 @@
1
+ {
2
+ "schema_version": 1,
3
+ "name": "wikipedia-mini",
4
+ "description": "Small evaluation set aligned with the Wikipedia integration corpus.",
5
+ "queries": [
6
+ {
7
+ "query_id": "q1",
8
+ "query_text": "mathematician who worked on the Analytical Engine",
9
+ "expected_source_uri": "https://en.wikipedia.org/wiki/Ada_Lovelace",
10
+ "kind": "gold"
11
+ },
12
+ {
13
+ "query_id": "q2",
14
+ "query_text": "pioneer of computer programming in the US Navy",
15
+ "expected_source_uri": "https://en.wikipedia.org/wiki/Grace_Hopper",
16
+ "kind": "gold"
17
+ },
18
+ {
19
+ "query_id": "q3",
20
+ "query_text": "proposed the Church-Turing thesis",
21
+ "expected_source_uri": "https://en.wikipedia.org/wiki/Alan_Turing",
22
+ "kind": "gold"
23
+ },
24
+ {
25
+ "query_id": "q4",
26
+ "query_text": "invented information theory",
27
+ "expected_source_uri": "https://en.wikipedia.org/wiki/Claude_Shannon",
28
+ "kind": "gold"
29
+ },
30
+ {
31
+ "query_id": "q5",
32
+ "query_text": "coined the term artificial intelligence",
33
+ "expected_source_uri": "https://en.wikipedia.org/wiki/John_McCarthy_(computer_scientist)",
34
+ "kind": "synthetic"
35
+ }
36
+ ]
37
+ }
@@ -0,0 +1,179 @@
1
+ # Biblicus Architecture
2
+
3
+ Biblicus is a command-line-first **corpus** manager for ingesting, curating, and evaluating corpora used by assistant systems. The early goal is to make it easy to add raw, unstructured content, while keeping the system structured enough to support reproducible experiments.
4
+
5
+ ## How we design
6
+
7
+ Design starts from strict behavior-driven development:
8
+ - The authoritative description of behavior lives in `features/*.feature`.
9
+ - All changes should follow specification-first behavior-driven development: failing scenario, implementation, passing scenario, then refactor.
10
+ - Behavior-driven development scenarios are not an afterthought: they are how we keep the domain vocabulary consistent and the platform comparable across backends and recipes.
11
+ - **Specification completeness** is mandatory: if behavior exists, it must be specified. Ambiguous or untestable behavior should be removed or turned into an explicit error.
12
+
13
+ ## Domain-specific cognitive framework
14
+
15
+ The domain-specific cognitive framework is the set of **stable nouns**, **verbs**, and **invariants** that make Biblicus pleasant to use over time.
16
+ We prefer a small set of universal concepts with strict semantics over a large set of ad-hoc flags.
17
+
18
+ We also treat **Pydantic models** as the canonical way to codify and validate these constructs at boundaries.
19
+
20
+ ## The Python developer mental model
21
+
22
+ If this system is pleasant to use, a Python developer should be able to describe intent with the core nouns:
23
+
24
+ - I have a **corpus** at this path or uniform resource identifier.
25
+ - I ingest an **item** with optional **metadata**.
26
+ - I rebuild the derived **index** after edits.
27
+ - I run a **recipe** against the same corpus.
28
+ - I query and receive **evidence**.
29
+
30
+ Anything that does not map cleanly to these nouns is either a derived helper or a backend-specific implementation detail that should not leak.
31
+
32
+ ## Relationship to agent frameworks
33
+
34
+ Biblicus is designed to integrate with agent frameworks through explicit tools and clear application programming interfaces. Tactus is one target environment with strong isolation requirements.
35
+
36
+ - **Tools and toolsets**, including the Model Context Protocol, are the primary capability boundary.
37
+ - **Sandboxing and brokered or secretless execution** are primary deployment modes.
38
+ - **Durability and evaluations** are central: invariants via specifications, quality via evaluations.
39
+
40
+ ## Core concepts
41
+
42
+ ### Concepts
43
+
44
+ - **Corpus**: a named, mutable collection rooted at a path or uniform resource identifier. In version zero it is typically a local folder containing raw files plus a `.biblicus/` directory for minimal metadata.
45
+ - **Item**: the unit of ingestion in a corpus: raw bytes of any modality, including text, images, Portable Document Format documents, audio, and video, plus optional metadata and provenance.
46
+ - **Knowledge base backend**: an implementation that can ingest and retrieve from a corpus, such as scan, full text search, vector retrieval, or hybrid retrieval, exposed to procedures through retrieval primitives.
47
+ - **Retrieval recipe**: a named configuration bundle for a backend, such as chunking rules, embedding model and version, hybrid weights, reranker choice, and filters. This is what we benchmark and compare.
48
+ - **Recipe manifest**: a reproducibility record describing the backend and recipe parameters, plus any referenced materializations and build runs.
49
+ - **Materialization**: an optional, persisted representation derived from raw content for a given recipe and backend, such as chunks, embeddings, or indexes. Some backends intentionally have none and operate on demand.
50
+ - **Evidence**: structured retrieval output from backend queries. Evidence includes spans, scores, and provenance used by downstream retrieval augmented generation procedures.
51
+ - **Pipeline stage / editorial layer**: a structured step that transforms, filters, extracts, or curates content, such as raw, curated, and published, or extract text from Portable Document Format documents.
52
+
53
+ ## Design principles
54
+
55
+ - **Primitives + derived constructs**: keep the protocol surface small and composable; ship higher-level helpers and example procedures on top.
56
+ - **Minimal opinion raw store**: raw ingestion should work for a folder of files with optional lightweight tagging.
57
+ - **Reproducibility by default**: comparisons require manifests (even when there are no persisted materializations).
58
+ - **Mutability is real**: corpora are edited, pruned, and reorganized; re-indexing must be a core workflow.
59
+ - **Separation of concerns**: retrieval returns evidence; retrieval-augmented generation patterns live in Tactus procedures (not inside the knowledge base backend).
60
+ - **Deployment flexibility**: same interface across local/offline, brokered external services, and hybrid environments.
61
+ - **Evidence is the primary output**: every retrieval returns structured evidence; everything else is a derived helper.
62
+
63
+ ## Locked decisions (version zero)
64
+
65
+ These are explicit, opinionated policies encoded into the project:
66
+
67
+ - **Evidence schema strictness**: moderate-to-strong schema. Evidence must include stable identifiers, provenance, and retrieval scores; richer fields (spans, stage, recipe and run identifiers) are expected.
68
+ - **Retrieval stages**: multi-stage is explicit (retrieve, rerank, then filter). Pipelines are expressed through evidence metadata rather than hard-coded backends.
69
+ - **Corpus versioning**: snapshot or reindex runs are versioned; full directed acyclic graph lineage is deferred.
70
+ - **Evaluation datasets**: mixed human-labeled and synthetic questions; human-labeled for truth, synthetic for scale.
71
+ - **Baseline retriever**: hybrid is the strategic target, but the first reference backend is deterministic lexical.
72
+ - **Context budgeting**: evidence selection is governed by budgets (token, unit, and per-source limits), not a fixed count.
73
+
74
+ ## Evidence schema (version zero)
75
+
76
+ Evidence is the canonical output of retrieval. Required fields:
77
+
78
+ - `item_id`, `source_uri`, `media_type`
79
+ - `score` and `rank`
80
+ - `text` (or `content_ref` when non-text)
81
+ - `stage` (for example, `scan`, `full-text-search`, `rerank`)
82
+ - `recipe_id` / `run_id` (for reproducibility)
83
+ - Optional: `span_start`, `span_end`, `hash`
84
+
85
+ ## Architectural policies (version zero)
86
+
87
+ ### Integration boundary
88
+
89
+ - Biblicus can integrate with Tactus as a **Model Context Protocol toolset**, for example with tool names such as `knowledge_base_ingest`, `knowledge_base_query`, and `knowledge_base_stats`.
90
+ - We will **not** add a knowledge base or retrieval augmented generation language primitive in version zero. Revisit only if we need semantics that tools cannot express cleanly, such as enforceable policy boundaries, runtime managed durability, caching hooks, or guaranteed instrumentation.
91
+
92
+ ### Interface packaging
93
+
94
+ - The knowledge base interface is a **small protocol and reference implementation**, including tool schemas and a reference Model Context Protocol server. We will not build a full managed service in version zero.
95
+
96
+ ### Corpus identity and layout
97
+
98
+ - Corpora are identified by a **uniform resource identifier**; simple strings and paths normalize to canonical `file://...`.
99
+ - The raw corpus is the source of truth and must support:
100
+ - a plain folder of arbitrary files
101
+ - optional Markdown + Yet Another Markup Language front matter for lightweight tagging
102
+ - sidecar metadata for any file type (for example, `file.pdf.biblicus.yml`)
103
+ - Raw items are written with **usable file extensions** whenever possible (based on `media_type`) so the corpus remains easy to browse and recover with ordinary operating system tools.
104
+
105
+ ### Mutability and editorial workflow
106
+
107
+ - Corpora are **mutable**. Re-indexing and refresh are primary operations.
108
+ - Filtering, pruning, and curation are primary needs; we may model this as a **multi-layer editorial pipeline** such as raw, curated, then published.
109
+
110
+ ### Pipeline stages
111
+
112
+ - Text extraction (Portable Document Format, office documents, or image optical character recognition) is a **pipeline stage**, not part of raw ingestion.
113
+
114
+ ### Backend hosting modes (all supported)
115
+
116
+ Biblicus must support all three backend hosting modes behind the same interface, and ship at least one reference example of each:
117
+
118
+ - **In-process plugin**: simplest local minimum viable product and deterministic testing.
119
+ - **Out-of-process local daemon**: isolates dependencies and supports warm indexes for heavier systems.
120
+ - **Remote service**: production deployments, multi-tenant separation, and managed infrastructure.
121
+
122
+ Backend hosting mode is a primary benchmark dimension (cold start, warm start, latency, throughput, cost, operational complexity).
123
+
124
+ ### Security / sandbox topology (all supported)
125
+
126
+ Biblicus must support all three deployment topologies, selected as appropriate per environment and backend:
127
+
128
+ - **In-sandbox**: the knowledge base runs inside the Tactus sandbox container (local, offline, simplest wiring).
129
+ - **Brokered or external**: the knowledge base runs outside the sandbox and is accessed via tools (aligns with secretless or brokered execution).
130
+ - **Hybrid**: mix modes across environments (for example, local development in-sandbox; production external).
131
+
132
+ The interface stays the same; topology is configuration.
133
+
134
+ ### Query semantics
135
+
136
+ - `knowledge_base_query` returns **evidence objects** as the low-level, composable building block.
137
+ - Biblicus may ship higher-level convenience helpers built on top of evidence (for example, a prompt-ready context pack formatter), but those helpers remain derived and swappable.
138
+
139
+ ### Reproducibility
140
+
141
+ - Biblicus always records a **recipe manifest** for reproducibility.
142
+ - When a backend produces persisted materializations, Biblicus treats them as **versioned build runs** identified by `run_id` (rather than overwriting in place by default).
143
+ - Manifests exist even for just-in-time backends (materializations may be empty).
144
+ - Full directed acyclic graph lineage is not included in version zero; revisit only if needed.
145
+ - Future (optional): define **shared materialization formats** (canonical chunk and embedding stores) so multiple backends can reuse intermediates when it makes sense; keep it opt-in.
146
+
147
+ ### Evaluation
148
+
149
+ - Evaluate **both** knowledge base level behavior and end-to-end procedure behavior using **shared datasets**:
150
+ - **Knowledge base level**: retrieval metrics and system properties (for example, recall and mean reciprocal rank, latency, index size, and cost).
151
+ - **Procedure-level (Tactus)**: end-to-end success, policy compliance, and quality metrics across real inputs.
152
+
153
+ ### Catalog stance
154
+
155
+ - The corpus catalog is **file-based** (committable, portable, backend-agnostic) so any backend/tool can consume it without requiring a database engine.
156
+ - Canonical version zero format is a single JavaScript Object Notation file at `.biblicus/catalog.json`, written atomically (temporary file and rename) on updates.
157
+ - The catalog includes `latest_run_id` and run manifests are stored at `.biblicus/runs/<run_id>.json`.
158
+ - If this ever becomes a bottleneck at very large scales, we will **change the specification** (bump `schema_version`) rather than introduce multiple “supported” catalog storage modes.
159
+
160
+ ## Near-term deliverables
161
+
162
+ 1. Define Biblicus version zero knowledge base tool schemas (Model Context Protocol) for:
163
+ - `knowledge_base_ingest` (upsert documents)
164
+ - `knowledge_base_query` (retrieve evidence)
165
+ - `knowledge_base_get` and `knowledge_base_list` (basic management)
166
+ - `knowledge_base_stats` (latency, counts, sizes)
167
+ 2. Implement reference backend examples for each hosting mode:
168
+ - **In-process plugin**: a naive local backend (for example, metadata registry and lexical baseline) for determinism and tests
169
+ - **Local daemon**: a vector backend (Qdrant or Postgres with pgvector) for real use
170
+ - **Remote service**: the same vector backend configured against a remote endpoint
171
+ 3. Implement one reference Tactus procedure showing a basic retrieval-augmented generation pattern using the toolset.
172
+ 4. Add a small evaluation dataset and run `tactus eval` against multiple retrieval configs.
173
+
174
+ ## Open questions
175
+
176
+ - **Editorial pipeline model**: do layers live as directory views, metadata flags, or both?
177
+ - **Chunking strategy**: semantic vs fixed-size, and how to compare fairly across corpora.
178
+ - **Re-ranking tradeoffs**: quality versus cost and latency, and when to use cross-encoders.
179
+ - **Context synthesis**: raw snippets vs summary-based packs, and how to evaluate hallucination risk.
@@ -0,0 +1,36 @@
1
+ # Adding a Retrieval Backend
2
+
3
+ Backends are pluggable engines that implement a small, stable interface.
4
+ The goal is to make new retrieval ideas easy to test without reshaping the corpus.
5
+
6
+ ## Backend contract
7
+
8
+ Backends implement two operations:
9
+
10
+ - **Build run**: create a `RetrievalRun` manifest (and optional artifacts).
11
+ - **Query**: return structured `Evidence` objects under a `QueryBudget`.
12
+
13
+ ## Implementation checklist
14
+
15
+ 1. **Define a Pydantic configuration model** for your backend recipe.
16
+ 2. **Implement `RetrievalBackend`**:
17
+ - `build_run(corpus, recipe_name, config)`
18
+ - `query(corpus, run, query_text, budget)`
19
+ 3. **Emit `Evidence`** with required fields:
20
+ - `item_id`, `source_uri`, `media_type`, `score`, `rank`, `stage`, `recipe_id`, `run_id`
21
+ - `text` **or** `content_ref`
22
+ 4. **Register the backend** in `biblicus.backends.available_backends`.
23
+ 5. **Add behavior-driven development specifications** before implementation and make them pass with 100% coverage.
24
+
25
+ ## Design notes
26
+
27
+ - Treat **runs** as immutable manifests with reproducible parameters.
28
+ - If your backend needs artifacts, store them under `.biblicus/runs/` and record paths in `artifact_paths`.
29
+ - Keep **text extraction** in explicit pipeline stages, not in backend ingestion.
30
+
31
+ ## Examples
32
+
33
+ See:
34
+
35
+ - `biblicus.backends.scan.ScanBackend` (minimal baseline)
36
+ - `biblicus.backends.sqlite_full_text_search.SqliteFullTextSearchBackend` (practical local backend)
@@ -0,0 +1,32 @@
1
+ Application Programming Interface Reference
2
+ ===========================================
3
+
4
+ Core
5
+ ----
6
+
7
+ .. automodule:: biblicus.corpus
8
+ :members:
9
+ :undoc-members:
10
+
11
+ .. automodule:: biblicus.models
12
+ :members:
13
+ :undoc-members:
14
+
15
+ .. automodule:: biblicus.retrieval
16
+ :members:
17
+ :undoc-members:
18
+
19
+ .. automodule:: biblicus.evaluation
20
+ :members:
21
+ :undoc-members:
22
+
23
+ Backends
24
+ --------
25
+
26
+ .. automodule:: biblicus.backends.scan
27
+ :members:
28
+ :undoc-members:
29
+
30
+ .. automodule:: biblicus.backends.sqlite_full_text_search
31
+ :members:
32
+ :undoc-members: