biblicus 0.1.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. {biblicus-0.1.1 → biblicus-0.3.0}/MANIFEST.in +2 -0
  2. biblicus-0.3.0/PKG-INFO +336 -0
  3. biblicus-0.3.0/README.md +307 -0
  4. biblicus-0.3.0/THIRD_PARTY_NOTICES.md +36 -0
  5. {biblicus-0.1.1 → biblicus-0.3.0}/docs/BACKENDS.md +1 -0
  6. biblicus-0.3.0/docs/CORPUS.md +103 -0
  7. biblicus-0.3.0/docs/CORPUS_DESIGN.md +404 -0
  8. biblicus-0.3.0/docs/DEMOS.md +334 -0
  9. biblicus-0.3.0/docs/EXTRACTION.md +186 -0
  10. biblicus-0.3.0/docs/FEATURE_INDEX.md +228 -0
  11. biblicus-0.3.0/docs/ROADMAP.md +174 -0
  12. biblicus-0.3.0/docs/TESTING.md +53 -0
  13. biblicus-0.3.0/docs/USER_CONFIGURATION.md +36 -0
  14. biblicus-0.3.0/docs/conf.py +56 -0
  15. {biblicus-0.1.1 → biblicus-0.3.0}/docs/index.rst +8 -0
  16. {biblicus-0.1.1 → biblicus-0.3.0}/features/cli_parsing.feature +5 -0
  17. biblicus-0.3.0/features/content_sniffing.feature +111 -0
  18. {biblicus-0.1.1 → biblicus-0.3.0}/features/environment.py +71 -3
  19. biblicus-0.3.0/features/extraction_error_handling.feature +32 -0
  20. biblicus-0.3.0/features/extraction_selection.feature +72 -0
  21. biblicus-0.3.0/features/extraction_selection_longest.feature +66 -0
  22. biblicus-0.3.0/features/extractor_pipeline.feature +105 -0
  23. biblicus-0.3.0/features/extractor_validation.feature +7 -0
  24. biblicus-0.3.0/features/hook_config_validation.feature +28 -0
  25. biblicus-0.3.0/features/hook_error_handling.feature +15 -0
  26. biblicus-0.3.0/features/import_tree.feature +54 -0
  27. {biblicus-0.1.1 → biblicus-0.3.0}/features/ingest_sources.feature +14 -0
  28. biblicus-0.3.0/features/integration_audio_samples.feature +13 -0
  29. biblicus-0.3.0/features/integration_image_samples.feature +11 -0
  30. biblicus-0.3.0/features/integration_mixed_corpus.feature +15 -0
  31. biblicus-0.3.0/features/integration_mixed_extraction.feature +15 -0
  32. biblicus-0.3.0/features/integration_ocr_image_extraction.feature +11 -0
  33. biblicus-0.3.0/features/integration_pdf_retrieval.feature +20 -0
  34. biblicus-0.3.0/features/integration_pdf_samples.feature +8 -0
  35. biblicus-0.3.0/features/integration_unstructured_extraction.feature +11 -0
  36. biblicus-0.3.0/features/lifecycle_hooks.feature +96 -0
  37. biblicus-0.3.0/features/ocr_extractor.feature +61 -0
  38. biblicus-0.3.0/features/pdf_text_extraction.feature +41 -0
  39. {biblicus-0.1.1 → biblicus-0.3.0}/features/python_api.feature +17 -0
  40. biblicus-0.3.0/features/python_hook_logging.feature +10 -0
  41. biblicus-0.3.0/features/retrieval_uses_extraction_run.feature +110 -0
  42. biblicus-0.3.0/features/source_loading.feature +9 -0
  43. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/backend_steps.py +3 -1
  44. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/cli_parsing_steps.py +20 -1
  45. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/cli_steps.py +263 -11
  46. biblicus-0.3.0/features/steps/extraction_steps.py +479 -0
  47. biblicus-0.3.0/features/steps/extractor_steps.py +97 -0
  48. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/frontmatter_steps.py +5 -2
  49. biblicus-0.3.0/features/steps/openai_steps.py +236 -0
  50. biblicus-0.3.0/features/steps/pdf_steps.py +115 -0
  51. biblicus-0.3.0/features/steps/python_api_steps.py +416 -0
  52. biblicus-0.3.0/features/steps/rapidocr_steps.py +145 -0
  53. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/retrieval_steps.py +82 -4
  54. biblicus-0.3.0/features/steps/stt_steps.py +93 -0
  55. biblicus-0.3.0/features/steps/unstructured_steps.py +143 -0
  56. biblicus-0.3.0/features/steps/user_config_steps.py +47 -0
  57. biblicus-0.3.0/features/streaming_ingest.feature +11 -0
  58. biblicus-0.3.0/features/stt_extractor.feature +139 -0
  59. biblicus-0.3.0/features/text_extraction_runs.feature +85 -0
  60. biblicus-0.3.0/features/unstructured_extractor.feature +62 -0
  61. biblicus-0.3.0/features/user_config.feature +39 -0
  62. {biblicus-0.1.1 → biblicus-0.3.0}/pyproject.toml +38 -3
  63. biblicus-0.3.0/scripts/download_audio_samples.py +200 -0
  64. biblicus-0.3.0/scripts/download_image_samples.py +180 -0
  65. biblicus-0.3.0/scripts/download_mixed_samples.py +239 -0
  66. biblicus-0.3.0/scripts/download_pdf_samples.py +136 -0
  67. {biblicus-0.1.1 → biblicus-0.3.0}/scripts/download_wikipedia.py +3 -1
  68. {biblicus-0.1.1 → biblicus-0.3.0}/scripts/test.py +35 -1
  69. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/__init__.py +2 -2
  70. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/__init__.py +14 -0
  71. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/interpolation.py +63 -0
  72. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/loader.py +181 -0
  73. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/transformer.py +135 -0
  74. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/backends/__init__.py +0 -2
  75. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/backends/base.py +3 -3
  76. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/backends/scan.py +96 -13
  77. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/backends/sqlite_full_text_search.py +74 -14
  78. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/cli.py +126 -19
  79. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/constants.py +2 -0
  80. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/corpus.py +455 -45
  81. biblicus-0.3.0/src/biblicus/errors.py +15 -0
  82. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/evaluation.py +4 -8
  83. biblicus-0.3.0/src/biblicus/extraction.py +529 -0
  84. biblicus-0.3.0/src/biblicus/extractors/__init__.py +44 -0
  85. biblicus-0.3.0/src/biblicus/extractors/base.py +68 -0
  86. biblicus-0.3.0/src/biblicus/extractors/metadata_text.py +106 -0
  87. biblicus-0.3.0/src/biblicus/extractors/openai_stt.py +180 -0
  88. biblicus-0.3.0/src/biblicus/extractors/pass_through_text.py +84 -0
  89. biblicus-0.3.0/src/biblicus/extractors/pdf_text.py +100 -0
  90. biblicus-0.3.0/src/biblicus/extractors/pipeline.py +105 -0
  91. biblicus-0.3.0/src/biblicus/extractors/rapidocr_text.py +129 -0
  92. biblicus-0.3.0/src/biblicus/extractors/select_longest_text.py +105 -0
  93. biblicus-0.3.0/src/biblicus/extractors/select_text.py +100 -0
  94. biblicus-0.3.0/src/biblicus/extractors/unstructured_text.py +100 -0
  95. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/frontmatter.py +0 -3
  96. biblicus-0.3.0/src/biblicus/hook_logging.py +180 -0
  97. biblicus-0.3.0/src/biblicus/hook_manager.py +203 -0
  98. biblicus-0.3.0/src/biblicus/hooks.py +261 -0
  99. biblicus-0.3.0/src/biblicus/ignore.py +64 -0
  100. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/models.py +107 -0
  101. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/retrieval.py +0 -4
  102. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/sources.py +85 -5
  103. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/time.py +0 -1
  104. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/uris.py +3 -4
  105. biblicus-0.3.0/src/biblicus/user_config.py +138 -0
  106. biblicus-0.3.0/src/biblicus.egg-info/PKG-INFO +336 -0
  107. biblicus-0.3.0/src/biblicus.egg-info/SOURCES.txt +131 -0
  108. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus.egg-info/requires.txt +12 -0
  109. biblicus-0.1.1/PKG-INFO +0 -174
  110. biblicus-0.1.1/README.md +0 -154
  111. biblicus-0.1.1/docs/conf.py +0 -31
  112. biblicus-0.1.1/features/steps/python_api_steps.py +0 -196
  113. biblicus-0.1.1/src/biblicus.egg-info/PKG-INFO +0 -174
  114. biblicus-0.1.1/src/biblicus.egg-info/SOURCES.txt +0 -60
  115. {biblicus-0.1.1 → biblicus-0.3.0}/LICENSE +0 -0
  116. {biblicus-0.1.1 → biblicus-0.3.0}/datasets/wikipedia_mini.json +0 -0
  117. {biblicus-0.1.1 → biblicus-0.3.0}/docs/ARCHITECTURE.md +0 -0
  118. {biblicus-0.1.1 → biblicus-0.3.0}/docs/api.rst +0 -0
  119. {biblicus-0.1.1 → biblicus-0.3.0}/features/backend_validation.feature +0 -0
  120. {biblicus-0.1.1 → biblicus-0.3.0}/features/biblicus_corpus.feature +0 -0
  121. {biblicus-0.1.1 → biblicus-0.3.0}/features/cli_entrypoint.feature +0 -0
  122. {biblicus-0.1.1 → biblicus-0.3.0}/features/corpus_edge_cases.feature +0 -0
  123. {biblicus-0.1.1 → biblicus-0.3.0}/features/corpus_identity.feature +0 -0
  124. {biblicus-0.1.1 → biblicus-0.3.0}/features/corpus_purge.feature +0 -0
  125. {biblicus-0.1.1 → biblicus-0.3.0}/features/error_cases.feature +0 -0
  126. {biblicus-0.1.1 → biblicus-0.3.0}/features/evaluation.feature +0 -0
  127. {biblicus-0.1.1 → biblicus-0.3.0}/features/frontmatter.feature +0 -0
  128. {biblicus-0.1.1 → biblicus-0.3.0}/features/integration_wikipedia.feature +0 -0
  129. {biblicus-0.1.1 → biblicus-0.3.0}/features/model_validation.feature +0 -0
  130. {biblicus-0.1.1 → biblicus-0.3.0}/features/retrieval_budget.feature +0 -0
  131. {biblicus-0.1.1 → biblicus-0.3.0}/features/retrieval_scan.feature +0 -0
  132. {biblicus-0.1.1 → biblicus-0.3.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  133. {biblicus-0.1.1 → biblicus-0.3.0}/features/retrieval_utilities.feature +0 -0
  134. {biblicus-0.1.1 → biblicus-0.3.0}/features/steps/model_steps.py +0 -0
  135. {biblicus-0.1.1 → biblicus-0.3.0}/setup.cfg +0 -0
  136. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus/__main__.py +0 -0
  137. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  138. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  139. {biblicus-0.1.1 → biblicus-0.3.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,5 +1,7 @@
1
1
  include README.md
2
2
  include LICENSE
3
+ include THIRD_PARTY_NOTICES.md
4
+ include .biblicus/config.example.yml
3
5
  include pyproject.toml
4
6
 
5
7
  recursive-include src *.py
@@ -0,0 +1,336 @@
1
+ Metadata-Version: 2.4
2
+ Name: biblicus
3
+ Version: 0.3.0
4
+ Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
+ License: MIT
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: PyYAML>=6.0
11
+ Requires-Dist: pypdf>=4.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: behave>=1.2.6; extra == "dev"
14
+ Requires-Dist: coverage[toml]>=7.0; extra == "dev"
15
+ Requires-Dist: sphinx>=7.0; extra == "dev"
16
+ Requires-Dist: myst-parser>=2.0; extra == "dev"
17
+ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
18
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
19
+ Requires-Dist: black>=24.0; extra == "dev"
20
+ Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
21
+ Provides-Extra: openai
22
+ Requires-Dist: openai>=1.0; extra == "openai"
23
+ Provides-Extra: unstructured
24
+ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
25
+ Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
26
+ Provides-Extra: ocr
27
+ Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
28
+ Dynamic: license-file
29
+
30
+ # Biblicus
31
+
32
+ ![Continuous integration][continuous-integration-badge]
33
+ ![Coverage][coverage-badge]
34
+ ![Documentation][documentation-badge]
35
+
36
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
37
+
38
+ If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
39
+
40
+ The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
41
+
42
+ This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
43
+
44
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
45
+
46
+ See [retrieval augmented generation overview] for a short introduction to the idea.
47
+
48
+ ## A beginner friendly mental model
49
+
50
+ Think in three stages.
51
+
52
+ - Ingest puts raw items into a corpus. This is file first and human inspectable.
53
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
54
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
55
+
56
+ If you learn a few project words, the rest of the system becomes predictable.
57
+
58
+ - Corpus is the folder that holds raw items and their metadata.
59
+ - Item is the raw bytes plus optional metadata and source information.
60
+ - Catalog is the rebuildable index of the corpus.
61
+ - Extraction run is a recorded extraction build that produces text artifacts.
62
+ - Backend is a pluggable retrieval implementation.
63
+ - Run is a recorded retrieval build for a corpus.
64
+ - Evidence is what retrieval returns, with identifiers and source information.
65
+
66
+ ## Diagram
67
+
68
+ This diagram shows how a corpus becomes evidence for an assistant.
69
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
70
+ The legend shows what the block styles mean.
71
+ Your code is where you decide how to turn evidence into context and how to call a model.
72
+
73
+ ```mermaid
74
+ %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
75
+ flowchart LR
76
+ subgraph Legend[Legend]
77
+ direction LR
78
+ LegendArtifact[Stored artifact or evidence]
79
+ LegendStep[Step]
80
+ LegendStable[Stable region]
81
+ LegendPluggable[Pluggable region]
82
+ LegendArtifact --- LegendStep
83
+ LegendStable --- LegendPluggable
84
+ end
85
+
86
+ subgraph Main[" "]
87
+ direction TB
88
+
89
+ subgraph StableCore[Stable core]
90
+ direction TB
91
+ Source[Source items] --> Ingest[Ingest]
92
+ Ingest --> Raw[Raw item files]
93
+ Raw --> Catalog[Catalog file]
94
+ end
95
+
96
+ subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
97
+ direction TB
98
+ Catalog --> Extract[Extract pipeline]
99
+ Extract --> ExtractedText[Extracted text artifacts]
100
+ ExtractedText --> ExtractionRun[Extraction run manifest]
101
+ end
102
+
103
+ subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
104
+ direction LR
105
+
106
+ subgraph BackendIngestionIndexing[Ingestion and indexing]
107
+ direction TB
108
+ ExtractionRun --> Build[Build run]
109
+ Build --> BackendIndex[Backend index]
110
+ BackendIndex --> Run[Run manifest]
111
+ end
112
+
113
+ subgraph BackendRetrievalGeneration[Retrieval and generation]
114
+ direction TB
115
+ Run --> Query[Query]
116
+ Query --> Evidence[Evidence]
117
+ end
118
+ end
119
+
120
+ Evidence --> Context
121
+
122
+ subgraph YourCode[Your code]
123
+ direction TB
124
+ Context[Assistant context] --> Model[Large language model call]
125
+ Model --> Answer[Answer]
126
+ end
127
+
128
+ style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
129
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
130
+ style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
131
+ style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
132
+ style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
133
+ style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
134
+
135
+ style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
136
+ style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
137
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
138
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
139
+ style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
140
+ style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
141
+ style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
142
+ style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
143
+ style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
144
+ style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
145
+
146
+ style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
147
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
148
+ style Build fill:#eceff1,stroke:#90a4ae,color:#111111
149
+ style Query fill:#eceff1,stroke:#90a4ae,color:#111111
150
+ style Model fill:#eceff1,stroke:#90a4ae,color:#111111
151
+ end
152
+
153
+ style Legend fill:#ffffff,stroke:#ffffff,color:#111111
154
+ style Main fill:#ffffff,stroke:#ffffff,color:#111111
155
+ style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
156
+ style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
157
+ style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
158
+ style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
159
+ ```
160
+
161
+ ## Practical value
162
+
163
+ - You can ingest raw material once, then try many retrieval approaches over time.
164
+ - You can keep raw files readable and portable, without locking your data inside a database.
165
+ - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
166
+
167
+ ## Typical flow
168
+
169
+ - Initialize a corpus folder.
170
+ - Ingest items from file paths, web addresses, or text input.
171
+ - Run extraction when you want derived text artifacts from non-text sources.
172
+ - Reindex to refresh the catalog after edits.
173
+ - Build a retrieval run with a backend.
174
+ - Query the run to collect evidence and evaluate it with datasets.
175
+
176
+ ## Install
177
+
178
+ This repository is a working Python package. Install it into a virtual environment from the repository root.
179
+
180
+ ```
181
+ python3 -m pip install -e .
182
+ ```
183
+
184
+ After the first release, you can install it from the Python Package Index.
185
+
186
+ ```
187
+ python3 -m pip install biblicus
188
+ ```
189
+
190
+ ### Optional extras
191
+
192
+ Some extractors are optional so the base install stays small.
193
+
194
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
195
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
196
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
197
+
198
+ ## Quick start
199
+
200
+ ```
201
+ mkdir -p notes
202
+ echo "A small file note" > notes/example.txt
203
+
204
+ biblicus init corpora/example
205
+ biblicus ingest --corpus corpora/example notes/example.txt
206
+ echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
207
+ biblicus list --corpus corpora/example
208
+ biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
209
+ biblicus build --corpus corpora/example --backend scan
210
+ biblicus query --corpus corpora/example --query "note"
211
+ ```
212
+
213
+ ## Python usage
214
+
215
+ From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
216
+
217
+ - Create a corpus with `Corpus.init` or open one with `Corpus.open`.
218
+ - Ingest notes with `Corpus.ingest_note`.
219
+ - Ingest files or web addresses with `Corpus.ingest_source`.
220
+ - List items with `Corpus.list_items`.
221
+ - Build a retrieval run with `get_backend` and `backend.build_run`.
222
+ - Query a run with `backend.query`.
223
+ - Evaluate with `evaluate_run`.
224
+
225
+ ## How it fits into an assistant
226
+
227
+ In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
228
+
229
+ - Use a corpus as the source of truth for raw items.
230
+ - Use a backend run to build any derived artifacts needed for retrieval.
231
+ - Use queries to obtain evidence objects.
232
+ - Convert evidence into the format your framework expects, such as message content, tool output, or citations.
233
+
234
+ ## Learn more
235
+
236
+ Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
237
+
238
+ The documents below are written to be read in order.
239
+
240
+ - [Architecture][architecture]
241
+ - [Roadmap][roadmap]
242
+ - [Feature index][feature-index]
243
+ - [Corpus][corpus]
244
+ - [Text extraction][text-extraction]
245
+ - [User configuration][user-configuration]
246
+ - [Backends][backends]
247
+ - [Demos][demos]
248
+ - [Testing][testing]
249
+
250
+ ## Metadata and catalog
251
+
252
+ Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
253
+
254
+ ## Corpus layout
255
+
256
+ ```
257
+ corpus/
258
+   raw/
259
+     item.bin
260
+     item.bin.biblicus.yml
261
+   .biblicus/
262
+     config.json
263
+     catalog.json
264
+     runs/
265
+       run-id.json
266
+ ```
267
+
268
+ ## Retrieval backends
269
+
270
+ Two backends are included.
271
+
272
+ - `scan` is a minimal baseline that scans raw items directly.
273
+ - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
274
+
275
+ ## Integration corpus and evaluation dataset
276
+
277
+ Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
278
+
279
+ The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
280
+
281
+ Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
282
+
283
+ ## Tests and coverage
284
+
285
+ ```
286
+ python3 scripts/test.py
287
+ ```
288
+
289
+ To include integration scenarios that download public test data at runtime, run this command.
290
+
291
+ ```
292
+ python3 scripts/test.py --integration
293
+ ```
294
+
295
+ ## Releases
296
+
297
+ Releases are automated from the main branch using semantic versioning and conventional commit messages.
298
+
299
+ The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
300
+
301
+ Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
302
+
303
+ ## Documentation
304
+
305
+ Reference documentation is generated from Sphinx-style docstrings.
306
+
307
+ Install development dependencies:
308
+
309
+ ```
310
+ python3 -m pip install -e ".[dev]"
311
+ ```
312
+
313
+ Build the documentation:
314
+
315
+ ```
316
+ python3 -m sphinx -b html docs docs/_build
317
+ ```
318
+
319
+ ## License
320
+
321
+ License terms are in `LICENSE`.
322
+
323
+ [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
324
+ [architecture]: docs/ARCHITECTURE.md
325
+ [roadmap]: docs/ROADMAP.md
326
+ [feature-index]: docs/FEATURE_INDEX.md
327
+ [corpus]: docs/CORPUS.md
328
+ [text-extraction]: docs/EXTRACTION.md
329
+ [user-configuration]: docs/USER_CONFIGURATION.md
330
+ [backends]: docs/BACKENDS.md
331
+ [demos]: docs/DEMOS.md
332
+ [testing]: docs/TESTING.md
333
+
334
+ [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
335
+ [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
336
+ [documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
@@ -0,0 +1,307 @@
1
+ # Biblicus
2
+
3
+ ![Continuous integration][continuous-integration-badge]
4
+ ![Coverage][coverage-badge]
5
+ ![Documentation][documentation-badge]
6
+
7
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
8
+
9
+ If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
10
+
11
+ The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
12
+
13
+ This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
14
+
15
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
16
+
17
+ See [retrieval augmented generation overview] for a short introduction to the idea.
18
+
19
+ ## A beginner friendly mental model
20
+
21
+ Think in three stages.
22
+
23
+ - Ingest puts raw items into a corpus. This is file first and human inspectable.
24
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
25
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
26
+
27
+ If you learn a few project words, the rest of the system becomes predictable.
28
+
29
+ - Corpus is the folder that holds raw items and their metadata.
30
+ - Item is the raw bytes plus optional metadata and source information.
31
+ - Catalog is the rebuildable index of the corpus.
32
+ - Extraction run is a recorded extraction build that produces text artifacts.
33
+ - Backend is a pluggable retrieval implementation.
34
+ - Run is a recorded retrieval build for a corpus.
35
+ - Evidence is what retrieval returns, with identifiers and source information.
36
+
37
+ ## Diagram
38
+
39
+ This diagram shows how a corpus becomes evidence for an assistant.
40
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
41
+ The legend shows what the block styles mean.
42
+ Your code is where you decide how to turn evidence into context and how to call a model.
43
+
44
+ ```mermaid
45
+ %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
46
+ flowchart LR
47
+ subgraph Legend[Legend]
48
+ direction LR
49
+ LegendArtifact[Stored artifact or evidence]
50
+ LegendStep[Step]
51
+ LegendStable[Stable region]
52
+ LegendPluggable[Pluggable region]
53
+ LegendArtifact --- LegendStep
54
+ LegendStable --- LegendPluggable
55
+ end
56
+
57
+ subgraph Main[" "]
58
+ direction TB
59
+
60
+ subgraph StableCore[Stable core]
61
+ direction TB
62
+ Source[Source items] --> Ingest[Ingest]
63
+ Ingest --> Raw[Raw item files]
64
+ Raw --> Catalog[Catalog file]
65
+ end
66
+
67
+ subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
68
+ direction TB
69
+ Catalog --> Extract[Extract pipeline]
70
+ Extract --> ExtractedText[Extracted text artifacts]
71
+ ExtractedText --> ExtractionRun[Extraction run manifest]
72
+ end
73
+
74
+ subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
75
+ direction LR
76
+
77
+ subgraph BackendIngestionIndexing[Ingestion and indexing]
78
+ direction TB
79
+ ExtractionRun --> Build[Build run]
80
+ Build --> BackendIndex[Backend index]
81
+ BackendIndex --> Run[Run manifest]
82
+ end
83
+
84
+ subgraph BackendRetrievalGeneration[Retrieval and generation]
85
+ direction TB
86
+ Run --> Query[Query]
87
+ Query --> Evidence[Evidence]
88
+ end
89
+ end
90
+
91
+ Evidence --> Context
92
+
93
+ subgraph YourCode[Your code]
94
+ direction TB
95
+ Context[Assistant context] --> Model[Large language model call]
96
+ Model --> Answer[Answer]
97
+ end
98
+
99
+ style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
100
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
101
+ style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
102
+ style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
103
+ style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
104
+ style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
105
+
106
+ style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
107
+ style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
108
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
+ style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
+ style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
+ style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
113
+ style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
114
+ style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
115
+ style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
+
117
+ style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
118
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
119
+ style Build fill:#eceff1,stroke:#90a4ae,color:#111111
120
+ style Query fill:#eceff1,stroke:#90a4ae,color:#111111
121
+ style Model fill:#eceff1,stroke:#90a4ae,color:#111111
122
+ end
123
+
124
+ style Legend fill:#ffffff,stroke:#ffffff,color:#111111
125
+ style Main fill:#ffffff,stroke:#ffffff,color:#111111
126
+ style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
127
+ style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
128
+ style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
129
+ style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
130
+ ```
131
+
132
+ ## Practical value
133
+
134
+ - You can ingest raw material once, then try many retrieval approaches over time.
135
+ - You can keep raw files readable and portable, without locking your data inside a database.
136
+ - You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
137
+
138
+ ## Typical flow
139
+
140
+ - Initialize a corpus folder.
141
+ - Ingest items from file paths, web addresses, or text input.
142
+ - Run extraction when you want derived text artifacts from non-text sources.
143
+ - Reindex to refresh the catalog after edits.
144
+ - Build a retrieval run with a backend.
145
+ - Query the run to collect evidence and evaluate it with datasets.
146
+
147
+ ## Install
148
+
149
+ This repository is a working Python package. Install it into a virtual environment from the repository root.
150
+
151
+ ```
152
+ python3 -m pip install -e .
153
+ ```
154
+
155
+ After the first release, you can install it from the Python Package Index.
156
+
157
+ ```
158
+ python3 -m pip install biblicus
159
+ ```
160
+
161
+ ### Optional extras
162
+
163
+ Some extractors are optional so the base install stays small.
164
+
165
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
166
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
167
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
168
+
169
+ ## Quick start
170
+
171
+ ```
172
+ mkdir -p notes
173
+ echo "A small file note" > notes/example.txt
174
+
175
+ biblicus init corpora/example
176
+ biblicus ingest --corpus corpora/example notes/example.txt
177
+ echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
178
+ biblicus list --corpus corpora/example
179
+ biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
180
+ biblicus build --corpus corpora/example --backend scan
181
+ biblicus query --corpus corpora/example --query "note"
182
+ ```
183
+
184
+ ## Python usage
185
+
186
+ From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
187
+
188
+ - Create a corpus with `Corpus.init` or open one with `Corpus.open`.
189
+ - Ingest notes with `Corpus.ingest_note`.
190
+ - Ingest files or web addresses with `Corpus.ingest_source`.
191
+ - List items with `Corpus.list_items`.
192
+ - Build a retrieval run with `get_backend` and `backend.build_run`.
193
+ - Query a run with `backend.query`.
194
+ - Evaluate with `evaluate_run`.
195
+
196
+ ## How it fits into an assistant
197
+
198
+ In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
199
+
200
+ - Use a corpus as the source of truth for raw items.
201
+ - Use a backend run to build any derived artifacts needed for retrieval.
202
+ - Use queries to obtain evidence objects.
203
+ - Convert evidence into the format your framework expects, such as message content, tool output, or citations.
204
+
205
+ ## Learn more
206
+
207
+ Full documentation is available on [Read the Docs](https://biblicus.readthedocs.io/).
208
+
209
+ The documents below are written to be read in order.
210
+
211
+ - [Architecture][architecture]
212
+ - [Roadmap][roadmap]
213
+ - [Feature index][feature-index]
214
+ - [Corpus][corpus]
215
+ - [Text extraction][text-extraction]
216
+ - [User configuration][user-configuration]
217
+ - [Backends][backends]
218
+ - [Demos][demos]
219
+ - [Testing][testing]
220
+
221
+ ## Metadata and catalog
222
+
223
+ Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
224
+
225
+ ## Corpus layout
226
+
227
+ ```
228
+ corpus/
229
+ raw/
230
+ item.bin
231
+ item.bin.biblicus.yml
232
+ .biblicus/
233
+ config.json
234
+ catalog.json
235
+ runs/
236
+ run-id.json
237
+ ```
238
+
239
+ ## Retrieval backends
240
+
241
+ Two backends are included.
242
+
243
+ - `scan` is a minimal baseline that scans raw items directly.
244
+ - `sqlite-full-text-search` is a practical baseline that builds a full-text search index in SQLite.
245
+
246
+ ## Integration corpus and evaluation dataset
247
+
248
+ Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
249
+
250
+ The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
251
+
252
+ Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
253
+
254
+ ## Tests and coverage
255
+
256
+ ```
257
+ python3 scripts/test.py
258
+ ```
259
+
260
+ To include integration scenarios that download public test data at runtime, run this command.
261
+
262
+ ```
263
+ python3 scripts/test.py --integration
264
+ ```
265
+
266
+ ## Releases
267
+
268
+ Releases are automated from the main branch using semantic versioning and conventional commit messages.
269
+
270
+ The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
271
+
272
+ Publishing uses a Python Package Index token stored in the GitHub secret named `PYPI_TOKEN`.
273
+
274
+ ## Documentation
275
+
276
+ Reference documentation is generated from Sphinx-style docstrings.
277
+
278
+ Install development dependencies:
279
+
280
+ ```
281
+ python3 -m pip install -e ".[dev]"
282
+ ```
283
+
284
+ Build the documentation:
285
+
286
+ ```
287
+ python3 -m sphinx -b html docs docs/_build
288
+ ```
289
+
290
+ ## License
291
+
292
+ License terms are in `LICENSE`.
293
+
294
+ [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
295
+ [architecture]: docs/ARCHITECTURE.md
296
+ [roadmap]: docs/ROADMAP.md
297
+ [feature-index]: docs/FEATURE_INDEX.md
298
+ [corpus]: docs/CORPUS.md
299
+ [text-extraction]: docs/EXTRACTION.md
300
+ [user-configuration]: docs/USER_CONFIGURATION.md
301
+ [backends]: docs/BACKENDS.md
302
+ [demos]: docs/DEMOS.md
303
+ [testing]: docs/TESTING.md
304
+
305
+ [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
306
+ [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
307
+ [documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
@@ -0,0 +1,36 @@
1
+ # Third-party notices
2
+
3
+ This project includes vendored third-party source code.
4
+
5
+ ## dotyaml
6
+
7
+ Portions of this repository vendor code from the `dotyaml` project.
8
+
9
+ - Project: `dotyaml`
10
+ - Source: `../dotyaml` (vendored into `src/biblicus/_vendor/dotyaml/`)
11
+ - License: MIT
12
+
13
+ ```
14
+ MIT License
15
+
16
+ Copyright (c) 2025 yamlenv
17
+
18
+ Permission is hereby granted, free of charge, to any person obtaining a copy
19
+ of this software and associated documentation files (the "Software"), to deal
20
+ in the Software without restriction, including without limitation the rights
21
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22
+ copies of the Software, and to permit persons to whom the Software is
23
+ furnished to do so, subject to the following conditions:
24
+
25
+ The above copyright notice and this permission notice shall be included in all
26
+ copies or substantial portions of the Software.
27
+
28
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34
+ SOFTWARE.
35
+ ```
36
+
@@ -27,6 +27,7 @@ Backends implement two operations:
27
27
  - Treat **runs** as immutable manifests with reproducible parameters.
28
28
  - If your backend needs artifacts, store them under `.biblicus/runs/` and record paths in `artifact_paths`.
29
29
  - Keep **text extraction** in explicit pipeline stages, not in backend ingestion.
30
+ See `docs/EXTRACTION.md` for how extraction runs are built and referenced from backend configs.
30
31
 
31
32
  ## Examples
32
33