biblicus 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. {biblicus-0.2.0 → biblicus-0.4.0}/MANIFEST.in +2 -0
  2. {biblicus-0.2.0 → biblicus-0.4.0}/PKG-INFO +96 -18
  3. biblicus-0.2.0/src/biblicus.egg-info/PKG-INFO → biblicus-0.4.0/README.md +86 -37
  4. biblicus-0.4.0/THIRD_PARTY_NOTICES.md +36 -0
  5. {biblicus-0.2.0 → biblicus-0.4.0}/docs/CORPUS.md +14 -1
  6. biblicus-0.2.0/docs/CORPUS_WORKFLOWS.md → biblicus-0.4.0/docs/CORPUS_DESIGN.md +2 -6
  7. biblicus-0.4.0/docs/DEMOS.md +374 -0
  8. biblicus-0.4.0/docs/EXTRACTION.md +203 -0
  9. biblicus-0.4.0/docs/FEATURE_INDEX.md +228 -0
  10. biblicus-0.4.0/docs/ROADMAP.md +200 -0
  11. biblicus-0.4.0/docs/TESTING.md +53 -0
  12. biblicus-0.4.0/docs/USER_CONFIGURATION.md +36 -0
  13. biblicus-0.4.0/docs/conf.py +55 -0
  14. {biblicus-0.2.0 → biblicus-0.4.0}/docs/index.rst +9 -3
  15. {biblicus-0.2.0 → biblicus-0.4.0}/features/cli_parsing.feature +5 -0
  16. {biblicus-0.2.0 → biblicus-0.4.0}/features/content_sniffing.feature +48 -0
  17. biblicus-0.4.0/features/crawl.feature +81 -0
  18. {biblicus-0.2.0 → biblicus-0.4.0}/features/environment.py +71 -0
  19. biblicus-0.4.0/features/extraction_error_handling.feature +32 -0
  20. biblicus-0.4.0/features/extraction_run_lifecycle.feature +117 -0
  21. biblicus-0.4.0/features/extraction_selection.feature +72 -0
  22. biblicus-0.4.0/features/extraction_selection_longest.feature +66 -0
  23. biblicus-0.4.0/features/extractor_pipeline.feature +105 -0
  24. biblicus-0.4.0/features/integration_audio_samples.feature +13 -0
  25. biblicus-0.4.0/features/integration_image_samples.feature +11 -0
  26. biblicus-0.4.0/features/integration_mixed_corpus.feature +15 -0
  27. biblicus-0.4.0/features/integration_mixed_extraction.feature +15 -0
  28. biblicus-0.4.0/features/integration_ocr_image_extraction.feature +11 -0
  29. biblicus-0.4.0/features/integration_pdf_retrieval.feature +20 -0
  30. biblicus-0.4.0/features/integration_unstructured_extraction.feature +11 -0
  31. biblicus-0.4.0/features/ocr_extractor.feature +61 -0
  32. biblicus-0.4.0/features/pdf_text_extraction.feature +41 -0
  33. {biblicus-0.2.0 → biblicus-0.4.0}/features/retrieval_uses_extraction_run.feature +17 -0
  34. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/backend_steps.py +3 -1
  35. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/cli_parsing_steps.py +20 -1
  36. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/cli_steps.py +67 -14
  37. biblicus-0.4.0/features/steps/crawl_steps.py +68 -0
  38. biblicus-0.4.0/features/steps/extraction_run_lifecycle_steps.py +148 -0
  39. biblicus-0.4.0/features/steps/extraction_steps.py +511 -0
  40. biblicus-0.4.0/features/steps/extractor_steps.py +97 -0
  41. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/frontmatter_steps.py +5 -2
  42. biblicus-0.4.0/features/steps/openai_steps.py +236 -0
  43. biblicus-0.4.0/features/steps/pdf_steps.py +115 -0
  44. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/python_api_steps.py +16 -14
  45. biblicus-0.4.0/features/steps/rapidocr_steps.py +145 -0
  46. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/retrieval_steps.py +68 -4
  47. biblicus-0.4.0/features/steps/stt_steps.py +93 -0
  48. biblicus-0.4.0/features/steps/unstructured_steps.py +143 -0
  49. biblicus-0.4.0/features/steps/user_config_steps.py +47 -0
  50. biblicus-0.4.0/features/stt_extractor.feature +139 -0
  51. {biblicus-0.2.0 → biblicus-0.4.0}/features/text_extraction_runs.feature +7 -7
  52. biblicus-0.4.0/features/unstructured_extractor.feature +62 -0
  53. biblicus-0.4.0/features/user_config.feature +39 -0
  54. {biblicus-0.2.0 → biblicus-0.4.0}/pyproject.toml +38 -3
  55. biblicus-0.4.0/scripts/download_audio_samples.py +200 -0
  56. biblicus-0.4.0/scripts/download_image_samples.py +180 -0
  57. biblicus-0.4.0/scripts/download_mixed_samples.py +239 -0
  58. {biblicus-0.2.0 → biblicus-0.4.0}/scripts/download_pdf_samples.py +6 -3
  59. {biblicus-0.2.0 → biblicus-0.4.0}/scripts/download_wikipedia.py +3 -1
  60. {biblicus-0.2.0 → biblicus-0.4.0}/scripts/test.py +24 -3
  61. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/__init__.py +2 -2
  62. biblicus-0.4.0/src/biblicus/_vendor/dotyaml/__init__.py +14 -0
  63. biblicus-0.4.0/src/biblicus/_vendor/dotyaml/interpolation.py +63 -0
  64. biblicus-0.4.0/src/biblicus/_vendor/dotyaml/loader.py +181 -0
  65. biblicus-0.4.0/src/biblicus/_vendor/dotyaml/transformer.py +135 -0
  66. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/backends/__init__.py +0 -2
  67. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/backends/base.py +3 -3
  68. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/backends/scan.py +21 -15
  69. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/backends/sqlite_full_text_search.py +14 -15
  70. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/cli.py +177 -53
  71. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/corpus.py +209 -59
  72. biblicus-0.4.0/src/biblicus/crawl.py +186 -0
  73. biblicus-0.4.0/src/biblicus/errors.py +15 -0
  74. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/evaluation.py +4 -8
  75. biblicus-0.4.0/src/biblicus/extraction.py +531 -0
  76. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/extractors/__init__.py +14 -3
  77. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/extractors/base.py +12 -5
  78. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/extractors/metadata_text.py +13 -5
  79. biblicus-0.4.0/src/biblicus/extractors/openai_stt.py +180 -0
  80. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/extractors/pass_through_text.py +16 -6
  81. biblicus-0.4.0/src/biblicus/extractors/pdf_text.py +100 -0
  82. biblicus-0.4.0/src/biblicus/extractors/pipeline.py +105 -0
  83. biblicus-0.4.0/src/biblicus/extractors/rapidocr_text.py +129 -0
  84. biblicus-0.4.0/src/biblicus/extractors/select_longest_text.py +105 -0
  85. biblicus-0.4.0/src/biblicus/extractors/select_text.py +100 -0
  86. biblicus-0.4.0/src/biblicus/extractors/unstructured_text.py +100 -0
  87. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/frontmatter.py +0 -3
  88. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/hook_logging.py +0 -5
  89. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/hook_manager.py +3 -5
  90. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/hooks.py +3 -7
  91. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/ignore.py +0 -3
  92. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/models.py +118 -0
  93. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/retrieval.py +0 -4
  94. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/sources.py +44 -9
  95. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/time.py +1 -2
  96. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/uris.py +3 -4
  97. biblicus-0.4.0/src/biblicus/user_config.py +138 -0
  98. biblicus-0.2.0/README.md → biblicus-0.4.0/src/biblicus.egg-info/PKG-INFO +115 -17
  99. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus.egg-info/SOURCES.txt +49 -4
  100. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus.egg-info/requires.txt +12 -0
  101. biblicus-0.2.0/docs/EXTRACTION.md +0 -86
  102. biblicus-0.2.0/docs/NEXT_STEPS.md +0 -309
  103. biblicus-0.2.0/docs/TESTING.md +0 -29
  104. biblicus-0.2.0/docs/conf.py +0 -31
  105. biblicus-0.2.0/features/extractor_pipeline.feature +0 -114
  106. biblicus-0.2.0/features/steps/extraction_steps.py +0 -238
  107. biblicus-0.2.0/features/steps/extractor_steps.py +0 -54
  108. biblicus-0.2.0/src/biblicus/extraction.py +0 -330
  109. biblicus-0.2.0/src/biblicus/extractors/cascade.py +0 -101
  110. {biblicus-0.2.0 → biblicus-0.4.0}/LICENSE +0 -0
  111. {biblicus-0.2.0 → biblicus-0.4.0}/datasets/wikipedia_mini.json +0 -0
  112. {biblicus-0.2.0 → biblicus-0.4.0}/docs/ARCHITECTURE.md +0 -0
  113. {biblicus-0.2.0 → biblicus-0.4.0}/docs/BACKENDS.md +0 -0
  114. {biblicus-0.2.0 → biblicus-0.4.0}/docs/api.rst +0 -0
  115. {biblicus-0.2.0 → biblicus-0.4.0}/features/backend_validation.feature +0 -0
  116. {biblicus-0.2.0 → biblicus-0.4.0}/features/biblicus_corpus.feature +0 -0
  117. {biblicus-0.2.0 → biblicus-0.4.0}/features/cli_entrypoint.feature +0 -0
  118. {biblicus-0.2.0 → biblicus-0.4.0}/features/corpus_edge_cases.feature +0 -0
  119. {biblicus-0.2.0 → biblicus-0.4.0}/features/corpus_identity.feature +0 -0
  120. {biblicus-0.2.0 → biblicus-0.4.0}/features/corpus_purge.feature +0 -0
  121. {biblicus-0.2.0 → biblicus-0.4.0}/features/error_cases.feature +0 -0
  122. {biblicus-0.2.0 → biblicus-0.4.0}/features/evaluation.feature +0 -0
  123. {biblicus-0.2.0 → biblicus-0.4.0}/features/extractor_validation.feature +0 -0
  124. {biblicus-0.2.0 → biblicus-0.4.0}/features/frontmatter.feature +0 -0
  125. {biblicus-0.2.0 → biblicus-0.4.0}/features/hook_config_validation.feature +0 -0
  126. {biblicus-0.2.0 → biblicus-0.4.0}/features/hook_error_handling.feature +0 -0
  127. {biblicus-0.2.0 → biblicus-0.4.0}/features/import_tree.feature +0 -0
  128. {biblicus-0.2.0 → biblicus-0.4.0}/features/ingest_sources.feature +0 -0
  129. {biblicus-0.2.0 → biblicus-0.4.0}/features/integration_pdf_samples.feature +0 -0
  130. {biblicus-0.2.0 → biblicus-0.4.0}/features/integration_wikipedia.feature +0 -0
  131. {biblicus-0.2.0 → biblicus-0.4.0}/features/lifecycle_hooks.feature +0 -0
  132. {biblicus-0.2.0 → biblicus-0.4.0}/features/model_validation.feature +0 -0
  133. {biblicus-0.2.0 → biblicus-0.4.0}/features/python_api.feature +0 -0
  134. {biblicus-0.2.0 → biblicus-0.4.0}/features/python_hook_logging.feature +0 -0
  135. {biblicus-0.2.0 → biblicus-0.4.0}/features/retrieval_budget.feature +0 -0
  136. {biblicus-0.2.0 → biblicus-0.4.0}/features/retrieval_scan.feature +0 -0
  137. {biblicus-0.2.0 → biblicus-0.4.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  138. {biblicus-0.2.0 → biblicus-0.4.0}/features/retrieval_utilities.feature +0 -0
  139. {biblicus-0.2.0 → biblicus-0.4.0}/features/source_loading.feature +0 -0
  140. {biblicus-0.2.0 → biblicus-0.4.0}/features/steps/model_steps.py +0 -0
  141. {biblicus-0.2.0 → biblicus-0.4.0}/features/streaming_ingest.feature +0 -0
  142. {biblicus-0.2.0 → biblicus-0.4.0}/setup.cfg +0 -0
  143. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/__main__.py +0 -0
  144. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus/constants.py +0 -0
  145. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  146. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  147. {biblicus-0.2.0 → biblicus-0.4.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,5 +1,7 @@
1
1
  include README.md
2
2
  include LICENSE
3
+ include THIRD_PARTY_NOTICES.md
4
+ include .biblicus/config.example.yml
3
5
  include pyproject.toml
4
6
 
5
7
  recursive-include src *.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -8,20 +8,30 @@ Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: pydantic>=2.0
10
10
  Requires-Dist: PyYAML>=6.0
11
+ Requires-Dist: pypdf>=4.0
11
12
  Provides-Extra: dev
12
13
  Requires-Dist: behave>=1.2.6; extra == "dev"
13
14
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
14
15
  Requires-Dist: sphinx>=7.0; extra == "dev"
15
16
  Requires-Dist: myst-parser>=2.0; extra == "dev"
17
+ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
16
18
  Requires-Dist: ruff>=0.4.0; extra == "dev"
17
19
  Requires-Dist: black>=24.0; extra == "dev"
18
20
  Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
21
+ Provides-Extra: openai
22
+ Requires-Dist: openai>=1.0; extra == "openai"
23
+ Provides-Extra: unstructured
24
+ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
25
+ Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
26
+ Provides-Extra: ocr
27
+ Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
19
28
  Dynamic: license-file
20
29
 
21
30
  # Biblicus
22
31
 
23
32
  ![Continuous integration][continuous-integration-badge]
24
33
  ![Coverage][coverage-badge]
34
+ ![Documentation][documentation-badge]
25
35
 
26
36
  Make your documents usable by your assistant, then decide later how you will search and retrieve them.
27
37
 
@@ -31,28 +41,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
31
41
 
32
42
  This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
33
43
 
34
- It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
44
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
35
45
 
36
46
  See [retrieval augmented generation overview] for a short introduction to the idea.
37
47
 
38
- ## The framework
48
+ ## A beginner-friendly mental model
39
49
 
40
- The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
50
+ Think in three stages.
51
+
52
+ - Ingest puts raw items into a corpus. This is file-first and human-inspectable.
53
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
54
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
55
+
56
+ If you learn a few project words, the rest of the system becomes predictable.
41
57
 
42
58
  - Corpus is the folder that holds raw items and their metadata.
43
- - Item is the raw bytes of a document or other artifact, plus its source.
59
+ - Item is the raw bytes plus optional metadata and source information.
44
60
  - Catalog is the rebuildable index of the corpus.
45
- - Evidence is what retrieval returns, ready to be turned into context for a large language model.
46
- - Run is a recorded retrieval build for a corpus.
61
+ - Extraction run is a recorded extraction build that produces text artifacts.
47
62
  - Backend is a pluggable retrieval implementation.
48
- - Recipe is a named configuration for a backend.
49
- - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
63
+ - Run is a recorded retrieval build for a corpus.
64
+ - Evidence is what retrieval returns, with identifiers and source information.
50
65
 
51
66
  ## Diagram
52
67
 
53
68
  This diagram shows how a corpus becomes evidence for an assistant.
54
- The legend shows what the border styles and fill styles mean.
55
- The your code region is where you decide how to turn evidence into context and how to call a model.
69
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
70
+ The legend shows what the block styles mean.
71
+ Your code is where you decide how to turn evidence into context and how to call a model.
56
72
 
57
73
  ```mermaid
58
74
  %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -74,12 +90,19 @@ flowchart LR
74
90
  Raw --> Catalog[Catalog file]
75
91
  end
76
92
 
77
- subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
93
+ subgraph PluggableExtractionPipeline[Pluggable: extraction pipeline]
94
+ direction TB
95
+ Catalog --> Extract[Extract pipeline]
96
+ Extract --> ExtractedText[Extracted text artifacts]
97
+ ExtractedText --> ExtractionRun[Extraction run manifest]
98
+ end
99
+
100
+ subgraph PluggableRetrievalBackend[Pluggable: retrieval backend]
78
101
  direction LR
79
102
 
80
103
  subgraph BackendIngestionIndexing[Ingestion and indexing]
81
104
  direction TB
82
- Catalog --> Build[Build run]
105
+ ExtractionRun --> Build[Build run]
83
106
  Build --> BackendIndex[Backend index]
84
107
  BackendIndex --> Run[Run manifest]
85
108
  end
@@ -100,6 +123,7 @@ flowchart LR
100
123
  end
101
124
 
102
125
  style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
126
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
103
127
  style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
104
128
  style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
105
129
  style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
@@ -107,6 +131,8 @@ flowchart LR
107
131
 
108
132
  style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
133
  style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
134
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
135
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
136
  style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
137
  style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
138
  style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
@@ -115,6 +141,7 @@ flowchart LR
115
141
  style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
142
 
117
143
  style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
144
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
118
145
  style Build fill:#eceff1,stroke:#90a4ae,color:#111111
119
146
  style Query fill:#eceff1,stroke:#90a4ae,color:#111111
120
147
  style Model fill:#eceff1,stroke:#90a4ae,color:#111111
@@ -136,6 +163,8 @@ flowchart LR
136
163
 
137
164
  - Initialize a corpus folder.
138
165
  - Ingest items from file paths, web addresses, or text input.
166
+ - Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
167
+ - Run extraction when you want derived text artifacts from non-text sources.
139
168
  - Reindex to refresh the catalog after edits.
140
169
  - Build a retrieval run with a backend.
141
170
  - Query the run to collect evidence and evaluate it with datasets.
@@ -154,17 +183,40 @@ After the first release, you can install it from Python Package Index.
154
183
  python3 -m pip install biblicus
155
184
  ```
156
185
 
186
+ ### Optional extras
187
+
188
+ Some extractors are optional so the base install stays small.
189
+
190
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
191
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
192
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
193
+
157
194
  ## Quick start
158
195
 
159
196
  ```
197
+ mkdir -p notes
198
+ echo "A small file note" > notes/example.txt
199
+
160
200
  biblicus init corpora/example
161
201
  biblicus ingest --corpus corpora/example notes/example.txt
162
202
  echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
163
203
  biblicus list --corpus corpora/example
204
+ biblicus extract build --corpus corpora/example --step pass-through-text --step metadata-text
205
+ biblicus extract list --corpus corpora/example
164
206
  biblicus build --corpus corpora/example --backend scan
165
207
  biblicus query --corpus corpora/example --query "note"
166
208
  ```
167
209
 
210
+ If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
211
+
212
+ ```
213
+ biblicus crawl --corpus corpora/example \
214
+ --root-url https://example.com/docs/index.html \
215
+ --allowed-prefix https://example.com/docs/ \
216
+ --max-items 50 \
217
+ --tag crawled
218
+ ```
219
+
168
220
  ## Python usage
169
221
 
170
222
  From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
@@ -188,13 +240,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
188
240
 
189
241
  ## Learn more
190
242
 
243
+ Full documentation is published on GitHub Pages: https://anthusai.github.io/Biblicus/
244
+
191
245
  The documents below are written to be read in order.
192
246
 
193
247
  - [Architecture][architecture]
248
+ - [Roadmap][roadmap]
249
+ - [Feature index][feature-index]
194
250
  - [Corpus][corpus]
195
251
  - [Text extraction][text-extraction]
252
+ - [User configuration][user-configuration]
196
253
  - [Backends][backends]
197
- - [Next steps][next-steps]
254
+ - [Demos][demos]
198
255
  - [Testing][testing]
199
256
 
200
257
  ## Metadata and catalog
@@ -212,7 +269,16 @@ corpus/
212
269
  config.json
213
270
  catalog.json
214
271
  runs/
215
- run-id.json
272
+ extraction/
273
+ pipeline/
274
+ <run id>/
275
+ manifest.json
276
+ text/
277
+ <item id>.txt
278
+ retrieval/
279
+ <backend id>/
280
+ <run id>/
281
+ manifest.json
216
282
  ```
217
283
 
218
284
  ## Retrieval backends
@@ -252,10 +318,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
252
318
 
253
319
  ## Documentation
254
320
 
255
- Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
321
+ Reference documentation is generated from Sphinx style docstrings.
322
+
323
+ Install development dependencies:
324
+
325
+ ```
326
+ python3 -m pip install -e ".[dev]"
327
+ ```
328
+
329
+ Build the documentation:
256
330
 
257
331
  ```
258
- sphinx-build -b html docs docs/_build
332
+ python3 -m sphinx -b html docs docs/_build/html
259
333
  ```
260
334
 
261
335
  ## License
@@ -264,11 +338,15 @@ License terms are in `LICENSE`.
264
338
 
265
339
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
266
340
  [architecture]: docs/ARCHITECTURE.md
341
+ [roadmap]: docs/ROADMAP.md
342
+ [feature-index]: docs/FEATURE_INDEX.md
267
343
  [corpus]: docs/CORPUS.md
268
344
  [text-extraction]: docs/EXTRACTION.md
345
+ [user-configuration]: docs/USER_CONFIGURATION.md
269
346
  [backends]: docs/BACKENDS.md
270
- [next-steps]: docs/NEXT_STEPS.md
347
+ [demos]: docs/DEMOS.md
271
348
  [testing]: docs/TESTING.md
272
349
 
273
350
  [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
274
351
  [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
352
+ [documentation-badge]: https://img.shields.io/badge/docs-GitHub%20Pages-blue
@@ -1,27 +1,8 @@
1
- Metadata-Version: 2.4
2
- Name: biblicus
3
- Version: 0.2.0
4
- Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
- License: MIT
6
- Requires-Python: >=3.9
7
- Description-Content-Type: text/markdown
8
- License-File: LICENSE
9
- Requires-Dist: pydantic>=2.0
10
- Requires-Dist: PyYAML>=6.0
11
- Provides-Extra: dev
12
- Requires-Dist: behave>=1.2.6; extra == "dev"
13
- Requires-Dist: coverage[toml]>=7.0; extra == "dev"
14
- Requires-Dist: sphinx>=7.0; extra == "dev"
15
- Requires-Dist: myst-parser>=2.0; extra == "dev"
16
- Requires-Dist: ruff>=0.4.0; extra == "dev"
17
- Requires-Dist: black>=24.0; extra == "dev"
18
- Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
19
- Dynamic: license-file
20
-
21
1
  # Biblicus
22
2
 
23
3
  ![Continuous integration][continuous-integration-badge]
24
4
  ![Coverage][coverage-badge]
5
+ ![Documentation][documentation-badge]
25
6
 
26
7
  Make your documents usable by your assistant, then decide later how you will search and retrieve them.
27
8
 
@@ -31,28 +12,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
31
12
 
32
13
  This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
33
14
 
34
- It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
15
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
35
16
 
36
17
  See [retrieval augmented generation overview] for a short introduction to the idea.
37
18
 
38
- ## The framework
19
+ ## A beginner-friendly mental model
20
+
21
+ Think in three stages.
22
+
23
+ - Ingest puts raw items into a corpus. This is file-first and human-inspectable.
24
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
25
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
39
26
 
40
- The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
27
+ If you learn a few project words, the rest of the system becomes predictable.
41
28
 
42
29
  - Corpus is the folder that holds raw items and their metadata.
43
- - Item is the raw bytes of a document or other artifact, plus its source.
30
+ - Item is the raw bytes plus optional metadata and source information.
44
31
  - Catalog is the rebuildable index of the corpus.
45
- - Evidence is what retrieval returns, ready to be turned into context for a large language model.
46
- - Run is a recorded retrieval build for a corpus.
32
+ - Extraction run is a recorded extraction build that produces text artifacts.
47
33
  - Backend is a pluggable retrieval implementation.
48
- - Recipe is a named configuration for a backend.
49
- - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
34
+ - Run is a recorded retrieval build for a corpus.
35
+ - Evidence is what retrieval returns, with identifiers and source information.
50
36
 
51
37
  ## Diagram
52
38
 
53
39
  This diagram shows how a corpus becomes evidence for an assistant.
54
- The legend shows what the border styles and fill styles mean.
55
- The your code region is where you decide how to turn evidence into context and how to call a model.
40
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
41
+ The legend shows what the block styles mean.
42
+ Your code is where you decide how to turn evidence into context and how to call a model.
56
43
 
57
44
  ```mermaid
58
45
  %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -74,12 +61,19 @@ flowchart LR
74
61
  Raw --> Catalog[Catalog file]
75
62
  end
76
63
 
77
- subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
64
+ subgraph PluggableExtractionPipeline[Pluggable: extraction pipeline]
65
+ direction TB
66
+ Catalog --> Extract[Extract pipeline]
67
+ Extract --> ExtractedText[Extracted text artifacts]
68
+ ExtractedText --> ExtractionRun[Extraction run manifest]
69
+ end
70
+
71
+ subgraph PluggableRetrievalBackend[Pluggable: retrieval backend]
78
72
  direction LR
79
73
 
80
74
  subgraph BackendIngestionIndexing[Ingestion and indexing]
81
75
  direction TB
82
- Catalog --> Build[Build run]
76
+ ExtractionRun --> Build[Build run]
83
77
  Build --> BackendIndex[Backend index]
84
78
  BackendIndex --> Run[Run manifest]
85
79
  end
@@ -100,6 +94,7 @@ flowchart LR
100
94
  end
101
95
 
102
96
  style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
97
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
103
98
  style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
104
99
  style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
105
100
  style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
@@ -107,6 +102,8 @@ flowchart LR
107
102
 
108
103
  style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
104
  style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
105
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
106
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
107
  style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
108
  style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
109
  style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
@@ -115,6 +112,7 @@ flowchart LR
115
112
  style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
113
 
117
114
  style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
115
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
118
116
  style Build fill:#eceff1,stroke:#90a4ae,color:#111111
119
117
  style Query fill:#eceff1,stroke:#90a4ae,color:#111111
120
118
  style Model fill:#eceff1,stroke:#90a4ae,color:#111111
@@ -136,6 +134,8 @@ flowchart LR
136
134
 
137
135
  - Initialize a corpus folder.
138
136
  - Ingest items from file paths, web addresses, or text input.
137
+ - Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
138
+ - Run extraction when you want derived text artifacts from non-text sources.
139
139
  - Reindex to refresh the catalog after edits.
140
140
  - Build a retrieval run with a backend.
141
141
  - Query the run to collect evidence and evaluate it with datasets.
@@ -154,17 +154,40 @@ After the first release, you can install it from Python Package Index.
154
154
  python3 -m pip install biblicus
155
155
  ```
156
156
 
157
+ ### Optional extras
158
+
159
+ Some extractors are optional so the base install stays small.
160
+
161
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
162
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
163
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
164
+
157
165
  ## Quick start
158
166
 
159
167
  ```
168
+ mkdir -p notes
169
+ echo "A small file note" > notes/example.txt
170
+
160
171
  biblicus init corpora/example
161
172
  biblicus ingest --corpus corpora/example notes/example.txt
162
173
  echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
163
174
  biblicus list --corpus corpora/example
175
+ biblicus extract build --corpus corpora/example --step pass-through-text --step metadata-text
176
+ biblicus extract list --corpus corpora/example
164
177
  biblicus build --corpus corpora/example --backend scan
165
178
  biblicus query --corpus corpora/example --query "note"
166
179
  ```
167
180
 
181
+ If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
182
+
183
+ ```
184
+ biblicus crawl --corpus corpora/example \
185
+ --root-url https://example.com/docs/index.html \
186
+ --allowed-prefix https://example.com/docs/ \
187
+ --max-items 50 \
188
+ --tag crawled
189
+ ```
190
+
168
191
  ## Python usage
169
192
 
170
193
  From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
@@ -188,13 +211,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
188
211
 
189
212
  ## Learn more
190
213
 
214
+ Full documentation is published on GitHub Pages: https://anthusai.github.io/Biblicus/
215
+
191
216
  The documents below are written to be read in order.
192
217
 
193
218
  - [Architecture][architecture]
219
+ - [Roadmap][roadmap]
220
+ - [Feature index][feature-index]
194
221
  - [Corpus][corpus]
195
222
  - [Text extraction][text-extraction]
223
+ - [User configuration][user-configuration]
196
224
  - [Backends][backends]
197
- - [Next steps][next-steps]
225
+ - [Demos][demos]
198
226
  - [Testing][testing]
199
227
 
200
228
  ## Metadata and catalog
@@ -212,7 +240,16 @@ corpus/
212
240
  config.json
213
241
  catalog.json
214
242
  runs/
215
- run-id.json
243
+ extraction/
244
+ pipeline/
245
+ <run id>/
246
+ manifest.json
247
+ text/
248
+ <item id>.txt
249
+ retrieval/
250
+ <backend id>/
251
+ <run id>/
252
+ manifest.json
216
253
  ```
217
254
 
218
255
  ## Retrieval backends
@@ -252,10 +289,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
252
289
 
253
290
  ## Documentation
254
291
 
255
- Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
292
+ Reference documentation is generated from Sphinx-style docstrings.
293
+
294
+ Install development dependencies:
295
+
296
+ ```
297
+ python3 -m pip install -e ".[dev]"
298
+ ```
299
+
300
+ Build the documentation:
256
301
 
257
302
  ```
258
- sphinx-build -b html docs docs/_build
303
+ python3 -m sphinx -b html docs docs/_build/html
259
304
  ```
260
305
 
261
306
  ## License
@@ -264,11 +309,15 @@ License terms are in `LICENSE`.
264
309
 
265
310
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
266
311
  [architecture]: docs/ARCHITECTURE.md
312
+ [roadmap]: docs/ROADMAP.md
313
+ [feature-index]: docs/FEATURE_INDEX.md
267
314
  [corpus]: docs/CORPUS.md
268
315
  [text-extraction]: docs/EXTRACTION.md
316
+ [user-configuration]: docs/USER_CONFIGURATION.md
269
317
  [backends]: docs/BACKENDS.md
270
- [next-steps]: docs/NEXT_STEPS.md
318
+ [demos]: docs/DEMOS.md
271
319
  [testing]: docs/TESTING.md
272
320
 
273
321
  [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
274
322
  [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
323
+ [documentation-badge]: https://img.shields.io/badge/docs-GitHub%20Pages-blue
@@ -0,0 +1,36 @@
1
+ # Third-party notices
2
+
3
+ This project includes vendored third-party source code.
4
+
5
+ ## dotyaml
6
+
7
+ Portions of this repository vendor code from the `dotyaml` project.
8
+
9
+ - Project: `dotyaml`
10
+ - Source: `../dotyaml` (vendored into `src/biblicus/_vendor/dotyaml/`)
11
+ - License: MIT
12
+
13
+ ```
14
+ MIT License
15
+
16
+ Copyright (c) 2025 yamlenv
17
+
18
+ Permission is hereby granted, free of charge, to any person obtaining a copy
19
+ of this software and associated documentation files (the "Software"), to deal
20
+ in the Software without restriction, including without limitation the rights
21
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22
+ copies of the Software, and to permit persons to whom the Software is
23
+ furnished to do so, subject to the following conditions:
24
+
25
+ The above copyright notice and this permission notice shall be included in all
26
+ copies or substantial portions of the Software.
27
+
28
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34
+ SOFTWARE.
35
+ ```
36
+
@@ -43,6 +43,20 @@ Ingest a web address:
43
43
  python3 -m biblicus ingest --corpus corpora/example https://example.com --tag web
44
44
  ```
45
45
 
46
+ ## Crawl a website prefix
47
+
48
+ To build a corpus from a website section, crawl a root web address and restrict the crawl to an allowed prefix.
49
+
50
+ ```
51
+ python3 -m biblicus crawl --corpus corpora/example \
52
+ --root-url https://example.com/docs/index.html \
53
+ --allowed-prefix https://example.com/docs/ \
54
+ --max-items 50 \
55
+ --tag crawled
56
+ ```
57
+
58
+ The crawl command only follows links within the allowed prefix, and it honors `.biblicusignore` patterns, which are matched against each path relative to the allowed prefix.
59
+
46
60
  Ingest a text note:
47
61
 
48
62
  ```
@@ -100,4 +114,3 @@ Purging deletes all items and derived artifacts under the corpus. It requires yo
100
114
  ```
101
115
  python3 -m biblicus purge --corpus corpora/example --confirm example
102
116
  ```
103
-
@@ -1,13 +1,9 @@
1
- # Corpus workflows and lifecycle hooks
1
+ # Corpus design
2
2
 
3
- This document records the design decisions and outcomes for corpus management and lifecycle hooks in version zero. It is written in a decision format because the long term shape of the library is determined by corpus workflows more than by any particular retrieval backend.
3
+ This document records design decisions and outcomes for corpus management and lifecycle hooks in version zero.
4
4
 
5
5
  The goal is to make corpus management practical for day-to-day use, while keeping the raw corpus durable and readable as ordinary files on disk.
6
6
 
7
- ## Initiative constraints
8
-
9
- The project uses strict behavior driven development. Behavior specifications in `features/*.feature` are the authoritative definition of system behavior.
10
-
11
7
  ## What exists today
12
8
 
13
9
  The project already supports: