biblicus-0.2.0.tar.gz → biblicus-0.3.0.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {biblicus-0.2.0 → biblicus-0.3.0}/MANIFEST.in +2 -0
  2. {biblicus-0.2.0 → biblicus-0.3.0}/PKG-INFO +78 -16
  3. biblicus-0.2.0/src/biblicus.egg-info/PKG-INFO → biblicus-0.3.0/README.md +68 -35
  4. biblicus-0.3.0/THIRD_PARTY_NOTICES.md +36 -0
  5. biblicus-0.2.0/docs/CORPUS_WORKFLOWS.md → biblicus-0.3.0/docs/CORPUS_DESIGN.md +2 -6
  6. biblicus-0.3.0/docs/DEMOS.md +334 -0
  7. biblicus-0.3.0/docs/EXTRACTION.md +186 -0
  8. biblicus-0.3.0/docs/FEATURE_INDEX.md +228 -0
  9. biblicus-0.3.0/docs/ROADMAP.md +174 -0
  10. biblicus-0.3.0/docs/TESTING.md +53 -0
  11. biblicus-0.3.0/docs/USER_CONFIGURATION.md +36 -0
  12. biblicus-0.3.0/docs/conf.py +56 -0
  13. {biblicus-0.2.0 → biblicus-0.3.0}/docs/index.rst +8 -2
  14. {biblicus-0.2.0 → biblicus-0.3.0}/features/cli_parsing.feature +5 -0
  15. {biblicus-0.2.0 → biblicus-0.3.0}/features/content_sniffing.feature +48 -0
  16. {biblicus-0.2.0 → biblicus-0.3.0}/features/environment.py +71 -0
  17. biblicus-0.3.0/features/extraction_error_handling.feature +32 -0
  18. biblicus-0.3.0/features/extraction_selection.feature +72 -0
  19. biblicus-0.3.0/features/extraction_selection_longest.feature +66 -0
  20. biblicus-0.3.0/features/extractor_pipeline.feature +105 -0
  21. biblicus-0.3.0/features/integration_audio_samples.feature +13 -0
  22. biblicus-0.3.0/features/integration_image_samples.feature +11 -0
  23. biblicus-0.3.0/features/integration_mixed_corpus.feature +15 -0
  24. biblicus-0.3.0/features/integration_mixed_extraction.feature +15 -0
  25. biblicus-0.3.0/features/integration_ocr_image_extraction.feature +11 -0
  26. biblicus-0.3.0/features/integration_pdf_retrieval.feature +20 -0
  27. biblicus-0.3.0/features/integration_unstructured_extraction.feature +11 -0
  28. biblicus-0.3.0/features/ocr_extractor.feature +61 -0
  29. biblicus-0.3.0/features/pdf_text_extraction.feature +41 -0
  30. {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_uses_extraction_run.feature +17 -0
  31. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/backend_steps.py +3 -1
  32. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/cli_parsing_steps.py +20 -1
  33. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/cli_steps.py +67 -14
  34. biblicus-0.3.0/features/steps/extraction_steps.py +479 -0
  35. biblicus-0.3.0/features/steps/extractor_steps.py +97 -0
  36. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/frontmatter_steps.py +5 -2
  37. biblicus-0.3.0/features/steps/openai_steps.py +236 -0
  38. biblicus-0.3.0/features/steps/pdf_steps.py +115 -0
  39. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/python_api_steps.py +16 -14
  40. biblicus-0.3.0/features/steps/rapidocr_steps.py +145 -0
  41. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/retrieval_steps.py +68 -4
  42. biblicus-0.3.0/features/steps/stt_steps.py +93 -0
  43. biblicus-0.3.0/features/steps/unstructured_steps.py +143 -0
  44. biblicus-0.3.0/features/steps/user_config_steps.py +47 -0
  45. biblicus-0.3.0/features/stt_extractor.feature +139 -0
  46. {biblicus-0.2.0 → biblicus-0.3.0}/features/text_extraction_runs.feature +6 -6
  47. biblicus-0.3.0/features/unstructured_extractor.feature +62 -0
  48. biblicus-0.3.0/features/user_config.feature +39 -0
  49. {biblicus-0.2.0 → biblicus-0.3.0}/pyproject.toml +38 -3
  50. biblicus-0.3.0/scripts/download_audio_samples.py +200 -0
  51. biblicus-0.3.0/scripts/download_image_samples.py +180 -0
  52. biblicus-0.3.0/scripts/download_mixed_samples.py +239 -0
  53. {biblicus-0.2.0 → biblicus-0.3.0}/scripts/download_pdf_samples.py +6 -3
  54. {biblicus-0.2.0 → biblicus-0.3.0}/scripts/download_wikipedia.py +3 -1
  55. {biblicus-0.2.0 → biblicus-0.3.0}/scripts/test.py +24 -3
  56. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/__init__.py +2 -2
  57. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/__init__.py +14 -0
  58. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/interpolation.py +63 -0
  59. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/loader.py +181 -0
  60. biblicus-0.3.0/src/biblicus/_vendor/dotyaml/transformer.py +135 -0
  61. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/__init__.py +0 -2
  62. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/base.py +3 -3
  63. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/scan.py +21 -15
  64. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/sqlite_full_text_search.py +14 -15
  65. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/cli.py +33 -49
  66. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/corpus.py +39 -58
  67. biblicus-0.3.0/src/biblicus/errors.py +15 -0
  68. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/evaluation.py +4 -8
  69. biblicus-0.3.0/src/biblicus/extraction.py +529 -0
  70. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/__init__.py +14 -3
  71. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/base.py +12 -5
  72. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/metadata_text.py +13 -5
  73. biblicus-0.3.0/src/biblicus/extractors/openai_stt.py +180 -0
  74. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/pass_through_text.py +16 -6
  75. biblicus-0.3.0/src/biblicus/extractors/pdf_text.py +100 -0
  76. biblicus-0.3.0/src/biblicus/extractors/pipeline.py +105 -0
  77. biblicus-0.3.0/src/biblicus/extractors/rapidocr_text.py +129 -0
  78. biblicus-0.3.0/src/biblicus/extractors/select_longest_text.py +105 -0
  79. biblicus-0.3.0/src/biblicus/extractors/select_text.py +100 -0
  80. biblicus-0.3.0/src/biblicus/extractors/unstructured_text.py +100 -0
  81. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/frontmatter.py +0 -3
  82. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hook_logging.py +0 -5
  83. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hook_manager.py +3 -5
  84. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hooks.py +3 -7
  85. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/ignore.py +0 -3
  86. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/models.py +87 -0
  87. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/retrieval.py +0 -4
  88. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/sources.py +44 -9
  89. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/time.py +0 -1
  90. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/uris.py +3 -4
  91. biblicus-0.3.0/src/biblicus/user_config.py +138 -0
  92. biblicus-0.2.0/README.md → biblicus-0.3.0/src/biblicus.egg-info/PKG-INFO +97 -15
  93. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/SOURCES.txt +44 -4
  94. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/requires.txt +12 -0
  95. biblicus-0.2.0/docs/EXTRACTION.md +0 -86
  96. biblicus-0.2.0/docs/NEXT_STEPS.md +0 -309
  97. biblicus-0.2.0/docs/TESTING.md +0 -29
  98. biblicus-0.2.0/docs/conf.py +0 -31
  99. biblicus-0.2.0/features/extractor_pipeline.feature +0 -114
  100. biblicus-0.2.0/features/steps/extraction_steps.py +0 -238
  101. biblicus-0.2.0/features/steps/extractor_steps.py +0 -54
  102. biblicus-0.2.0/src/biblicus/extraction.py +0 -330
  103. biblicus-0.2.0/src/biblicus/extractors/cascade.py +0 -101
  104. {biblicus-0.2.0 → biblicus-0.3.0}/LICENSE +0 -0
  105. {biblicus-0.2.0 → biblicus-0.3.0}/datasets/wikipedia_mini.json +0 -0
  106. {biblicus-0.2.0 → biblicus-0.3.0}/docs/ARCHITECTURE.md +0 -0
  107. {biblicus-0.2.0 → biblicus-0.3.0}/docs/BACKENDS.md +0 -0
  108. {biblicus-0.2.0 → biblicus-0.3.0}/docs/CORPUS.md +0 -0
  109. {biblicus-0.2.0 → biblicus-0.3.0}/docs/api.rst +0 -0
  110. {biblicus-0.2.0 → biblicus-0.3.0}/features/backend_validation.feature +0 -0
  111. {biblicus-0.2.0 → biblicus-0.3.0}/features/biblicus_corpus.feature +0 -0
  112. {biblicus-0.2.0 → biblicus-0.3.0}/features/cli_entrypoint.feature +0 -0
  113. {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_edge_cases.feature +0 -0
  114. {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_identity.feature +0 -0
  115. {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_purge.feature +0 -0
  116. {biblicus-0.2.0 → biblicus-0.3.0}/features/error_cases.feature +0 -0
  117. {biblicus-0.2.0 → biblicus-0.3.0}/features/evaluation.feature +0 -0
  118. {biblicus-0.2.0 → biblicus-0.3.0}/features/extractor_validation.feature +0 -0
  119. {biblicus-0.2.0 → biblicus-0.3.0}/features/frontmatter.feature +0 -0
  120. {biblicus-0.2.0 → biblicus-0.3.0}/features/hook_config_validation.feature +0 -0
  121. {biblicus-0.2.0 → biblicus-0.3.0}/features/hook_error_handling.feature +0 -0
  122. {biblicus-0.2.0 → biblicus-0.3.0}/features/import_tree.feature +0 -0
  123. {biblicus-0.2.0 → biblicus-0.3.0}/features/ingest_sources.feature +0 -0
  124. {biblicus-0.2.0 → biblicus-0.3.0}/features/integration_pdf_samples.feature +0 -0
  125. {biblicus-0.2.0 → biblicus-0.3.0}/features/integration_wikipedia.feature +0 -0
  126. {biblicus-0.2.0 → biblicus-0.3.0}/features/lifecycle_hooks.feature +0 -0
  127. {biblicus-0.2.0 → biblicus-0.3.0}/features/model_validation.feature +0 -0
  128. {biblicus-0.2.0 → biblicus-0.3.0}/features/python_api.feature +0 -0
  129. {biblicus-0.2.0 → biblicus-0.3.0}/features/python_hook_logging.feature +0 -0
  130. {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_budget.feature +0 -0
  131. {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_scan.feature +0 -0
  132. {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  133. {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_utilities.feature +0 -0
  134. {biblicus-0.2.0 → biblicus-0.3.0}/features/source_loading.feature +0 -0
  135. {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/model_steps.py +0 -0
  136. {biblicus-0.2.0 → biblicus-0.3.0}/features/streaming_ingest.feature +0 -0
  137. {biblicus-0.2.0 → biblicus-0.3.0}/setup.cfg +0 -0
  138. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/__main__.py +0 -0
  139. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/constants.py +0 -0
  140. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  141. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  142. {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,5 +1,7 @@
1
1
  include README.md
2
2
  include LICENSE
3
+ include THIRD_PARTY_NOTICES.md
4
+ include .biblicus/config.example.yml
3
5
  include pyproject.toml
4
6
 
5
7
  recursive-include src *.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -8,20 +8,30 @@ Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: pydantic>=2.0
10
10
  Requires-Dist: PyYAML>=6.0
11
+ Requires-Dist: pypdf>=4.0
11
12
  Provides-Extra: dev
12
13
  Requires-Dist: behave>=1.2.6; extra == "dev"
13
14
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
14
15
  Requires-Dist: sphinx>=7.0; extra == "dev"
15
16
  Requires-Dist: myst-parser>=2.0; extra == "dev"
17
+ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
16
18
  Requires-Dist: ruff>=0.4.0; extra == "dev"
17
19
  Requires-Dist: black>=24.0; extra == "dev"
18
20
  Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
21
+ Provides-Extra: openai
22
+ Requires-Dist: openai>=1.0; extra == "openai"
23
+ Provides-Extra: unstructured
24
+ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
25
+ Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
26
+ Provides-Extra: ocr
27
+ Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
19
28
  Dynamic: license-file
20
29
 
21
30
  # Biblicus
22
31
 
23
32
  ![Continuous integration][continuous-integration-badge]
24
33
  ![Coverage][coverage-badge]
34
+ ![Documentation][documentation-badge]
25
35
 
26
36
  Make your documents usable by your assistant, then decide later how you will search and retrieve them.
27
37
 
@@ -31,28 +41,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
31
41
 
32
42
  This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
33
43
 
34
- It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
44
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
35
45
 
36
46
  See [retrieval augmented generation overview] for a short introduction to the idea.
37
47
 
38
- ## The framework
48
+ ## A beginner friendly mental model
39
49
 
40
- The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
50
+ Think in three stages.
51
+
52
+ - Ingest puts raw items into a corpus. This is file first and human inspectable.
53
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
54
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
55
+
56
+ If you learn a few project words, the rest of the system becomes predictable.
41
57
 
42
58
  - Corpus is the folder that holds raw items and their metadata.
43
- - Item is the raw bytes of a document or other artifact, plus its source.
59
+ - Item is the raw bytes plus optional metadata and source information.
44
60
  - Catalog is the rebuildable index of the corpus.
45
- - Evidence is what retrieval returns, ready to be turned into context for a large language model.
46
- - Run is a recorded retrieval build for a corpus.
61
+ - Extraction run is a recorded extraction build that produces text artifacts.
47
62
  - Backend is a pluggable retrieval implementation.
48
- - Recipe is a named configuration for a backend.
49
- - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
63
+ - Run is a recorded retrieval build for a corpus.
64
+ - Evidence is what retrieval returns, with identifiers and source information.
50
65
 
51
66
  ## Diagram
52
67
 
53
68
  This diagram shows how a corpus becomes evidence for an assistant.
54
- The legend shows what the border styles and fill styles mean.
55
- The your code region is where you decide how to turn evidence into context and how to call a model.
69
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
70
+ The legend shows what the block styles mean.
71
+ Your code is where you decide how to turn evidence into context and how to call a model.
56
72
 
57
73
  ```mermaid
58
74
  %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -61,7 +77,10 @@ flowchart LR
61
77
  direction LR
62
78
  LegendArtifact[Stored artifact or evidence]
63
79
  LegendStep[Step]
80
+ LegendStable[Stable region]
81
+ LegendPluggable[Pluggable region]
64
82
  LegendArtifact --- LegendStep
83
+ LegendStable --- LegendPluggable
65
84
  end
66
85
 
67
86
  subgraph Main[" "]
@@ -74,12 +93,19 @@ flowchart LR
74
93
  Raw --> Catalog[Catalog file]
75
94
  end
76
95
 
96
+ subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
97
+ direction TB
98
+ Catalog --> Extract[Extract pipeline]
99
+ Extract --> ExtractedText[Extracted text artifacts]
100
+ ExtractedText --> ExtractionRun[Extraction run manifest]
101
+ end
102
+
77
103
  subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
78
104
  direction LR
79
105
 
80
106
  subgraph BackendIngestionIndexing[Ingestion and indexing]
81
107
  direction TB
82
- Catalog --> Build[Build run]
108
+ ExtractionRun --> Build[Build run]
83
109
  Build --> BackendIndex[Backend index]
84
110
  BackendIndex --> Run[Run manifest]
85
111
  end
@@ -100,6 +126,7 @@ flowchart LR
100
126
  end
101
127
 
102
128
  style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
129
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
103
130
  style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
104
131
  style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
105
132
  style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
@@ -107,6 +134,8 @@ flowchart LR
107
134
 
108
135
  style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
136
  style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
137
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
138
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
139
  style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
140
  style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
141
  style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
@@ -115,6 +144,7 @@ flowchart LR
115
144
  style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
145
 
117
146
  style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
147
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
118
148
  style Build fill:#eceff1,stroke:#90a4ae,color:#111111
119
149
  style Query fill:#eceff1,stroke:#90a4ae,color:#111111
120
150
  style Model fill:#eceff1,stroke:#90a4ae,color:#111111
@@ -124,6 +154,8 @@ flowchart LR
124
154
  style Main fill:#ffffff,stroke:#ffffff,color:#111111
125
155
  style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
126
156
  style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
157
+ style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
158
+ style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
127
159
  ```
128
160
 
129
161
  ## Practical value
@@ -136,6 +168,7 @@ flowchart LR
136
168
 
137
169
  - Initialize a corpus folder.
138
170
  - Ingest items from file paths, web addresses, or text input.
171
+ - Run extraction when you want derived text artifacts from non-text sources.
139
172
  - Reindex to refresh the catalog after edits.
140
173
  - Build a retrieval run with a backend.
141
174
  - Query the run to collect evidence and evaluate it with datasets.
@@ -154,13 +187,25 @@ After the first release, you can install it from Python Package Index.
154
187
  python3 -m pip install biblicus
155
188
  ```
156
189
 
190
+ ### Optional extras
191
+
192
+ Some extractors are optional so the base install stays small.
193
+
194
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
195
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
196
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
197
+
157
198
  ## Quick start
158
199
 
159
200
  ```
201
+ mkdir -p notes
202
+ echo "A small file note" > notes/example.txt
203
+
160
204
  biblicus init corpora/example
161
205
  biblicus ingest --corpus corpora/example notes/example.txt
162
206
  echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
163
207
  biblicus list --corpus corpora/example
208
+ biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
164
209
  biblicus build --corpus corpora/example --backend scan
165
210
  biblicus query --corpus corpora/example --query "note"
166
211
  ```
@@ -188,13 +233,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
188
233
 
189
234
  ## Learn more
190
235
 
236
+ Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
237
+
191
238
  The documents below are written to be read in order.
192
239
 
193
240
  - [Architecture][architecture]
241
+ - [Roadmap][roadmap]
242
+ - [Feature index][feature-index]
194
243
  - [Corpus][corpus]
195
244
  - [Text extraction][text-extraction]
245
+ - [User configuration][user-configuration]
196
246
  - [Backends][backends]
197
- - [Next steps][next-steps]
247
+ - [Demos][demos]
198
248
  - [Testing][testing]
199
249
 
200
250
  ## Metadata and catalog
@@ -252,10 +302,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
252
302
 
253
303
  ## Documentation
254
304
 
255
- Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
305
+ Reference documentation is generated from Sphinx style docstrings.
306
+
307
+ Install development dependencies:
308
+
309
+ ```
310
+ python3 -m pip install -e ".[dev]"
311
+ ```
312
+
313
+ Build the documentation:
256
314
 
257
315
  ```
258
- sphinx-build -b html docs docs/_build
316
+ python3 -m sphinx -b html docs docs/_build
259
317
  ```
260
318
 
261
319
  ## License
@@ -264,11 +322,15 @@ License terms are in `LICENSE`.
264
322
 
265
323
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
266
324
  [architecture]: docs/ARCHITECTURE.md
325
+ [roadmap]: docs/ROADMAP.md
326
+ [feature-index]: docs/FEATURE_INDEX.md
267
327
  [corpus]: docs/CORPUS.md
268
328
  [text-extraction]: docs/EXTRACTION.md
329
+ [user-configuration]: docs/USER_CONFIGURATION.md
269
330
  [backends]: docs/BACKENDS.md
270
- [next-steps]: docs/NEXT_STEPS.md
331
+ [demos]: docs/DEMOS.md
271
332
  [testing]: docs/TESTING.md
272
333
 
273
334
  [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
274
335
  [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
336
+ [documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
@@ -1,27 +1,8 @@
1
- Metadata-Version: 2.4
2
- Name: biblicus
3
- Version: 0.2.0
4
- Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
- License: MIT
6
- Requires-Python: >=3.9
7
- Description-Content-Type: text/markdown
8
- License-File: LICENSE
9
- Requires-Dist: pydantic>=2.0
10
- Requires-Dist: PyYAML>=6.0
11
- Provides-Extra: dev
12
- Requires-Dist: behave>=1.2.6; extra == "dev"
13
- Requires-Dist: coverage[toml]>=7.0; extra == "dev"
14
- Requires-Dist: sphinx>=7.0; extra == "dev"
15
- Requires-Dist: myst-parser>=2.0; extra == "dev"
16
- Requires-Dist: ruff>=0.4.0; extra == "dev"
17
- Requires-Dist: black>=24.0; extra == "dev"
18
- Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
19
- Dynamic: license-file
20
-
21
1
  # Biblicus
22
2
 
23
3
  ![Continuous integration][continuous-integration-badge]
24
4
  ![Coverage][coverage-badge]
5
+ ![Documentation][documentation-badge]
25
6
 
26
7
  Make your documents usable by your assistant, then decide later how you will search and retrieve them.
27
8
 
@@ -31,28 +12,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
31
12
 
32
13
  This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
33
14
 
34
- It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
15
+ It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
35
16
 
36
17
  See [retrieval augmented generation overview] for a short introduction to the idea.
37
18
 
38
- ## The framework
19
+ ## A beginner friendly mental model
20
+
21
+ Think in three stages.
22
+
23
+ - Ingest puts raw items into a corpus. This is file first and human inspectable.
24
+ - Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
25
+ - Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
39
26
 
40
- The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
27
+ If you learn a few project words, the rest of the system becomes predictable.
41
28
 
42
29
  - Corpus is the folder that holds raw items and their metadata.
43
- - Item is the raw bytes of a document or other artifact, plus its source.
30
+ - Item is the raw bytes plus optional metadata and source information.
44
31
  - Catalog is the rebuildable index of the corpus.
45
- - Evidence is what retrieval returns, ready to be turned into context for a large language model.
46
- - Run is a recorded retrieval build for a corpus.
32
+ - Extraction run is a recorded extraction build that produces text artifacts.
47
33
  - Backend is a pluggable retrieval implementation.
48
- - Recipe is a named configuration for a backend.
49
- - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
34
+ - Run is a recorded retrieval build for a corpus.
35
+ - Evidence is what retrieval returns, with identifiers and source information.
50
36
 
51
37
  ## Diagram
52
38
 
53
39
  This diagram shows how a corpus becomes evidence for an assistant.
54
- The legend shows what the border styles and fill styles mean.
55
- The your code region is where you decide how to turn evidence into context and how to call a model.
40
+ Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
41
+ The legend shows what the block styles mean.
42
+ Your code is where you decide how to turn evidence into context and how to call a model.
56
43
 
57
44
  ```mermaid
58
45
  %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -61,7 +48,10 @@ flowchart LR
61
48
  direction LR
62
49
  LegendArtifact[Stored artifact or evidence]
63
50
  LegendStep[Step]
51
+ LegendStable[Stable region]
52
+ LegendPluggable[Pluggable region]
64
53
  LegendArtifact --- LegendStep
54
+ LegendStable --- LegendPluggable
65
55
  end
66
56
 
67
57
  subgraph Main[" "]
@@ -74,12 +64,19 @@ flowchart LR
74
64
  Raw --> Catalog[Catalog file]
75
65
  end
76
66
 
67
+ subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
68
+ direction TB
69
+ Catalog --> Extract[Extract pipeline]
70
+ Extract --> ExtractedText[Extracted text artifacts]
71
+ ExtractedText --> ExtractionRun[Extraction run manifest]
72
+ end
73
+
77
74
  subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
78
75
  direction LR
79
76
 
80
77
  subgraph BackendIngestionIndexing[Ingestion and indexing]
81
78
  direction TB
82
- Catalog --> Build[Build run]
79
+ ExtractionRun --> Build[Build run]
83
80
  Build --> BackendIndex[Backend index]
84
81
  BackendIndex --> Run[Run manifest]
85
82
  end
@@ -100,6 +97,7 @@ flowchart LR
100
97
  end
101
98
 
102
99
  style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
100
+ style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
103
101
  style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
104
102
  style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
105
103
  style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
@@ -107,6 +105,8 @@ flowchart LR
107
105
 
108
106
  style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
107
  style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
108
+ style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
+ style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
110
  style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
111
  style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
112
  style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
@@ -115,6 +115,7 @@ flowchart LR
115
115
  style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
116
 
117
117
  style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
118
+ style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
118
119
  style Build fill:#eceff1,stroke:#90a4ae,color:#111111
119
120
  style Query fill:#eceff1,stroke:#90a4ae,color:#111111
120
121
  style Model fill:#eceff1,stroke:#90a4ae,color:#111111
@@ -124,6 +125,8 @@ flowchart LR
124
125
  style Main fill:#ffffff,stroke:#ffffff,color:#111111
125
126
  style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
126
127
  style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
128
+ style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
129
+ style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
127
130
  ```
128
131
 
129
132
  ## Practical value
@@ -136,6 +139,7 @@ flowchart LR
136
139
 
137
140
  - Initialize a corpus folder.
138
141
  - Ingest items from file paths, web addresses, or text input.
142
+ - Run extraction when you want derived text artifacts from non-text sources.
139
143
  - Reindex to refresh the catalog after edits.
140
144
  - Build a retrieval run with a backend.
141
145
  - Query the run to collect evidence and evaluate it with datasets.
@@ -154,13 +158,25 @@ After the first release, you can install it from Python Package Index.
154
158
  python3 -m pip install biblicus
155
159
  ```
156
160
 
161
+ ### Optional extras
162
+
163
+ Some extractors are optional so the base install stays small.
164
+
165
+ - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
166
+ - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
167
+ - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
168
+
157
169
  ## Quick start
158
170
 
159
171
  ```
172
+ mkdir -p notes
173
+ echo "A small file note" > notes/example.txt
174
+
160
175
  biblicus init corpora/example
161
176
  biblicus ingest --corpus corpora/example notes/example.txt
162
177
  echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
163
178
  biblicus list --corpus corpora/example
179
+ biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
164
180
  biblicus build --corpus corpora/example --backend scan
165
181
  biblicus query --corpus corpora/example --query "note"
166
182
  ```
@@ -188,13 +204,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
188
204
 
189
205
  ## Learn more
190
206
 
207
+ Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
208
+
191
209
  The documents below are written to be read in order.
192
210
 
193
211
  - [Architecture][architecture]
212
+ - [Roadmap][roadmap]
213
+ - [Feature index][feature-index]
194
214
  - [Corpus][corpus]
195
215
  - [Text extraction][text-extraction]
216
+ - [User configuration][user-configuration]
196
217
  - [Backends][backends]
197
- - [Next steps][next-steps]
218
+ - [Demos][demos]
198
219
  - [Testing][testing]
199
220
 
200
221
  ## Metadata and catalog
@@ -252,10 +273,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
252
273
 
253
274
  ## Documentation
254
275
 
255
- Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
276
+ Reference documentation is generated from Sphinx-style docstrings.
277
+
278
+ Install development dependencies:
279
+
280
+ ```
281
+ python3 -m pip install -e ".[dev]"
282
+ ```
283
+
284
+ Build the documentation:
256
285
 
257
286
  ```
258
- sphinx-build -b html docs docs/_build
287
+ python3 -m sphinx -b html docs docs/_build
259
288
  ```
260
289
 
261
290
  ## License
@@ -264,11 +293,15 @@ License terms are in `LICENSE`.
264
293
 
265
294
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
266
295
  [architecture]: docs/ARCHITECTURE.md
296
+ [roadmap]: docs/ROADMAP.md
297
+ [feature-index]: docs/FEATURE_INDEX.md
267
298
  [corpus]: docs/CORPUS.md
268
299
  [text-extraction]: docs/EXTRACTION.md
300
+ [user-configuration]: docs/USER_CONFIGURATION.md
269
301
  [backends]: docs/BACKENDS.md
270
- [next-steps]: docs/NEXT_STEPS.md
302
+ [demos]: docs/DEMOS.md
271
303
  [testing]: docs/TESTING.md
272
304
 
273
305
  [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
274
306
  [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
307
+ [documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
@@ -0,0 +1,36 @@
1
+ # Third-party notices
2
+
3
+ This project includes vendored third-party source code.
4
+
5
+ ## dotyaml
6
+
7
+ Portions of this repository include code vendored from the `dotyaml` project.
8
+
9
+ - Project: `dotyaml`
10
+ - Source: `../dotyaml` (vendored into `src/biblicus/_vendor/dotyaml/`)
11
+ - License: MIT
12
+
13
+ ```
14
+ MIT License
15
+
16
+ Copyright (c) 2025 yamlenv
17
+
18
+ Permission is hereby granted, free of charge, to any person obtaining a copy
19
+ of this software and associated documentation files (the "Software"), to deal
20
+ in the Software without restriction, including without limitation the rights
21
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
22
+ copies of the Software, and to permit persons to whom the Software is
23
+ furnished to do so, subject to the following conditions:
24
+
25
+ The above copyright notice and this permission notice shall be included in all
26
+ copies or substantial portions of the Software.
27
+
28
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34
+ SOFTWARE.
35
+ ```
36
+
@@ -1,13 +1,9 @@
1
- # Corpus workflows and lifecycle hooks
1
+ # Corpus design
2
2
 
3
- This document records the design decisions and outcomes for corpus management and lifecycle hooks in version zero. It is written in a decision format because the long term shape of the library is determined by corpus workflows more than by any particular retrieval backend.
3
+ This document records design decisions and outcomes for corpus management and lifecycle hooks in version zero.
4
4
 
5
5
  The goal is to make corpus management practical for day-to-day use, while keeping the raw corpus durable and readable as ordinary files on disk.
6
6
 
7
- ## Initiative constraints
8
-
9
- The project uses strict behavior driven development. Behavior specifications in `features/*.feature` are the authoritative definition of system behavior.
10
-
11
7
  ## What exists today
12
8
 
13
9
  The project already supports: