biblicus 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.2.0 → biblicus-0.3.0}/MANIFEST.in +2 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/PKG-INFO +78 -16
- biblicus-0.2.0/src/biblicus.egg-info/PKG-INFO → biblicus-0.3.0/README.md +68 -35
- biblicus-0.3.0/THIRD_PARTY_NOTICES.md +36 -0
- biblicus-0.2.0/docs/CORPUS_WORKFLOWS.md → biblicus-0.3.0/docs/CORPUS_DESIGN.md +2 -6
- biblicus-0.3.0/docs/DEMOS.md +334 -0
- biblicus-0.3.0/docs/EXTRACTION.md +186 -0
- biblicus-0.3.0/docs/FEATURE_INDEX.md +228 -0
- biblicus-0.3.0/docs/ROADMAP.md +174 -0
- biblicus-0.3.0/docs/TESTING.md +53 -0
- biblicus-0.3.0/docs/USER_CONFIGURATION.md +36 -0
- biblicus-0.3.0/docs/conf.py +56 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/docs/index.rst +8 -2
- {biblicus-0.2.0 → biblicus-0.3.0}/features/cli_parsing.feature +5 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/content_sniffing.feature +48 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/environment.py +71 -0
- biblicus-0.3.0/features/extraction_error_handling.feature +32 -0
- biblicus-0.3.0/features/extraction_selection.feature +72 -0
- biblicus-0.3.0/features/extraction_selection_longest.feature +66 -0
- biblicus-0.3.0/features/extractor_pipeline.feature +105 -0
- biblicus-0.3.0/features/integration_audio_samples.feature +13 -0
- biblicus-0.3.0/features/integration_image_samples.feature +11 -0
- biblicus-0.3.0/features/integration_mixed_corpus.feature +15 -0
- biblicus-0.3.0/features/integration_mixed_extraction.feature +15 -0
- biblicus-0.3.0/features/integration_ocr_image_extraction.feature +11 -0
- biblicus-0.3.0/features/integration_pdf_retrieval.feature +20 -0
- biblicus-0.3.0/features/integration_unstructured_extraction.feature +11 -0
- biblicus-0.3.0/features/ocr_extractor.feature +61 -0
- biblicus-0.3.0/features/pdf_text_extraction.feature +41 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_uses_extraction_run.feature +17 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/backend_steps.py +3 -1
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/cli_parsing_steps.py +20 -1
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/cli_steps.py +67 -14
- biblicus-0.3.0/features/steps/extraction_steps.py +479 -0
- biblicus-0.3.0/features/steps/extractor_steps.py +97 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/frontmatter_steps.py +5 -2
- biblicus-0.3.0/features/steps/openai_steps.py +236 -0
- biblicus-0.3.0/features/steps/pdf_steps.py +115 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/python_api_steps.py +16 -14
- biblicus-0.3.0/features/steps/rapidocr_steps.py +145 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/retrieval_steps.py +68 -4
- biblicus-0.3.0/features/steps/stt_steps.py +93 -0
- biblicus-0.3.0/features/steps/unstructured_steps.py +143 -0
- biblicus-0.3.0/features/steps/user_config_steps.py +47 -0
- biblicus-0.3.0/features/stt_extractor.feature +139 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/text_extraction_runs.feature +6 -6
- biblicus-0.3.0/features/unstructured_extractor.feature +62 -0
- biblicus-0.3.0/features/user_config.feature +39 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/pyproject.toml +38 -3
- biblicus-0.3.0/scripts/download_audio_samples.py +200 -0
- biblicus-0.3.0/scripts/download_image_samples.py +180 -0
- biblicus-0.3.0/scripts/download_mixed_samples.py +239 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/scripts/download_pdf_samples.py +6 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/scripts/download_wikipedia.py +3 -1
- {biblicus-0.2.0 → biblicus-0.3.0}/scripts/test.py +24 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/__init__.py +2 -2
- biblicus-0.3.0/src/biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus-0.3.0/src/biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus-0.3.0/src/biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus-0.3.0/src/biblicus/_vendor/dotyaml/transformer.py +135 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/__init__.py +0 -2
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/base.py +3 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/scan.py +21 -15
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/backends/sqlite_full_text_search.py +14 -15
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/cli.py +33 -49
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/corpus.py +39 -58
- biblicus-0.3.0/src/biblicus/errors.py +15 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/evaluation.py +4 -8
- biblicus-0.3.0/src/biblicus/extraction.py +529 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/__init__.py +14 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/base.py +12 -5
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/metadata_text.py +13 -5
- biblicus-0.3.0/src/biblicus/extractors/openai_stt.py +180 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/extractors/pass_through_text.py +16 -6
- biblicus-0.3.0/src/biblicus/extractors/pdf_text.py +100 -0
- biblicus-0.3.0/src/biblicus/extractors/pipeline.py +105 -0
- biblicus-0.3.0/src/biblicus/extractors/rapidocr_text.py +129 -0
- biblicus-0.3.0/src/biblicus/extractors/select_longest_text.py +105 -0
- biblicus-0.3.0/src/biblicus/extractors/select_text.py +100 -0
- biblicus-0.3.0/src/biblicus/extractors/unstructured_text.py +100 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/frontmatter.py +0 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hook_logging.py +0 -5
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hook_manager.py +3 -5
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/hooks.py +3 -7
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/ignore.py +0 -3
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/models.py +87 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/retrieval.py +0 -4
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/sources.py +44 -9
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/time.py +0 -1
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/uris.py +3 -4
- biblicus-0.3.0/src/biblicus/user_config.py +138 -0
- biblicus-0.2.0/README.md → biblicus-0.3.0/src/biblicus.egg-info/PKG-INFO +97 -15
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/SOURCES.txt +44 -4
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/requires.txt +12 -0
- biblicus-0.2.0/docs/EXTRACTION.md +0 -86
- biblicus-0.2.0/docs/NEXT_STEPS.md +0 -309
- biblicus-0.2.0/docs/TESTING.md +0 -29
- biblicus-0.2.0/docs/conf.py +0 -31
- biblicus-0.2.0/features/extractor_pipeline.feature +0 -114
- biblicus-0.2.0/features/steps/extraction_steps.py +0 -238
- biblicus-0.2.0/features/steps/extractor_steps.py +0 -54
- biblicus-0.2.0/src/biblicus/extraction.py +0 -330
- biblicus-0.2.0/src/biblicus/extractors/cascade.py +0 -101
- {biblicus-0.2.0 → biblicus-0.3.0}/LICENSE +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/docs/BACKENDS.md +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/docs/CORPUS.md +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/docs/api.rst +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/backend_validation.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/error_cases.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/evaluation.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/frontmatter.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/import_tree.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/model_validation.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/python_api.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/source_loading.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/setup.cfg +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.2.0 → biblicus-0.3.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -8,20 +8,30 @@ Description-Content-Type: text/markdown
|
|
|
8
8
|
License-File: LICENSE
|
|
9
9
|
Requires-Dist: pydantic>=2.0
|
|
10
10
|
Requires-Dist: PyYAML>=6.0
|
|
11
|
+
Requires-Dist: pypdf>=4.0
|
|
11
12
|
Provides-Extra: dev
|
|
12
13
|
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
13
14
|
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
14
15
|
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
15
16
|
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
|
|
16
18
|
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
17
19
|
Requires-Dist: black>=24.0; extra == "dev"
|
|
18
20
|
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
23
|
+
Provides-Extra: unstructured
|
|
24
|
+
Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
25
|
+
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
|
+
Provides-Extra: ocr
|
|
27
|
+
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
19
28
|
Dynamic: license-file
|
|
20
29
|
|
|
21
30
|
# Biblicus
|
|
22
31
|
|
|
23
32
|
![Continuous integration][continuous-integration-badge]
|
|
24
33
|
![Coverage][coverage-badge]
|
|
34
|
+
![Documentation][documentation-badge]
|
|
25
35
|
|
|
26
36
|
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
27
37
|
|
|
@@ -31,28 +41,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
|
|
|
31
41
|
|
|
32
42
|
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
33
43
|
|
|
34
|
-
It
|
|
44
|
+
It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
|
|
35
45
|
|
|
36
46
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
37
47
|
|
|
38
|
-
##
|
|
48
|
+
## A beginner friendly mental model
|
|
39
49
|
|
|
40
|
-
|
|
50
|
+
Think in three stages.
|
|
51
|
+
|
|
52
|
+
- Ingest puts raw items into a corpus. This is file first and human inspectable.
|
|
53
|
+
- Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
|
|
54
|
+
- Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
|
|
55
|
+
|
|
56
|
+
If you learn a few project words, the rest of the system becomes predictable.
|
|
41
57
|
|
|
42
58
|
- Corpus is the folder that holds raw items and their metadata.
|
|
43
|
-
- Item is the raw bytes
|
|
59
|
+
- Item is the raw bytes plus optional metadata and source information.
|
|
44
60
|
- Catalog is the rebuildable index of the corpus.
|
|
45
|
-
-
|
|
46
|
-
- Run is a recorded retrieval build for a corpus.
|
|
61
|
+
- Extraction run is a recorded extraction build that produces text artifacts.
|
|
47
62
|
- Backend is a pluggable retrieval implementation.
|
|
48
|
-
-
|
|
49
|
-
-
|
|
63
|
+
- Run is a recorded retrieval build for a corpus.
|
|
64
|
+
- Evidence is what retrieval returns, with identifiers and source information.
|
|
50
65
|
|
|
51
66
|
## Diagram
|
|
52
67
|
|
|
53
68
|
This diagram shows how a corpus becomes evidence for an assistant.
|
|
54
|
-
|
|
55
|
-
The
|
|
69
|
+
Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
|
|
70
|
+
The legend shows what the block styles mean.
|
|
71
|
+
Your code is where you decide how to turn evidence into context and how to call a model.
|
|
56
72
|
|
|
57
73
|
```mermaid
|
|
58
74
|
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
@@ -61,7 +77,10 @@ flowchart LR
|
|
|
61
77
|
direction LR
|
|
62
78
|
LegendArtifact[Stored artifact or evidence]
|
|
63
79
|
LegendStep[Step]
|
|
80
|
+
LegendStable[Stable region]
|
|
81
|
+
LegendPluggable[Pluggable region]
|
|
64
82
|
LegendArtifact --- LegendStep
|
|
83
|
+
LegendStable --- LegendPluggable
|
|
65
84
|
end
|
|
66
85
|
|
|
67
86
|
subgraph Main[" "]
|
|
@@ -74,12 +93,19 @@ flowchart LR
|
|
|
74
93
|
Raw --> Catalog[Catalog file]
|
|
75
94
|
end
|
|
76
95
|
|
|
96
|
+
subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
|
|
97
|
+
direction TB
|
|
98
|
+
Catalog --> Extract[Extract pipeline]
|
|
99
|
+
Extract --> ExtractedText[Extracted text artifacts]
|
|
100
|
+
ExtractedText --> ExtractionRun[Extraction run manifest]
|
|
101
|
+
end
|
|
102
|
+
|
|
77
103
|
subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
|
|
78
104
|
direction LR
|
|
79
105
|
|
|
80
106
|
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
81
107
|
direction TB
|
|
82
|
-
|
|
108
|
+
ExtractionRun --> Build[Build run]
|
|
83
109
|
Build --> BackendIndex[Backend index]
|
|
84
110
|
BackendIndex --> Run[Run manifest]
|
|
85
111
|
end
|
|
@@ -100,6 +126,7 @@ flowchart LR
|
|
|
100
126
|
end
|
|
101
127
|
|
|
102
128
|
style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
129
|
+
style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
103
130
|
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
104
131
|
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
105
132
|
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
@@ -107,6 +134,8 @@ flowchart LR
|
|
|
107
134
|
|
|
108
135
|
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
109
136
|
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
137
|
+
style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
138
|
+
style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
110
139
|
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
111
140
|
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
112
141
|
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
@@ -115,6 +144,7 @@ flowchart LR
|
|
|
115
144
|
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
116
145
|
|
|
117
146
|
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
147
|
+
style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
118
148
|
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
119
149
|
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
120
150
|
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
@@ -124,6 +154,8 @@ flowchart LR
|
|
|
124
154
|
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
125
155
|
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
126
156
|
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
157
|
+
style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
158
|
+
style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
127
159
|
```
|
|
128
160
|
|
|
129
161
|
## Practical value
|
|
@@ -136,6 +168,7 @@ flowchart LR
|
|
|
136
168
|
|
|
137
169
|
- Initialize a corpus folder.
|
|
138
170
|
- Ingest items from file paths, web addresses, or text input.
|
|
171
|
+
- Run extraction when you want derived text artifacts from non-text sources.
|
|
139
172
|
- Reindex to refresh the catalog after edits.
|
|
140
173
|
- Build a retrieval run with a backend.
|
|
141
174
|
- Query the run to collect evidence and evaluate it with datasets.
|
|
@@ -154,13 +187,25 @@ After the first release, you can install it from Python Package Index.
|
|
|
154
187
|
python3 -m pip install biblicus
|
|
155
188
|
```
|
|
156
189
|
|
|
190
|
+
### Optional extras
|
|
191
|
+
|
|
192
|
+
Some extractors are optional so the base install stays small.
|
|
193
|
+
|
|
194
|
+
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
195
|
+
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
196
|
+
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
197
|
+
|
|
157
198
|
## Quick start
|
|
158
199
|
|
|
159
200
|
```
|
|
201
|
+
mkdir -p notes
|
|
202
|
+
echo "A small file note" > notes/example.txt
|
|
203
|
+
|
|
160
204
|
biblicus init corpora/example
|
|
161
205
|
biblicus ingest --corpus corpora/example notes/example.txt
|
|
162
206
|
echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
|
|
163
207
|
biblicus list --corpus corpora/example
|
|
208
|
+
biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
|
|
164
209
|
biblicus build --corpus corpora/example --backend scan
|
|
165
210
|
biblicus query --corpus corpora/example --query "note"
|
|
166
211
|
```
|
|
@@ -188,13 +233,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
|
|
|
188
233
|
|
|
189
234
|
## Learn more
|
|
190
235
|
|
|
236
|
+
Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
|
|
237
|
+
|
|
191
238
|
The documents below are written to be read in order.
|
|
192
239
|
|
|
193
240
|
- [Architecture][architecture]
|
|
241
|
+
- [Roadmap][roadmap]
|
|
242
|
+
- [Feature index][feature-index]
|
|
194
243
|
- [Corpus][corpus]
|
|
195
244
|
- [Text extraction][text-extraction]
|
|
245
|
+
- [User configuration][user-configuration]
|
|
196
246
|
- [Backends][backends]
|
|
197
|
-
- [
|
|
247
|
+
- [Demos][demos]
|
|
198
248
|
- [Testing][testing]
|
|
199
249
|
|
|
200
250
|
## Metadata and catalog
|
|
@@ -252,10 +302,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
|
|
|
252
302
|
|
|
253
303
|
## Documentation
|
|
254
304
|
|
|
255
|
-
Reference documentation is generated from Sphinx style docstrings.
|
|
305
|
+
Reference documentation is generated from Sphinx style docstrings.
|
|
306
|
+
|
|
307
|
+
Install development dependencies:
|
|
308
|
+
|
|
309
|
+
```
|
|
310
|
+
python3 -m pip install -e ".[dev]"
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Build the documentation:
|
|
256
314
|
|
|
257
315
|
```
|
|
258
|
-
|
|
316
|
+
python3 -m sphinx -b html docs docs/_build
|
|
259
317
|
```
|
|
260
318
|
|
|
261
319
|
## License
|
|
@@ -264,11 +322,15 @@ License terms are in `LICENSE`.
|
|
|
264
322
|
|
|
265
323
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
266
324
|
[architecture]: docs/ARCHITECTURE.md
|
|
325
|
+
[roadmap]: docs/ROADMAP.md
|
|
326
|
+
[feature-index]: docs/FEATURE_INDEX.md
|
|
267
327
|
[corpus]: docs/CORPUS.md
|
|
268
328
|
[text-extraction]: docs/EXTRACTION.md
|
|
329
|
+
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
269
330
|
[backends]: docs/BACKENDS.md
|
|
270
|
-
[
|
|
331
|
+
[demos]: docs/DEMOS.md
|
|
271
332
|
[testing]: docs/TESTING.md
|
|
272
333
|
|
|
273
334
|
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
274
335
|
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
336
|
+
[documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
|
|
@@ -1,27 +1,8 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: biblicus
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.9
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: pydantic>=2.0
|
|
10
|
-
Requires-Dist: PyYAML>=6.0
|
|
11
|
-
Provides-Extra: dev
|
|
12
|
-
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
13
|
-
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
14
|
-
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
15
|
-
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
16
|
-
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
17
|
-
Requires-Dist: black>=24.0; extra == "dev"
|
|
18
|
-
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
19
|
-
Dynamic: license-file
|
|
20
|
-
|
|
21
1
|
# Biblicus
|
|
22
2
|
|
|
23
3
|
![Continuous integration][continuous-integration-badge]
|
|
24
4
|
![Coverage][coverage-badge]
|
|
5
|
+
![Documentation][documentation-badge]
|
|
25
6
|
|
|
26
7
|
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
27
8
|
|
|
@@ -31,28 +12,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
|
|
|
31
12
|
|
|
32
13
|
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
33
14
|
|
|
34
|
-
It
|
|
15
|
+
It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
|
|
35
16
|
|
|
36
17
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
37
18
|
|
|
38
|
-
##
|
|
19
|
+
## A beginner friendly mental model
|
|
20
|
+
|
|
21
|
+
Think in three stages.
|
|
22
|
+
|
|
23
|
+
- Ingest puts raw items into a corpus. This is file first and human inspectable.
|
|
24
|
+
- Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
|
|
25
|
+
- Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
|
|
39
26
|
|
|
40
|
-
|
|
27
|
+
If you learn a few project words, the rest of the system becomes predictable.
|
|
41
28
|
|
|
42
29
|
- Corpus is the folder that holds raw items and their metadata.
|
|
43
|
-
- Item is the raw bytes
|
|
30
|
+
- Item is the raw bytes plus optional metadata and source information.
|
|
44
31
|
- Catalog is the rebuildable index of the corpus.
|
|
45
|
-
-
|
|
46
|
-
- Run is a recorded retrieval build for a corpus.
|
|
32
|
+
- Extraction run is a recorded extraction build that produces text artifacts.
|
|
47
33
|
- Backend is a pluggable retrieval implementation.
|
|
48
|
-
-
|
|
49
|
-
-
|
|
34
|
+
- Run is a recorded retrieval build for a corpus.
|
|
35
|
+
- Evidence is what retrieval returns, with identifiers and source information.
|
|
50
36
|
|
|
51
37
|
## Diagram
|
|
52
38
|
|
|
53
39
|
This diagram shows how a corpus becomes evidence for an assistant.
|
|
54
|
-
|
|
55
|
-
The
|
|
40
|
+
Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
|
|
41
|
+
The legend shows what the block styles mean.
|
|
42
|
+
Your code is where you decide how to turn evidence into context and how to call a model.
|
|
56
43
|
|
|
57
44
|
```mermaid
|
|
58
45
|
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
@@ -61,7 +48,10 @@ flowchart LR
|
|
|
61
48
|
direction LR
|
|
62
49
|
LegendArtifact[Stored artifact or evidence]
|
|
63
50
|
LegendStep[Step]
|
|
51
|
+
LegendStable[Stable region]
|
|
52
|
+
LegendPluggable[Pluggable region]
|
|
64
53
|
LegendArtifact --- LegendStep
|
|
54
|
+
LegendStable --- LegendPluggable
|
|
65
55
|
end
|
|
66
56
|
|
|
67
57
|
subgraph Main[" "]
|
|
@@ -74,12 +64,19 @@ flowchart LR
|
|
|
74
64
|
Raw --> Catalog[Catalog file]
|
|
75
65
|
end
|
|
76
66
|
|
|
67
|
+
subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
|
|
68
|
+
direction TB
|
|
69
|
+
Catalog --> Extract[Extract pipeline]
|
|
70
|
+
Extract --> ExtractedText[Extracted text artifacts]
|
|
71
|
+
ExtractedText --> ExtractionRun[Extraction run manifest]
|
|
72
|
+
end
|
|
73
|
+
|
|
77
74
|
subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
|
|
78
75
|
direction LR
|
|
79
76
|
|
|
80
77
|
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
81
78
|
direction TB
|
|
82
|
-
|
|
79
|
+
ExtractionRun --> Build[Build run]
|
|
83
80
|
Build --> BackendIndex[Backend index]
|
|
84
81
|
BackendIndex --> Run[Run manifest]
|
|
85
82
|
end
|
|
@@ -100,6 +97,7 @@ flowchart LR
|
|
|
100
97
|
end
|
|
101
98
|
|
|
102
99
|
style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
100
|
+
style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
103
101
|
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
104
102
|
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
105
103
|
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
@@ -107,6 +105,8 @@ flowchart LR
|
|
|
107
105
|
|
|
108
106
|
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
109
107
|
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
108
|
+
style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
109
|
+
style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
110
110
|
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
111
111
|
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
112
112
|
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
@@ -115,6 +115,7 @@ flowchart LR
|
|
|
115
115
|
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
116
116
|
|
|
117
117
|
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
118
|
+
style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
118
119
|
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
119
120
|
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
120
121
|
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
@@ -124,6 +125,8 @@ flowchart LR
|
|
|
124
125
|
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
125
126
|
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
126
127
|
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
128
|
+
style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
129
|
+
style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
127
130
|
```
|
|
128
131
|
|
|
129
132
|
## Practical value
|
|
@@ -136,6 +139,7 @@ flowchart LR
|
|
|
136
139
|
|
|
137
140
|
- Initialize a corpus folder.
|
|
138
141
|
- Ingest items from file paths, web addresses, or text input.
|
|
142
|
+
- Run extraction when you want derived text artifacts from non-text sources.
|
|
139
143
|
- Reindex to refresh the catalog after edits.
|
|
140
144
|
- Build a retrieval run with a backend.
|
|
141
145
|
- Query the run to collect evidence and evaluate it with datasets.
|
|
@@ -154,13 +158,25 @@ After the first release, you can install it from Python Package Index.
|
|
|
154
158
|
python3 -m pip install biblicus
|
|
155
159
|
```
|
|
156
160
|
|
|
161
|
+
### Optional extras
|
|
162
|
+
|
|
163
|
+
Some extractors are optional so the base install stays small.
|
|
164
|
+
|
|
165
|
+
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
166
|
+
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
167
|
+
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
168
|
+
|
|
157
169
|
## Quick start
|
|
158
170
|
|
|
159
171
|
```
|
|
172
|
+
mkdir -p notes
|
|
173
|
+
echo "A small file note" > notes/example.txt
|
|
174
|
+
|
|
160
175
|
biblicus init corpora/example
|
|
161
176
|
biblicus ingest --corpus corpora/example notes/example.txt
|
|
162
177
|
echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
|
|
163
178
|
biblicus list --corpus corpora/example
|
|
179
|
+
biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
|
|
164
180
|
biblicus build --corpus corpora/example --backend scan
|
|
165
181
|
biblicus query --corpus corpora/example --query "note"
|
|
166
182
|
```
|
|
@@ -188,13 +204,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
|
|
|
188
204
|
|
|
189
205
|
## Learn more
|
|
190
206
|
|
|
207
|
+
Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
|
|
208
|
+
|
|
191
209
|
The documents below are written to be read in order.
|
|
192
210
|
|
|
193
211
|
- [Architecture][architecture]
|
|
212
|
+
- [Roadmap][roadmap]
|
|
213
|
+
- [Feature index][feature-index]
|
|
194
214
|
- [Corpus][corpus]
|
|
195
215
|
- [Text extraction][text-extraction]
|
|
216
|
+
- [User configuration][user-configuration]
|
|
196
217
|
- [Backends][backends]
|
|
197
|
-
- [
|
|
218
|
+
- [Demos][demos]
|
|
198
219
|
- [Testing][testing]
|
|
199
220
|
|
|
200
221
|
## Metadata and catalog
|
|
@@ -252,10 +273,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
|
|
|
252
273
|
|
|
253
274
|
## Documentation
|
|
254
275
|
|
|
255
|
-
Reference documentation is generated from Sphinx style docstrings.
|
|
276
|
+
Reference documentation is generated from Sphinx-style docstrings.
|
|
277
|
+
|
|
278
|
+
Install development dependencies:
|
|
279
|
+
|
|
280
|
+
```
|
|
281
|
+
python3 -m pip install -e ".[dev]"
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
Build the documentation:
|
|
256
285
|
|
|
257
286
|
```
|
|
258
|
-
|
|
287
|
+
python3 -m sphinx -b html docs docs/_build
|
|
259
288
|
```
|
|
260
289
|
|
|
261
290
|
## License
|
|
@@ -264,11 +293,15 @@ License terms are in `LICENSE`.
|
|
|
264
293
|
|
|
265
294
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
266
295
|
[architecture]: docs/ARCHITECTURE.md
|
|
296
|
+
[roadmap]: docs/ROADMAP.md
|
|
297
|
+
[feature-index]: docs/FEATURE_INDEX.md
|
|
267
298
|
[corpus]: docs/CORPUS.md
|
|
268
299
|
[text-extraction]: docs/EXTRACTION.md
|
|
300
|
+
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
269
301
|
[backends]: docs/BACKENDS.md
|
|
270
|
-
[
|
|
302
|
+
[demos]: docs/DEMOS.md
|
|
271
303
|
[testing]: docs/TESTING.md
|
|
272
304
|
|
|
273
305
|
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
274
306
|
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
307
|
+
[documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Third-party notices
|
|
2
|
+
|
|
3
|
+
This project includes vendored third-party source code.
|
|
4
|
+
|
|
5
|
+
## dotyaml
|
|
6
|
+
|
|
7
|
+
Portions of this repository vendor code from the `dotyaml` project.
|
|
8
|
+
|
|
9
|
+
- Project: `dotyaml`
|
|
10
|
+
- Source: `../dotyaml` (vendored into `src/biblicus/_vendor/dotyaml/`)
|
|
11
|
+
- License: MIT
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
MIT License
|
|
15
|
+
|
|
16
|
+
Copyright (c) 2025 yamlenv
|
|
17
|
+
|
|
18
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
19
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
20
|
+
in the Software without restriction, including without limitation the rights
|
|
21
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
22
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
23
|
+
furnished to do so, subject to the following conditions:
|
|
24
|
+
|
|
25
|
+
The above copyright notice and this permission notice shall be included in all
|
|
26
|
+
copies or substantial portions of the Software.
|
|
27
|
+
|
|
28
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
29
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
30
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
31
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
32
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
33
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
34
|
+
SOFTWARE.
|
|
35
|
+
```
|
|
36
|
+
|
|
@@ -1,13 +1,9 @@
|
|
|
1
|
-
# Corpus
|
|
1
|
+
# Corpus design
|
|
2
2
|
|
|
3
|
-
This document records
|
|
3
|
+
This document records design decisions and outcomes for corpus management and lifecycle hooks in version zero.
|
|
4
4
|
|
|
5
5
|
The goal is to make corpus management practical for day-to-day use, while keeping the raw corpus durable and readable as ordinary files on disk.
|
|
6
6
|
|
|
7
|
-
## Initiative constraints
|
|
8
|
-
|
|
9
|
-
The project uses strict behavior driven development. Behavior specifications in `features/*.feature` are the authoritative definition of system behavior.
|
|
10
|
-
|
|
11
7
|
## What exists today
|
|
12
8
|
|
|
13
9
|
The project already supports:
|