biblicus 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.1.1/src/biblicus.egg-info → biblicus-0.2.0}/PKG-INFO +101 -1
- {biblicus-0.1.1 → biblicus-0.2.0}/README.md +100 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/docs/BACKENDS.md +1 -0
- biblicus-0.2.0/docs/CORPUS.md +103 -0
- biblicus-0.2.0/docs/CORPUS_WORKFLOWS.md +408 -0
- biblicus-0.2.0/docs/EXTRACTION.md +86 -0
- biblicus-0.2.0/docs/NEXT_STEPS.md +309 -0
- biblicus-0.2.0/docs/TESTING.md +29 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/docs/index.rst +2 -0
- biblicus-0.2.0/features/content_sniffing.feature +63 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/environment.py +0 -3
- biblicus-0.2.0/features/extractor_pipeline.feature +114 -0
- biblicus-0.2.0/features/extractor_validation.feature +7 -0
- biblicus-0.2.0/features/hook_config_validation.feature +28 -0
- biblicus-0.2.0/features/hook_error_handling.feature +15 -0
- biblicus-0.2.0/features/import_tree.feature +54 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/ingest_sources.feature +14 -0
- biblicus-0.2.0/features/integration_pdf_samples.feature +8 -0
- biblicus-0.2.0/features/lifecycle_hooks.feature +96 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/python_api.feature +17 -0
- biblicus-0.2.0/features/python_hook_logging.feature +10 -0
- biblicus-0.2.0/features/retrieval_uses_extraction_run.feature +93 -0
- biblicus-0.2.0/features/source_loading.feature +9 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/cli_steps.py +199 -0
- biblicus-0.2.0/features/steps/extraction_steps.py +238 -0
- biblicus-0.2.0/features/steps/extractor_steps.py +54 -0
- biblicus-0.2.0/features/steps/python_api_steps.py +414 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/retrieval_steps.py +14 -0
- biblicus-0.2.0/features/streaming_ingest.feature +11 -0
- biblicus-0.2.0/features/text_extraction_runs.feature +85 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/pyproject.toml +1 -1
- biblicus-0.2.0/scripts/download_pdf_samples.py +133 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/scripts/test.py +14 -1
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/__init__.py +1 -1
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/backends/scan.py +81 -4
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/backends/sqlite_full_text_search.py +63 -2
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/cli.py +123 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/constants.py +2 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/corpus.py +431 -2
- biblicus-0.2.0/src/biblicus/extraction.py +330 -0
- biblicus-0.2.0/src/biblicus/extractors/__init__.py +33 -0
- biblicus-0.2.0/src/biblicus/extractors/base.py +61 -0
- biblicus-0.2.0/src/biblicus/extractors/cascade.py +101 -0
- biblicus-0.2.0/src/biblicus/extractors/metadata_text.py +98 -0
- biblicus-0.2.0/src/biblicus/extractors/pass_through_text.py +74 -0
- biblicus-0.2.0/src/biblicus/hook_logging.py +185 -0
- biblicus-0.2.0/src/biblicus/hook_manager.py +205 -0
- biblicus-0.2.0/src/biblicus/hooks.py +265 -0
- biblicus-0.2.0/src/biblicus/ignore.py +67 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/models.py +20 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/sources.py +45 -0
- {biblicus-0.1.1 → biblicus-0.2.0/src/biblicus.egg-info}/PKG-INFO +101 -1
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus.egg-info/SOURCES.txt +32 -1
- biblicus-0.1.1/features/steps/python_api_steps.py +0 -196
- {biblicus-0.1.1 → biblicus-0.2.0}/LICENSE +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/MANIFEST.in +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/docs/api.rst +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/docs/conf.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/backend_validation.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/error_cases.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/evaluation.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/frontmatter.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/model_validation.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/scripts/download_wikipedia.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/setup.cfg +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/time.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus.egg-info/requires.txt +0 -0
- {biblicus-0.1.1 → biblicus-0.2.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -20,6 +20,9 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
# Biblicus
|
|
22
22
|
|
|
23
|
+
![Continuous integration][continuous-integration-badge]
|
|
24
|
+
![Coverage][coverage-badge]
|
|
25
|
+
|
|
23
26
|
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
24
27
|
|
|
25
28
|
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
@@ -45,6 +48,84 @@ The framework is a small, explicit vocabulary that appears in code, specificatio
|
|
|
45
48
|
- Recipe is a named configuration for a backend.
|
|
46
49
|
- Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
|
|
47
50
|
|
|
51
|
+
## Diagram
|
|
52
|
+
|
|
53
|
+
This diagram shows how a corpus becomes evidence for an assistant.
|
|
54
|
+
The legend shows what the border styles and fill styles mean.
|
|
55
|
+
The "Your code" region is where you decide how to turn evidence into context and how to call a model.
|
|
56
|
+
|
|
57
|
+
```mermaid
|
|
58
|
+
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
59
|
+
flowchart LR
|
|
60
|
+
subgraph Legend[Legend]
|
|
61
|
+
direction LR
|
|
62
|
+
LegendArtifact[Stored artifact or evidence]
|
|
63
|
+
LegendStep[Step]
|
|
64
|
+
LegendArtifact --- LegendStep
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
subgraph Main[" "]
|
|
68
|
+
direction TB
|
|
69
|
+
|
|
70
|
+
subgraph StableCore[Stable core]
|
|
71
|
+
direction TB
|
|
72
|
+
Source[Source items] --> Ingest[Ingest]
|
|
73
|
+
Ingest --> Raw[Raw item files]
|
|
74
|
+
Raw --> Catalog[Catalog file]
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
|
|
78
|
+
direction LR
|
|
79
|
+
|
|
80
|
+
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
81
|
+
direction TB
|
|
82
|
+
Catalog --> Build[Build run]
|
|
83
|
+
Build --> BackendIndex[Backend index]
|
|
84
|
+
BackendIndex --> Run[Run manifest]
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
subgraph BackendRetrievalGeneration[Retrieval and generation]
|
|
88
|
+
direction TB
|
|
89
|
+
Run --> Query[Query]
|
|
90
|
+
Query --> Evidence[Evidence]
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
Evidence --> Context
|
|
95
|
+
|
|
96
|
+
subgraph YourCode[Your code]
|
|
97
|
+
direction TB
|
|
98
|
+
Context[Assistant context] --> Model[Large language model call]
|
|
99
|
+
Model --> Answer[Answer]
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
103
|
+
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
104
|
+
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
105
|
+
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
106
|
+
style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
107
|
+
|
|
108
|
+
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
109
|
+
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
110
|
+
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
111
|
+
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
112
|
+
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
113
|
+
style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
114
|
+
style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
115
|
+
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
116
|
+
|
|
117
|
+
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
118
|
+
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
119
|
+
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
120
|
+
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
style Legend fill:#ffffff,stroke:#ffffff,color:#111111
|
|
124
|
+
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
125
|
+
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
126
|
+
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
127
|
+
```
|
|
128
|
+
|
|
48
129
|
## Practical value
|
|
49
130
|
|
|
50
131
|
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
@@ -110,7 +191,11 @@ In an assistant system, retrieval usually produces context for a model call. Thi
|
|
|
110
191
|
The documents below are written to be read in order.
|
|
111
192
|
|
|
112
193
|
- [Architecture][architecture]
|
|
194
|
+
- [Corpus][corpus]
|
|
195
|
+
- [Text extraction][text-extraction]
|
|
113
196
|
- [Backends][backends]
|
|
197
|
+
- [Next steps][next-steps]
|
|
198
|
+
- [Testing][testing]
|
|
114
199
|
|
|
115
200
|
## Metadata and catalog
|
|
116
201
|
|
|
@@ -143,12 +228,20 @@ Use `scripts/download_wikipedia.py` to download a small integration corpus from
|
|
|
143
228
|
|
|
144
229
|
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
145
230
|
|
|
231
|
+
Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
|
|
232
|
+
|
|
146
233
|
## Tests and coverage
|
|
147
234
|
|
|
148
235
|
```
|
|
149
236
|
python3 scripts/test.py
|
|
150
237
|
```
|
|
151
238
|
|
|
239
|
+
To include integration scenarios that download public test data at runtime, run this command.
|
|
240
|
+
|
|
241
|
+
```
|
|
242
|
+
python3 scripts/test.py --integration
|
|
243
|
+
```
|
|
244
|
+
|
|
152
245
|
## Releases
|
|
153
246
|
|
|
154
247
|
Releases are automated from the main branch using semantic versioning and conventional commit messages.
|
|
@@ -171,4 +264,11 @@ License terms are in `LICENSE`.
|
|
|
171
264
|
|
|
172
265
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
173
266
|
[architecture]: docs/ARCHITECTURE.md
|
|
267
|
+
[corpus]: docs/CORPUS.md
|
|
268
|
+
[text-extraction]: docs/EXTRACTION.md
|
|
174
269
|
[backends]: docs/BACKENDS.md
|
|
270
|
+
[next-steps]: docs/NEXT_STEPS.md
|
|
271
|
+
[testing]: docs/TESTING.md
|
|
272
|
+
|
|
273
|
+
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
274
|
+
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# Biblicus
|
|
2
2
|
|
|
3
|
+
![Continuous integration][continuous-integration-badge]
|
|
4
|
+
![Coverage][coverage-badge]
|
|
5
|
+
|
|
3
6
|
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
4
7
|
|
|
5
8
|
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
@@ -25,6 +28,84 @@ The framework is a small, explicit vocabulary that appears in code, specificatio
|
|
|
25
28
|
- Recipe is a named configuration for a backend.
|
|
26
29
|
- Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
|
|
27
30
|
|
|
31
|
+
## Diagram
|
|
32
|
+
|
|
33
|
+
This diagram shows how a corpus becomes evidence for an assistant.
|
|
34
|
+
The legend shows what the border styles and fill styles mean.
|
|
35
|
+
The "Your code" region is where you decide how to turn evidence into context and how to call a model.
|
|
36
|
+
|
|
37
|
+
```mermaid
|
|
38
|
+
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
39
|
+
flowchart LR
|
|
40
|
+
subgraph Legend[Legend]
|
|
41
|
+
direction LR
|
|
42
|
+
LegendArtifact[Stored artifact or evidence]
|
|
43
|
+
LegendStep[Step]
|
|
44
|
+
LegendArtifact --- LegendStep
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
subgraph Main[" "]
|
|
48
|
+
direction TB
|
|
49
|
+
|
|
50
|
+
subgraph StableCore[Stable core]
|
|
51
|
+
direction TB
|
|
52
|
+
Source[Source items] --> Ingest[Ingest]
|
|
53
|
+
Ingest --> Raw[Raw item files]
|
|
54
|
+
Raw --> Catalog[Catalog file]
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
|
|
58
|
+
direction LR
|
|
59
|
+
|
|
60
|
+
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
61
|
+
direction TB
|
|
62
|
+
Catalog --> Build[Build run]
|
|
63
|
+
Build --> BackendIndex[Backend index]
|
|
64
|
+
BackendIndex --> Run[Run manifest]
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
subgraph BackendRetrievalGeneration[Retrieval and generation]
|
|
68
|
+
direction TB
|
|
69
|
+
Run --> Query[Query]
|
|
70
|
+
Query --> Evidence[Evidence]
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
Evidence --> Context
|
|
75
|
+
|
|
76
|
+
subgraph YourCode[Your code]
|
|
77
|
+
direction TB
|
|
78
|
+
Context[Assistant context] --> Model[Large language model call]
|
|
79
|
+
Model --> Answer[Answer]
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
83
|
+
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
84
|
+
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
85
|
+
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
86
|
+
style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
87
|
+
|
|
88
|
+
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
89
|
+
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
90
|
+
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
91
|
+
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
92
|
+
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
93
|
+
style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
94
|
+
style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
95
|
+
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
96
|
+
|
|
97
|
+
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
98
|
+
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
99
|
+
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
100
|
+
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
style Legend fill:#ffffff,stroke:#ffffff,color:#111111
|
|
104
|
+
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
105
|
+
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
106
|
+
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
107
|
+
```
|
|
108
|
+
|
|
28
109
|
## Practical value
|
|
29
110
|
|
|
30
111
|
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
@@ -90,7 +171,11 @@ In an assistant system, retrieval usually produces context for a model call. Thi
|
|
|
90
171
|
The documents below are written to be read in order.
|
|
91
172
|
|
|
92
173
|
- [Architecture][architecture]
|
|
174
|
+
- [Corpus][corpus]
|
|
175
|
+
- [Text extraction][text-extraction]
|
|
93
176
|
- [Backends][backends]
|
|
177
|
+
- [Next steps][next-steps]
|
|
178
|
+
- [Testing][testing]
|
|
94
179
|
|
|
95
180
|
## Metadata and catalog
|
|
96
181
|
|
|
@@ -123,12 +208,20 @@ Use `scripts/download_wikipedia.py` to download a small integration corpus from
|
|
|
123
208
|
|
|
124
209
|
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
125
210
|
|
|
211
|
+
Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
|
|
212
|
+
|
|
126
213
|
## Tests and coverage
|
|
127
214
|
|
|
128
215
|
```
|
|
129
216
|
python3 scripts/test.py
|
|
130
217
|
```
|
|
131
218
|
|
|
219
|
+
To include integration scenarios that download public test data at runtime, run this command.
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
python3 scripts/test.py --integration
|
|
223
|
+
```
|
|
224
|
+
|
|
132
225
|
## Releases
|
|
133
226
|
|
|
134
227
|
Releases are automated from the main branch using semantic versioning and conventional commit messages.
|
|
@@ -151,4 +244,11 @@ License terms are in `LICENSE`.
|
|
|
151
244
|
|
|
152
245
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
153
246
|
[architecture]: docs/ARCHITECTURE.md
|
|
247
|
+
[corpus]: docs/CORPUS.md
|
|
248
|
+
[text-extraction]: docs/EXTRACTION.md
|
|
154
249
|
[backends]: docs/BACKENDS.md
|
|
250
|
+
[next-steps]: docs/NEXT_STEPS.md
|
|
251
|
+
[testing]: docs/TESTING.md
|
|
252
|
+
|
|
253
|
+
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
254
|
+
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
@@ -27,6 +27,7 @@ Backends implement two operations:
|
|
|
27
27
|
- Treat **runs** as immutable manifests with reproducible parameters.
|
|
28
28
|
- If your backend needs artifacts, store them under `.biblicus/runs/` and record paths in `artifact_paths`.
|
|
29
29
|
- Keep **text extraction** in explicit pipeline stages, not in backend ingestion.
|
|
30
|
+
See `docs/EXTRACTION.md` for how extraction runs are built and referenced from backend configs.
|
|
30
31
|
|
|
31
32
|
## Examples
|
|
32
33
|
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Corpus
|
|
2
|
+
|
|
3
|
+
A corpus is a normal folder on disk. It is the source of truth for your raw items.
|
|
4
|
+
|
|
5
|
+
The main goals are:
|
|
6
|
+
|
|
7
|
+
- You can ingest an item once and keep it as a file you can open and inspect.
|
|
8
|
+
- You can rebuild the catalog at any time.
|
|
9
|
+
- You can add derived artifacts later without changing the raw corpus.
|
|
10
|
+
|
|
11
|
+
## On disk layout
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
corpus/
|
|
15
|
+
raw/
|
|
16
|
+
<item files>
|
|
17
|
+
.biblicus/
|
|
18
|
+
config.json
|
|
19
|
+
catalog.json
|
|
20
|
+
runs/
|
|
21
|
+
<run manifests and artifacts>
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Ingest items
|
|
25
|
+
|
|
26
|
+
The simplest ingestion flows use the command line interface.
|
|
27
|
+
|
|
28
|
+
Create a corpus:
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
python3 -m biblicus init corpora/example
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Ingest a local file:
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
python3 -m biblicus ingest --corpus corpora/example path/to/file.pdf --tag paper
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Ingest a web address:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
python3 -m biblicus ingest --corpus corpora/example https://example.com --tag web
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Ingest a text note:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
python3 -m biblicus ingest --corpus corpora/example --note "Hello" --title "First note" --tag notes
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
List items:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
python3 -m biblicus list --corpus corpora/example
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Show an item:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
python3 -m biblicus show --corpus corpora/example ITEM_ID
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Metadata
|
|
65
|
+
|
|
66
|
+
Metadata is intentionally simple and file based.
|
|
67
|
+
|
|
68
|
+
For Markdown items, metadata lives in a YAML front matter block.
|
|
69
|
+
|
|
70
|
+
For non-Markdown items, metadata lives in a sidecar file with the suffix `.biblicus.yml`.
|
|
71
|
+
|
|
72
|
+
The raw file and its metadata file are meant to be opened, edited, and backed up with ordinary tools.
|
|
73
|
+
|
|
74
|
+
## Ignore rules
|
|
75
|
+
|
|
76
|
+
If you are importing a folder tree, ignore rules can prevent accidental ingestion of build artifacts, caches, or other irrelevant files.
|
|
77
|
+
|
|
78
|
+
Create a `.biblicusignore` file in the corpus root and add ignore patterns.
|
|
79
|
+
|
|
80
|
+
## Import a folder tree
|
|
81
|
+
|
|
82
|
+
To ingest an existing folder tree into a corpus while preserving relative paths, use the import command.
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
python3 -m biblicus import-tree --corpus corpora/example /path/to/folder/tree --tag imported
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Reindex
|
|
89
|
+
|
|
90
|
+
The catalog is rebuildable. If you edit files or sidecar metadata, refresh the catalog.
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
python3 -m biblicus reindex --corpus corpora/example
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Purge
|
|
97
|
+
|
|
98
|
+
Purging deletes all items and derived artifacts under the corpus. It requires you to pass the corpus name with `--confirm` as confirmation.
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
python3 -m biblicus purge --corpus corpora/example --confirm example
|
|
102
|
+
```
|
|
103
|
+
|