proactive-librarian 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proactive_librarian-0.1.0/LICENSE +21 -0
- proactive_librarian-0.1.0/PKG-INFO +194 -0
- proactive_librarian-0.1.0/README.md +164 -0
- proactive_librarian-0.1.0/proactive_librarian/__init__.py +7 -0
- proactive_librarian-0.1.0/proactive_librarian/__main__.py +5 -0
- proactive_librarian-0.1.0/proactive_librarian/cli.py +107 -0
- proactive_librarian-0.1.0/proactive_librarian/config.py +143 -0
- proactive_librarian-0.1.0/proactive_librarian/ingest.py +303 -0
- proactive_librarian-0.1.0/proactive_librarian/query.py +160 -0
- proactive_librarian-0.1.0/proactive_librarian/taxonomy.py +82 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/PKG-INFO +194 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/SOURCES.txt +20 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/dependency_links.txt +1 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/entry_points.txt +2 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/requires.txt +9 -0
- proactive_librarian-0.1.0/proactive_librarian.egg-info/top_level.txt +1 -0
- proactive_librarian-0.1.0/pyproject.toml +54 -0
- proactive_librarian-0.1.0/setup.cfg +4 -0
- proactive_librarian-0.1.0/tests/test_config.py +79 -0
- proactive_librarian-0.1.0/tests/test_ingest.py +104 -0
- proactive_librarian-0.1.0/tests/test_query.py +76 -0
- proactive_librarian-0.1.0/tests/test_taxonomy.py +71 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 David Orban
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proactive-librarian
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Index a PDF library and answer queries with page-accurate citations. Backend-agnostic; ships with a QMD adapter.
|
|
5
|
+
Author: David Orban
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Source, https://github.com/davidorban/proactive-librarian
|
|
8
|
+
Keywords: pdf,search,rag,citation,qmd
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pypdf>=4.0
|
|
22
|
+
Requires-Dist: cryptography>=3.1
|
|
23
|
+
Requires-Dist: tqdm>=4.65
|
|
24
|
+
Requires-Dist: PyYAML>=6.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# proactive-librarian
|
|
32
|
+
|
|
33
|
+
[Python 3.9+](https://www.python.org/)
|
|
34
|
+
[License: MIT](LICENSE)
|
|
35
|
+
[Changelog](CHANGELOG.md)
|
|
36
|
+
|
|
37
|
+
> Turn a directory of PDFs into a searchable index and get back **page-accurate citations** like `Report.pdf:p.27`.
|
|
38
|
+
|
|
39
|
+
`proactive-librarian` indexes a large document library once, then answers queries with the exact file **and page number** — ready to paste into a memo, brief, paper, or deck. It is a sharp, single-purpose CLI tool: not a chatbot, not a RAG framework, not a vector database you have to operate. Backend-agnostic, with a [QMD](https://github.com/eatonphil/qmd) adapter shipped in the box.
|
|
40
|
+
|
|
41
|
+
It replaces the brutal friction of citing your own research at scale — folder grep is slow, filenames lie, and Ctrl+F across thousands of PDFs is unworkable.
|
|
42
|
+
|
|
43
|
+
## Features
|
|
44
|
+
|
|
45
|
+
- 📄 **Page-accurate citations** — results come back as `file.pdf:p.N`, the page number derived from the cache path, not parsed (and mis-parsed) from a snippet.
|
|
46
|
+
- 🔍 **Hybrid retrieval** — keyword + semantic search via the backend (QMD adapter included; bring your own).
|
|
47
|
+
- ♻️ **Deterministic, reproducible cache** — rebuildable from the PDFs at any time; safe to delete.
|
|
48
|
+
- 🧱 **Page-per-file layout** — pure page text with no metadata leakage into snippets (see [why](#why-page-per-file)).
|
|
49
|
+
- 🔐 **Robust ingest** — handles AES-encrypted PDFs, sanitises surrogate/4-byte characters, and logs every failure with a reason to `_failures.json`.
|
|
50
|
+
- 🗂️ **Optional taxonomy enforcement** — constrain the library to a known subject/geography scheme and fail loudly on stragglers.
|
|
51
|
+
- ⚡ **Incremental** — sha1 manifest means re-ingest only touches changed PDFs.
|
|
52
|
+
- ✅ **Tested** — 36+ pytest cases over hashing, path mapping, manifest persistence, atomic writes, failure logging, and taxonomy validation.
|
|
53
|
+
|
|
54
|
+
## Who it's for
|
|
55
|
+
|
|
56
|
+
Built originally for one person's research-citation workflow, but it solves a general problem — **anyone who maintains a large PDF/document library and needs to quote it with precision**:
|
|
57
|
+
|
|
58
|
+
- researchers and analysts citing a corpus of papers/reports,
|
|
59
|
+
- policy, legal, and compliance teams working from regulatory documents,
|
|
60
|
+
- technical writers and maintainers grounding docs in source material,
|
|
61
|
+
- AI agents/assistants that need **citation-grounded retrieval** they can shell out to (`librarian query "…"`).
|
|
62
|
+
|
|
63
|
+
## Install
|
|
64
|
+
|
|
65
|
+
Not yet on PyPI. Install from source:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install git+https://github.com/davidorban/proactive-librarian
|
|
69
|
+
# or, for local development:
|
|
70
|
+
git clone https://github.com/davidorban/proactive-librarian
|
|
71
|
+
cd proactive-librarian
|
|
72
|
+
python -m venv .venv && source .venv/bin/activate
|
|
73
|
+
pip install -e ".[dev]"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
You also need a search backend. The shipped adapter targets **QMD** — install it from https://github.com/eatonphil/qmd (or implement another backend; see [ADR-001](docs/adr/ADR-001-storage-substrate.md)).
|
|
77
|
+
|
|
78
|
+
## Quickstart
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# 1. Point at your PDF library
|
|
82
|
+
cat > proactive-librarian.yaml <<'EOF'
|
|
83
|
+
pdf_root: /path/to/your/pdfs
|
|
84
|
+
collection_name: research
|
|
85
|
+
EOF
|
|
86
|
+
|
|
87
|
+
# 2. Index (first run; ~30 min for ~1,400 PDFs, then incremental)
|
|
88
|
+
librarian ingest --full
|
|
89
|
+
|
|
90
|
+
# 3. Tell your backend about the new cache (one-time; see backend docs)
|
|
91
|
+
|
|
92
|
+
# 4. Query
|
|
93
|
+
librarian query "stablecoin regulation in the UAE"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Example output:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
1. Digital-Assets-Regulatory-Landscape.pdf:p.27 (0.81)
|
|
100
|
+
"...the payment-token services regulation establishes a licensing regime for fiat-backed..."
|
|
101
|
+
2. ADGM-Virtual-Asset-Framework-2025.pdf:p.12 (0.74)
|
|
102
|
+
"...issuers must maintain reserves on a 1:1 basis, audited monthly by an approved..."
|
|
103
|
+
3. Stablecoin-Market-Structure.pdf:p.4 (0.69)
|
|
104
|
+
"...redemption-at-par guarantees are the load-bearing assumption behind..."
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Paste the `file.pdf:p.N` straight into your draft.
|
|
108
|
+
|
|
109
|
+
## What it produces
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
<pdf_root>/.derived/
|
|
113
|
+
├── _manifest.json # sha1 cache for incremental re-ingest
|
|
114
|
+
├── _failures.json # last run's failed PDFs (with reasons)
|
|
115
|
+
└── <rel-pdf-stem-tree>/
|
|
116
|
+
├── _meta.json # per-PDF metadata sidecar (NOT indexed)
|
|
117
|
+
├── p0001.md # pure page text — no frontmatter
|
|
118
|
+
├── p0002.md
|
|
119
|
+
└── ...
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The cache lives next to the PDFs in `.derived/` (gitignored), is **reproducible** from the source PDFs, and is safe to delete and rebuild. The PDFs stay canonical; the markdown is a disposable index ([ADR-004](docs/adr/ADR-004-pdf-canonical-cache.md)).
|
|
123
|
+
|
|
124
|
+
## Why page-per-file?
|
|
125
|
+
|
|
126
|
+
Single-file-per-PDF chunking has two failure modes:
|
|
127
|
+
|
|
128
|
+
1. **Metadata leaks into snippets.** YAML frontmatter gets indexed with the content, so citation snippets end up containing `sha1: abc…` and `source_pdf: …`.
|
|
129
|
+
2. **Page numbers become unreliable.** When a chunk spans page boundaries, parsing `## Page N` markers collapses everything to `p.1` and citations lose precision.
|
|
130
|
+
|
|
131
|
+
Page-per-file fixes both: each markdown file holds only one page's text (nothing to leak), and the page number is a property of the path (`p0042.md` → page 42). The trade-off is a high file count (~1,377 PDFs × ~50 pages ≈ 67k files); modern filesystems (APFS, ext4) handle it fine. Full rationale in [ADR-002](docs/adr/ADR-002-page-level-granularity.md).
|
|
132
|
+
|
|
133
|
+
## Configuration
|
|
134
|
+
|
|
135
|
+
`proactive-librarian.yaml` in the working directory, or `--config /path/to/config.yaml`:
|
|
136
|
+
|
|
137
|
+
```yaml
|
|
138
|
+
# Required
|
|
139
|
+
pdf_root: /path/to/pdfs
|
|
140
|
+
|
|
141
|
+
# Optional (defaults shown)
|
|
142
|
+
derived_dir: .derived # relative to pdf_root
|
|
143
|
+
collection_name: research # backend collection key
|
|
144
|
+
|
|
145
|
+
taxonomy: # optional subject-category enforcement
|
|
146
|
+
enabled: false
|
|
147
|
+
allowed_subjects: [] # first path component under pdf_root must match
|
|
148
|
+
|
|
149
|
+
backend:
|
|
150
|
+
type: qmd
|
|
151
|
+
binary: qmd # on PATH, or an absolute path
|
|
152
|
+
timeout_seconds: 30
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
With no config file present, `pdf_root` must still be supplied via `--pdf-root` or `$LIBRARIAN_PDF_ROOT`; every other setting falls back to its permissive default (no taxonomy). See [`examples/`](examples/) for annotated config and taxonomy files.
|
|
156
|
+
|
|
157
|
+
## Architecture
|
|
158
|
+
|
|
159
|
+
Five load-bearing decisions, each documented as an ADR:
|
|
160
|
+
|
|
161
|
+
| ADR | Decision |
|
|
162
|
+
|-----|----------|
|
|
163
|
+
| [ADR-001](docs/adr/ADR-001-storage-substrate.md) | QMD as the search substrate, not a standalone vector DB |
|
|
164
|
+
| [ADR-002](docs/adr/ADR-002-page-level-granularity.md) | Page-per-file granularity, not paragraph chunks |
|
|
165
|
+
| [ADR-003](docs/adr/ADR-003-explicit-invocation.md) | Explicit CLI invocation, not an ambient hook (yet) |
|
|
166
|
+
| [ADR-004](docs/adr/ADR-004-pdf-canonical-cache.md) | PDFs canonical; markdown is a disposable cache |
|
|
167
|
+
| [ADR-005](docs/adr/ADR-005-no-new-services.md) | A Python package + adapter, not a long-running service |
|
|
168
|
+
|
|
169
|
+
## Development
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
pip install -e ".[dev]"
|
|
173
|
+
pytest # 36+ cases
|
|
174
|
+
ruff check . # lint
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Roadmap
|
|
178
|
+
|
|
179
|
+
- Additional backends beyond QMD (the adapter boundary already exists).
|
|
180
|
+
- Publish to PyPI for a plain `pip install proactive-librarian`.
|
|
181
|
+
- Optional ambient/agent invocation surface (the explicit-CLI decision in ADR-003 is revisitable as adoption grows).
|
|
182
|
+
- Richer query output formats (JSON, BibTeX-style citation export).
|
|
183
|
+
|
|
184
|
+
## Contributing
|
|
185
|
+
|
|
186
|
+
Issues and pull requests welcome. The codebase is small and well-tested — run `pytest` and `ruff check .` before opening a PR. If you're adding a backend, start from the QMD adapter and the ADR-001 rationale.
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
[MIT](LICENSE) © David Orban.
|
|
191
|
+
|
|
192
|
+
## Status
|
|
193
|
+
|
|
194
|
+
**v0.1.0** — in single-user production use; the API will stabilise as more backends and surfaces are added. The cache format and CLI are stable. See [CHANGELOG.md](CHANGELOG.md).
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# proactive-librarian
|
|
2
|
+
|
|
3
|
+
[Python 3.9+](https://www.python.org/)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
[Changelog](CHANGELOG.md)
|
|
6
|
+
|
|
7
|
+
> Turn a directory of PDFs into a searchable index and get back **page-accurate citations** like `Report.pdf:p.27`.
|
|
8
|
+
|
|
9
|
+
`proactive-librarian` indexes a large document library once, then answers queries with the exact file **and page number** — ready to paste into a memo, brief, paper, or deck. It is a sharp, single-purpose CLI tool: not a chatbot, not a RAG framework, not a vector database you have to operate. Backend-agnostic, with a [QMD](https://github.com/eatonphil/qmd) adapter shipped in the box.
|
|
10
|
+
|
|
11
|
+
It replaces the brutal friction of citing your own research at scale — folder grep is slow, filenames lie, and Ctrl+F across thousands of PDFs is unworkable.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- 📄 **Page-accurate citations** — results come back as `file.pdf:p.N`, the page number derived from the cache path, not parsed (and mis-parsed) from a snippet.
|
|
16
|
+
- 🔍 **Hybrid retrieval** — keyword + semantic search via the backend (QMD adapter included; bring your own).
|
|
17
|
+
- ♻️ **Deterministic, reproducible cache** — rebuildable from the PDFs at any time; safe to delete.
|
|
18
|
+
- 🧱 **Page-per-file layout** — pure page text with no metadata leakage into snippets (see [why](#why-page-per-file)).
|
|
19
|
+
- 🔐 **Robust ingest** — handles AES-encrypted PDFs, sanitises surrogate/4-byte characters, and logs every failure with a reason to `_failures.json`.
|
|
20
|
+
- 🗂️ **Optional taxonomy enforcement** — constrain the library to a known subject/geography scheme and fail loudly on stragglers.
|
|
21
|
+
- ⚡ **Incremental** — sha1 manifest means re-ingest only touches changed PDFs.
|
|
22
|
+
- ✅ **Tested** — 36+ pytest cases over hashing, path mapping, manifest persistence, atomic writes, failure logging, and taxonomy validation.
|
|
23
|
+
|
|
24
|
+
## Who it's for
|
|
25
|
+
|
|
26
|
+
Built originally for one person's research-citation workflow, but it solves a general problem — **anyone who maintains a large PDF/document library and needs to quote it with precision**:
|
|
27
|
+
|
|
28
|
+
- researchers and analysts citing a corpus of papers/reports,
|
|
29
|
+
- policy, legal, and compliance teams working from regulatory documents,
|
|
30
|
+
- technical writers and maintainers grounding docs in source material,
|
|
31
|
+
- AI agents/assistants that need **citation-grounded retrieval** they can shell out to (`librarian query "…"`).
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
Not yet on PyPI. Install from source:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install git+https://github.com/davidorban/proactive-librarian
|
|
39
|
+
# or, for local development:
|
|
40
|
+
git clone https://github.com/davidorban/proactive-librarian
|
|
41
|
+
cd proactive-librarian
|
|
42
|
+
python -m venv .venv && source .venv/bin/activate
|
|
43
|
+
pip install -e ".[dev]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
You also need a search backend. The shipped adapter targets **QMD** — install it from https://github.com/eatonphil/qmd (or implement another backend; see [ADR-001](docs/adr/ADR-001-storage-substrate.md)).
|
|
47
|
+
|
|
48
|
+
## Quickstart
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# 1. Point at your PDF library
|
|
52
|
+
cat > proactive-librarian.yaml <<'EOF'
|
|
53
|
+
pdf_root: /path/to/your/pdfs
|
|
54
|
+
collection_name: research
|
|
55
|
+
EOF
|
|
56
|
+
|
|
57
|
+
# 2. Index (first run; ~30 min for ~1,400 PDFs, then incremental)
|
|
58
|
+
librarian ingest --full
|
|
59
|
+
|
|
60
|
+
# 3. Tell your backend about the new cache (one-time; see backend docs)
|
|
61
|
+
|
|
62
|
+
# 4. Query
|
|
63
|
+
librarian query "stablecoin regulation in the UAE"
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Example output:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
1. Digital-Assets-Regulatory-Landscape.pdf:p.27 (0.81)
|
|
70
|
+
"...the payment-token services regulation establishes a licensing regime for fiat-backed..."
|
|
71
|
+
2. ADGM-Virtual-Asset-Framework-2025.pdf:p.12 (0.74)
|
|
72
|
+
"...issuers must maintain reserves on a 1:1 basis, audited monthly by an approved..."
|
|
73
|
+
3. Stablecoin-Market-Structure.pdf:p.4 (0.69)
|
|
74
|
+
"...redemption-at-par guarantees are the load-bearing assumption behind..."
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Paste the `file.pdf:p.N` straight into your draft.
|
|
78
|
+
|
|
79
|
+
## What it produces
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
<pdf_root>/.derived/
|
|
83
|
+
├── _manifest.json # sha1 cache for incremental re-ingest
|
|
84
|
+
├── _failures.json # last run's failed PDFs (with reasons)
|
|
85
|
+
└── <rel-pdf-stem-tree>/
|
|
86
|
+
├── _meta.json # per-PDF metadata sidecar (NOT indexed)
|
|
87
|
+
├── p0001.md # pure page text — no frontmatter
|
|
88
|
+
├── p0002.md
|
|
89
|
+
└── ...
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The cache lives next to the PDFs in `.derived/` (gitignored), is **reproducible** from the source PDFs, and is safe to delete and rebuild. The PDFs stay canonical; the markdown is a disposable index ([ADR-004](docs/adr/ADR-004-pdf-canonical-cache.md)).
|
|
93
|
+
|
|
94
|
+
## Why page-per-file?
|
|
95
|
+
|
|
96
|
+
Single-file-per-PDF chunking has two failure modes:
|
|
97
|
+
|
|
98
|
+
1. **Metadata leaks into snippets.** YAML frontmatter gets indexed with the content, so citation snippets end up containing `sha1: abc…` and `source_pdf: …`.
|
|
99
|
+
2. **Page numbers become unreliable.** When a chunk spans page boundaries, parsing `## Page N` markers collapses everything to `p.1` and citations lose precision.
|
|
100
|
+
|
|
101
|
+
Page-per-file fixes both: each markdown file holds only one page's text (nothing to leak), and the page number is a property of the path (`p0042.md` → page 42). The trade-off is a high file count (~1,377 PDFs × ~50 pages ≈ 67k files); modern filesystems (APFS, ext4) handle it fine. Full rationale in [ADR-002](docs/adr/ADR-002-page-level-granularity.md).
|
|
102
|
+
|
|
103
|
+
## Configuration
|
|
104
|
+
|
|
105
|
+
`proactive-librarian.yaml` in the working directory, or `--config /path/to/config.yaml`:
|
|
106
|
+
|
|
107
|
+
```yaml
|
|
108
|
+
# Required
|
|
109
|
+
pdf_root: /path/to/pdfs
|
|
110
|
+
|
|
111
|
+
# Optional (defaults shown)
|
|
112
|
+
derived_dir: .derived # relative to pdf_root
|
|
113
|
+
collection_name: research # backend collection key
|
|
114
|
+
|
|
115
|
+
taxonomy: # optional subject-category enforcement
|
|
116
|
+
enabled: false
|
|
117
|
+
allowed_subjects: [] # first path component under pdf_root must match
|
|
118
|
+
|
|
119
|
+
backend:
|
|
120
|
+
type: qmd
|
|
121
|
+
binary: qmd # on PATH, or an absolute path
|
|
122
|
+
timeout_seconds: 30
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
With no config file present, `pdf_root` must still be supplied via `--pdf-root` or `$LIBRARIAN_PDF_ROOT`; every other setting falls back to its permissive default (no taxonomy). See [`examples/`](examples/) for annotated config and taxonomy files.
|
|
126
|
+
|
|
127
|
+
## Architecture
|
|
128
|
+
|
|
129
|
+
Five load-bearing decisions, each documented as an ADR:
|
|
130
|
+
|
|
131
|
+
| ADR | Decision |
|
|
132
|
+
|-----|----------|
|
|
133
|
+
| [ADR-001](docs/adr/ADR-001-storage-substrate.md) | QMD as the search substrate, not a standalone vector DB |
|
|
134
|
+
| [ADR-002](docs/adr/ADR-002-page-level-granularity.md) | Page-per-file granularity, not paragraph chunks |
|
|
135
|
+
| [ADR-003](docs/adr/ADR-003-explicit-invocation.md) | Explicit CLI invocation, not an ambient hook (yet) |
|
|
136
|
+
| [ADR-004](docs/adr/ADR-004-pdf-canonical-cache.md) | PDFs canonical; markdown is a disposable cache |
|
|
137
|
+
| [ADR-005](docs/adr/ADR-005-no-new-services.md) | A Python package + adapter, not a long-running service |
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pip install -e ".[dev]"
|
|
143
|
+
pytest # 36+ cases
|
|
144
|
+
ruff check . # lint
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Roadmap
|
|
148
|
+
|
|
149
|
+
- Additional backends beyond QMD (the adapter boundary already exists).
|
|
150
|
+
- Publish to PyPI for a plain `pip install proactive-librarian`.
|
|
151
|
+
- Optional ambient/agent invocation surface (the explicit-CLI decision in ADR-003 is revisitable as adoption grows).
|
|
152
|
+
- Richer query output formats (JSON, BibTeX-style citation export).
|
|
153
|
+
|
|
154
|
+
## Contributing
|
|
155
|
+
|
|
156
|
+
Issues and pull requests welcome. The codebase is small and well-tested — run `pytest` and `ruff check .` before opening a PR. If you're adding a backend, start from the QMD adapter and the ADR-001 rationale.
|
|
157
|
+
|
|
158
|
+
## License
|
|
159
|
+
|
|
160
|
+
[MIT](LICENSE) © David Orban.
|
|
161
|
+
|
|
162
|
+
## Status
|
|
163
|
+
|
|
164
|
+
**v0.1.0** — in single-user production use; the API will stabilise as more backends and surfaces are added. The cache format and CLI are stable. See [CHANGELOG.md](CHANGELOG.md).
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Command-line entry: `librarian ingest` and `librarian query`."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from proactive_librarian import __version__
|
|
10
|
+
from proactive_librarian.config import load_config
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _add_common_args(p: argparse.ArgumentParser) -> None:
    """Register the options shared by every subcommand on parser `p`.

    Adds `--config` (converted to a `Path`), `--pdf-root` and `--collection`
    (plain strings). Each one defaults to `None` when not given.
    """
    shared_options = (
        ("--config", Path,
         "Path to proactive-librarian.yaml (overrides default discovery)"),
        ("--pdf-root", str, "Override pdf_root from config"),
        ("--collection", str, "Override backend collection_name"),
    )
    for flag, converter, help_text in shared_options:
        p.add_argument(flag, type=converter, default=None, help=help_text)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_parser() -> argparse.ArgumentParser:
    """Assemble the `librarian` argument parser and its two subcommands.

    The top-level parser carries `--version`; a subcommand (`ingest` or
    `query`) is mandatory and is stored on the namespace as `command`.
    Both subcommands also receive the shared options added by
    `_add_common_args`.
    """
    root = argparse.ArgumentParser(
        prog="librarian",
        description="Index a PDF library and answer queries with page-accurate citations.",
    )
    root.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    commands = root.add_subparsers(dest="command", required=True)

    # ingest: walk the library and rebuild/refresh the derived cache.
    ingest_cmd = commands.add_parser(
        "ingest",
        help="Walk pdf_root and refresh the cache",
    )
    _add_common_args(ingest_cmd)
    ingest_cmd.add_argument(
        "--full",
        action="store_true",
        help="Ignore manifest sha1 cache; re-extract every PDF",
    )
    # store_true with default=True: the value is True whether or not the
    # flag is passed, so the flag only documents the default mode.
    ingest_cmd.add_argument(
        "--incremental",
        action="store_true",
        default=True,
        help="Default mode: only process new or sha1-changed PDFs",
    )
    ingest_cmd.add_argument(
        "--dry-run",
        action="store_true",
        help="Walk + validate + sha1 but write nothing",
    )
    ingest_cmd.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Process at most N PDFs (useful for testing)",
    )
    ingest_cmd.add_argument(
        "--category",
        type=str,
        default=None,
        help="Only process PDFs under this exact top-level subject",
    )
    ingest_cmd.add_argument(
        "--clean",
        action="store_true",
        help="Delete the derived cache first (use for format migrations)",
    )
    ingest_cmd.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print every file decision",
    )

    # query: search the already-indexed library.
    query_cmd = commands.add_parser("query", help="Search the indexed library")
    _add_common_args(query_cmd)
    query_cmd.add_argument("query", type=str, help="Natural language query")
    query_cmd.add_argument(
        "--limit", "-n",
        type=int,
        default=5,
        help="Max results (default 5)",
    )
    query_cmd.add_argument(
        "--json",
        action="store_true",
        help="Raw JSON output instead of formatted citations",
    )

    return root
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _build_overrides(args: argparse.Namespace) -> dict:
    """Collect the config overrides that were actually given on the CLI.

    Maps the `pdf_root` attribute to the `pdf_root` key and the `collection`
    attribute to the `collection_name` key. Attributes that are missing or
    falsy (`None`, empty string) are left out of the result.
    """
    attr_to_key = (
        ("pdf_root", "pdf_root"),
        ("collection", "collection_name"),
    )
    return {
        key: getattr(args, attr)
        for attr, key in attr_to_key
        if getattr(args, attr, None)
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Run the `librarian` CLI and return a process exit code.

    Args:
        argv: arguments to parse; `None` lets argparse read `sys.argv[1:]`.

    Returns:
        2 if the configuration could not be loaded (the reason is printed to
        stderr). For `ingest`: 1 if the run reported any errors, else 0.
        For `query`: whatever `run_query` returns.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        config = load_config(
            explicit_path=args.config,
            cli_overrides=_build_overrides(args),
        )
    except (OSError, ValueError) as e:
        # OSError (the base of FileNotFoundError) also covers a --config path
        # that exists but cannot be read, e.g. a directory or a file without
        # read permission; those are config errors too, not crashes.
        print(f"Config error: {e}", file=sys.stderr)
        return 2

    # Subcommand modules are imported lazily so each command only loads
    # what it needs.
    if args.command == "ingest":
        from proactive_librarian.ingest import run_ingest
        stats = run_ingest(
            config,
            full=args.full,
            dry_run=args.dry_run,
            limit=args.limit,
            category=args.category,
            clean=args.clean,
            verbose=args.verbose,
        )
        return 1 if stats.errors > 0 else 0

    if args.command == "query":
        from proactive_librarian.query import run_query
        return run_query(
            args.query, config, limit=args.limit, as_json=args.json,
        )

    # Defensive fallback for a command name that is neither of the above.
    parser.print_help()
    return 1
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Configuration loader for proactive-librarian.
|
|
2
|
+
|
|
3
|
+
Everything that was hardcoded in the original personal-vault implementation
|
|
4
|
+
lives here as overridable settings. The intent is "config file in repo root,
|
|
5
|
+
sensible defaults everywhere else."
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Config filenames looked for during default discovery, in this order.
DEFAULT_CONFIG_FILENAMES = (
    "proactive-librarian.yaml",
    "proactive-librarian.yml",
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class TaxonomyConfig:
    """Immutable settings for the optional subject-category check."""

    # Enforcement switch; off unless explicitly enabled.
    enabled: bool = False

    # Subject names that are accepted; empty by default.
    allowed_subjects: tuple[str, ...] = ()

    # Human-readable pointer to a docs file, if one exists.
    guide_reference: Optional[str] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class BackendConfig:
    """Immutable retrieval-backend settings. QMD is the only backend implemented today."""

    # Backend identifier.
    type: str = "qmd"

    # Name or path of the backend executable.
    binary: str = "qmd"

    # Timeout, in seconds, for backend calls.
    timeout_seconds: int = 30
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class Config:
    """Complete, immutable settings for one proactive-librarian instance.

    Build it with `load_config(...)`. Because the dataclass is frozen, one
    instance can be shared safely across threads / passes.
    """

    # Only `pdf_root` has no default; everything else is optional.
    pdf_root: Path
    derived_dir: str = ".derived"
    collection_name: str = "research"
    page_filename_padding: int = 4
    taxonomy: TaxonomyConfig = field(default_factory=TaxonomyConfig)
    backend: BackendConfig = field(default_factory=BackendConfig)

    @property
    def derived_root(self) -> Path:
        """Cache directory: `derived_dir` joined onto `pdf_root`."""
        return self.pdf_root.joinpath(self.derived_dir)

    @property
    def manifest_path(self) -> Path:
        """Location of `_manifest.json` inside the cache directory."""
        return self.derived_root.joinpath("_manifest.json")

    @property
    def failures_path(self) -> Path:
        """Location of `_failures.json` inside the cache directory."""
        return self.derived_root.joinpath("_failures.json")

    def page_filename(self, n: int) -> str:
        """Return the markdown filename for page `n`, e.g. `p0042.md` at padding 4."""
        zero_padded = format(n, f"0{self.page_filename_padding}d")
        return "p" + zero_padded + ".md"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _find_default_config(start: Path) -> Optional[Path]:
    """Return the first default-named config file found directly in `start`.

    `start` is typically the current working directory. There is no upward
    search through parent directories — explicit beats implicit.

    Args:
        start: directory to look in.

    Returns:
        The path of the first name in `DEFAULT_CONFIG_FILENAMES` that exists
        as a file in `start`, or `None` if none does.
    """
    for name in DEFAULT_CONFIG_FILENAMES:
        candidate = start / name
        # is_file() rather than exists(): a directory that happens to carry a
        # config filename cannot be read as YAML, and must not shadow a real
        # config file listed later.
        if candidate.is_file():
            return candidate
    return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def load_config(
    explicit_path: Optional[Path] = None,
    cli_overrides: Optional[dict] = None,
) -> Config:
    """Resolve config from (in priority order): CLI overrides > config file > defaults.

    `pdf_root` additionally falls back to `$LIBRARIAN_PDF_ROOT` when neither
    the CLI overrides nor the config file provide it.

    Args:
        explicit_path: path passed via `--config`. Required to exist if given.
            When omitted, the current working directory is searched for a
            default-named config file.
        cli_overrides: a dict like `{"pdf_root": "/x/y"}` from argparse. Wins over file values.

    Returns:
        Frozen `Config`.

    Raises:
        FileNotFoundError: if `explicit_path` was passed but doesn't exist.
        ValueError: if the file is not valid YAML, if the file or its
            `taxonomy` / `backend` section is not a mapping, if
            `allowed_subjects` is not a list, if a numeric setting is not an
            integer, or if `pdf_root` cannot be resolved from any source.
    """
    cli_overrides = cli_overrides or {}

    config_path: Optional[Path]
    if explicit_path is not None:
        if not explicit_path.exists():
            raise FileNotFoundError(f"Config file not found: {explicit_path}")
        config_path = explicit_path
    else:
        config_path = _find_default_config(Path.cwd())

    file_data: dict = {}
    if config_path is not None:
        try:
            loaded = yaml.safe_load(config_path.read_text(encoding="utf-8"))
        except yaml.YAMLError as e:
            raise ValueError(f"Invalid YAML in {config_path}: {e}") from e
        # An empty file loads as None; a file holding a list or scalar is
        # valid YAML but cannot be merged as settings.
        file_data = _as_mapping(loaded, f"the top level of {config_path}")

    # Layer: defaults < file < CLI
    merged: dict = {**file_data, **cli_overrides}

    pdf_root_str = merged.get("pdf_root") or os.environ.get("LIBRARIAN_PDF_ROOT")
    if not pdf_root_str:
        raise ValueError(
            "pdf_root is required. Set it in proactive-librarian.yaml, "
            "via --pdf-root, or via $LIBRARIAN_PDF_ROOT."
        )
    pdf_root = Path(pdf_root_str).expanduser().resolve()

    taxonomy_data = _as_mapping(merged.get("taxonomy"), "'taxonomy'")
    taxonomy = TaxonomyConfig(
        enabled=bool(taxonomy_data.get("enabled", False)),
        allowed_subjects=_as_subjects(taxonomy_data.get("allowed_subjects")),
        guide_reference=taxonomy_data.get("guide_reference"),
    )

    backend_data = _as_mapping(merged.get("backend"), "'backend'")
    backend = BackendConfig(
        type=backend_data.get("type", "qmd"),
        binary=backend_data.get("binary", "qmd"),
        timeout_seconds=_as_int(
            backend_data.get("timeout_seconds", 30), "backend.timeout_seconds"
        ),
    )

    return Config(
        pdf_root=pdf_root,
        derived_dir=merged.get("derived_dir", ".derived"),
        collection_name=merged.get("collection_name", "research"),
        page_filename_padding=_as_int(
            merged.get("page_filename_padding", 4), "page_filename_padding"
        ),
        taxonomy=taxonomy,
        backend=backend,
    )


def _as_mapping(value: object, where: str) -> dict:
    """Return `value` as a dict; falsy values count as empty, other non-dicts raise ValueError."""
    if not value:
        return {}
    if not isinstance(value, dict):
        raise ValueError(
            f"Expected a mapping at {where}, got {type(value).__name__}"
        )
    return value


def _as_subjects(value: object) -> tuple:
    """Normalise `allowed_subjects` to a tuple; a bare string is one subject, not its characters."""
    if not value:
        return ()
    if isinstance(value, str):
        return (value,)
    try:
        return tuple(value)
    except TypeError as e:
        raise ValueError(
            f"taxonomy.allowed_subjects must be a list, got {value!r}"
        ) from e


def _as_int(value: object, key: str) -> int:
    """Convert a setting to int, reporting any bad value (including null) as ValueError."""
    try:
        return int(value)
    except (TypeError, ValueError) as e:
        raise ValueError(f"{key} must be an integer, got {value!r}") from e
|