wst-library 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wst_library-0.1.0/LICENSE +50 -0
- wst_library-0.1.0/PKG-INFO +178 -0
- wst_library-0.1.0/README.md +147 -0
- wst_library-0.1.0/pyproject.toml +58 -0
- wst_library-0.1.0/setup.cfg +4 -0
- wst_library-0.1.0/src/wst/__init__.py +1 -0
- wst_library-0.1.0/src/wst/ai.py +110 -0
- wst_library-0.1.0/src/wst/backup.py +218 -0
- wst_library-0.1.0/src/wst/browse.py +195 -0
- wst_library-0.1.0/src/wst/cli.py +351 -0
- wst_library-0.1.0/src/wst/config.py +18 -0
- wst_library-0.1.0/src/wst/db.py +249 -0
- wst_library-0.1.0/src/wst/ingest.py +142 -0
- wst_library-0.1.0/src/wst/models.py +51 -0
- wst_library-0.1.0/src/wst/pdf.py +37 -0
- wst_library-0.1.0/src/wst/storage.py +95 -0
- wst_library-0.1.0/src/wst_library.egg-info/PKG-INFO +178 -0
- wst_library-0.1.0/src/wst_library.egg-info/SOURCES.txt +28 -0
- wst_library-0.1.0/src/wst_library.egg-info/dependency_links.txt +1 -0
- wst_library-0.1.0/src/wst_library.egg-info/entry_points.txt +2 -0
- wst_library-0.1.0/src/wst_library.egg-info/requires.txt +8 -0
- wst_library-0.1.0/src/wst_library.egg-info/top_level.txt +1 -0
- wst_library-0.1.0/tests/test_ai.py +62 -0
- wst_library-0.1.0/tests/test_backup.py +78 -0
- wst_library-0.1.0/tests/test_cli.py +77 -0
- wst_library-0.1.0/tests/test_config.py +34 -0
- wst_library-0.1.0/tests/test_db.py +122 -0
- wst_library-0.1.0/tests/test_ingest.py +68 -0
- wst_library-0.1.0/tests/test_models.py +76 -0
- wst_library-0.1.0/tests/test_storage.py +138 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
MIT License with Commons Clause
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 cnexans
|
|
4
|
+
|
|
5
|
+
"Commons Clause" License Condition v1.0
|
|
6
|
+
|
|
7
|
+
The Software is provided to you by the Licensor under the License, as
|
|
8
|
+
defined below, subject to the following condition.
|
|
9
|
+
|
|
10
|
+
Without limiting other conditions in the License, the grant of rights
|
|
11
|
+
under the License will not include, and the License does not grant to
|
|
12
|
+
you, the right to Sell the Software.
|
|
13
|
+
|
|
14
|
+
For purposes of the foregoing, "Sell" means practicing any or all of
|
|
15
|
+
the rights granted to you under the License to provide to third
|
|
16
|
+
parties, for a fee or other consideration (including without
|
|
17
|
+
limitation fees for hosting or consulting/support services related to
|
|
18
|
+
the Software), a product or service whose value derives, entirely or
|
|
19
|
+
substantially, from the functionality of the Software. Any license
|
|
20
|
+
notice or attribution required by the License must also include this
|
|
21
|
+
Commons Clause License Condition notice.
|
|
22
|
+
|
|
23
|
+
Software: wst
|
|
24
|
+
|
|
25
|
+
License: MIT License
|
|
26
|
+
|
|
27
|
+
Licensor: cnexans
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
MIT License
|
|
32
|
+
|
|
33
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
34
|
+
a copy of this software and associated documentation files (the
|
|
35
|
+
"Software"), to deal in the Software without restriction, including
|
|
36
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
37
|
+
distribute, and/or sublicense copies of the Software, and to permit
|
|
38
|
+
persons to whom the Software is furnished to do so, subject to the
|
|
39
|
+
following conditions:
|
|
40
|
+
|
|
41
|
+
The above copyright notice and this permission notice shall be
|
|
42
|
+
included in all copies or substantial portions of the Software.
|
|
43
|
+
|
|
44
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
45
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
46
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
47
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
48
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
49
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
50
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wst-library
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI tool for organizing books and PDFs with AI-powered metadata
|
|
5
|
+
Author: cnexans
|
|
6
|
+
License-Expression: LicenseRef-Proprietary
|
|
7
|
+
Project-URL: Homepage, https://github.com/cnexans/wst
|
|
8
|
+
Project-URL: Repository, https://github.com/cnexans/wst
|
|
9
|
+
Project-URL: Issues, https://github.com/cnexans/wst/issues
|
|
10
|
+
Keywords: pdf,books,library,metadata,cli,organizer
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: click>=8.0
|
|
24
|
+
Requires-Dist: pymupdf>=1.24
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Requires-Dist: InquirerPy>=0.3
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# wst — Wan Shi Tong
|
|
33
|
+
|
|
34
|
+
<div align="center">
|
|
35
|
+
|
|
36
|
+
<img src="docs/images/wan-shi-tong.png" alt="Wan Shi Tong" width="300">
|
|
37
|
+
|
|
38
|
+
*"I am Wan Shi Tong, he who knows ten thousand things."*
|
|
39
|
+
|
|
40
|
+
<sub>Character from Avatar: The Last Airbender. Avatar: The Last Airbender is a trademark of Viacom International Inc. Image used for illustrative purposes only.</sub>
|
|
41
|
+
|
|
42
|
+
</div>
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
CLI tool for organizing books and PDFs with AI-powered metadata generation.
|
|
47
|
+
|
|
48
|
+
Named after **Wan Shi Tong**, the ancient spirit who collected every piece of knowledge in the world and guarded the great library in the desert. This tool aspires to do the same for your PDFs — just with less hostility toward humans.
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- **AI-powered metadata**: Automatically extracts and completes metadata (title, author, type, year, summary, tags, etc.) using Claude CLI with web search for missing fields (year, ISBN, publisher)
|
|
53
|
+
- **Organized library**: Files sorted by type (`books/`, `papers/`, `notes/`, `exercises/`, `guides/`) with consistent naming (`Author - Title (Year).pdf`)
|
|
54
|
+
- **SQLite search index**: Full-text search across title, author, tags, subject, and summary via FTS5
|
|
55
|
+
- **Interactive browser**: Fuzzy-search your library, view and edit metadata interactively
|
|
56
|
+
- **Cloud backup**: Back up files to iCloud Drive (macOS/Windows), with an extensible provider system for future S3 support
|
|
57
|
+
- **Extensible backends**: Abstract layers for AI (Claude CLI, future API/SDK) and storage (local filesystem, future S3)
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
### pipx (recommended, all platforms)
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pipx install wst-library
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### pip
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install wst-library
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Homebrew (macOS/Linux)
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
brew tap cnexans/tap
|
|
77
|
+
brew install wst
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Chocolatey (Windows)
|
|
81
|
+
|
|
82
|
+
```powershell
|
|
83
|
+
choco install wst
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### From source
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
git clone https://github.com/cnexans/wst.git
|
|
90
|
+
cd wst
|
|
91
|
+
make install
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Quick Start
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Ingest PDFs from a folder
|
|
98
|
+
wst ingest ~/Documents/papers/
|
|
99
|
+
|
|
100
|
+
# Ingest from current directory
|
|
101
|
+
wst ingest .
|
|
102
|
+
|
|
103
|
+
# Ingest from default inbox (~/wst/inbox/)
|
|
104
|
+
wst ingest
|
|
105
|
+
|
|
106
|
+
# Ingest with manual confirmation for each file
|
|
107
|
+
wst ingest --confirm
|
|
108
|
+
|
|
109
|
+
# Re-ingest files with fresh AI metadata (e.g. after enabling web search)
|
|
110
|
+
wst ingest --reprocess
|
|
111
|
+
|
|
112
|
+
# Search
|
|
113
|
+
wst search "machine learning"
|
|
114
|
+
wst search --author "Knuth"
|
|
115
|
+
wst search --type textbook
|
|
116
|
+
wst search --subject "Mathematics"
|
|
117
|
+
|
|
118
|
+
# List all documents
|
|
119
|
+
wst list
|
|
120
|
+
wst list --type paper --sort year
|
|
121
|
+
|
|
122
|
+
# Show full details
|
|
123
|
+
wst show 1
|
|
124
|
+
wst show "Design Patterns"
|
|
125
|
+
|
|
126
|
+
# Interactive browser — fuzzy search, view and edit metadata
|
|
127
|
+
wst browse
|
|
128
|
+
|
|
129
|
+
# Edit a specific document
|
|
130
|
+
wst edit 1
|
|
131
|
+
wst edit "Player's Handbook"
|
|
132
|
+
|
|
133
|
+
# Backup to iCloud
|
|
134
|
+
wst backup icloud # interactive: all or select file
|
|
135
|
+
wst backup icloud 1 # backup specific file by ID
|
|
136
|
+
wst backup icloud "Player's Handbook" # backup by title
|
|
137
|
+
wst backup # interactive: choose provider
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Commands
|
|
141
|
+
|
|
142
|
+
| Command | Description |
|
|
143
|
+
|---------|-------------|
|
|
144
|
+
| `wst ingest [PATH] [--confirm] [--reprocess]` | Ingest PDFs from a path or the inbox, generate metadata with AI |
|
|
145
|
+
| `wst search <query> [--author] [--type] [--subject]` | Full-text search across the index |
|
|
146
|
+
| `wst list [--type] [--sort]` | List all documents in the library |
|
|
147
|
+
| `wst show <id-or-title>` | Show complete metadata for a document |
|
|
148
|
+
| `wst edit <id-or-title>` | Interactively edit metadata for a document |
|
|
149
|
+
| `wst browse` | Interactive TUI for browsing and editing documents |
|
|
150
|
+
| `wst backup [provider] [id-or-title]` | Back up files to a cloud provider (iCloud, future S3) |
|
|
151
|
+
|
|
152
|
+
## Library Structure
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
~/wst/
|
|
156
|
+
├── inbox/ # PDFs pending ingestion
|
|
157
|
+
└── library/
|
|
158
|
+
├── books/ # book, novel, textbook
|
|
159
|
+
├── papers/ # paper
|
|
160
|
+
├── notes/ # class-notes
|
|
161
|
+
├── exercises/ # exercises
|
|
162
|
+
├── guides/ # guide-theory, guide-practice
|
|
163
|
+
└── wst.db # SQLite index
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Documentation
|
|
167
|
+
|
|
168
|
+
See [docs/README.md](docs/README.md) for architecture details and diagrams.
|
|
169
|
+
|
|
170
|
+
## Requirements
|
|
171
|
+
|
|
172
|
+
- Python 3.11+
|
|
173
|
+
- `claude` CLI (authenticated) for AI metadata generation
|
|
174
|
+
- macOS, Windows, or Linux
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT with Commons Clause — free to use, modify, and distribute. Commercial sale rights reserved to the author. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# wst — Wan Shi Tong
|
|
2
|
+
|
|
3
|
+
<div align="center">
|
|
4
|
+
|
|
5
|
+
<img src="docs/images/wan-shi-tong.png" alt="Wan Shi Tong" width="300">
|
|
6
|
+
|
|
7
|
+
*"I am Wan Shi Tong, he who knows ten thousand things."*
|
|
8
|
+
|
|
9
|
+
<sub>Character from Avatar: The Last Airbender. Avatar: The Last Airbender is a trademark of Viacom International Inc. Image used for illustrative purposes only.</sub>
|
|
10
|
+
|
|
11
|
+
</div>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
CLI tool for organizing books and PDFs with AI-powered metadata generation.
|
|
16
|
+
|
|
17
|
+
Named after **Wan Shi Tong**, the ancient spirit who collected every piece of knowledge in the world and guarded the great library in the desert. This tool aspires to do the same for your PDFs — just with less hostility toward humans.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **AI-powered metadata**: Automatically extracts and completes metadata (title, author, type, year, summary, tags, etc.) using Claude CLI with web search for missing fields (year, ISBN, publisher)
|
|
22
|
+
- **Organized library**: Files sorted by type (`books/`, `papers/`, `notes/`, `exercises/`, `guides/`) with consistent naming (`Author - Title (Year).pdf`)
|
|
23
|
+
- **SQLite search index**: Full-text search across title, author, tags, subject, and summary via FTS5
|
|
24
|
+
- **Interactive browser**: Fuzzy-search your library, view and edit metadata interactively
|
|
25
|
+
- **Cloud backup**: Back up files to iCloud Drive (macOS/Windows), with an extensible provider system for future S3 support
|
|
26
|
+
- **Extensible backends**: Abstract layers for AI (Claude CLI, future API/SDK) and storage (local filesystem, future S3)
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
### pipx (recommended, all platforms)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pipx install wst-library
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### pip
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install wst-library
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Homebrew (macOS/Linux)
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
brew tap cnexans/tap
|
|
46
|
+
brew install wst
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Chocolatey (Windows)
|
|
50
|
+
|
|
51
|
+
```powershell
|
|
52
|
+
choco install wst
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### From source
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/cnexans/wst.git
|
|
59
|
+
cd wst
|
|
60
|
+
make install
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Ingest PDFs from a folder
|
|
67
|
+
wst ingest ~/Documents/papers/
|
|
68
|
+
|
|
69
|
+
# Ingest from current directory
|
|
70
|
+
wst ingest .
|
|
71
|
+
|
|
72
|
+
# Ingest from default inbox (~/wst/inbox/)
|
|
73
|
+
wst ingest
|
|
74
|
+
|
|
75
|
+
# Ingest with manual confirmation for each file
|
|
76
|
+
wst ingest --confirm
|
|
77
|
+
|
|
78
|
+
# Re-ingest files with fresh AI metadata (e.g. after enabling web search)
|
|
79
|
+
wst ingest --reprocess
|
|
80
|
+
|
|
81
|
+
# Search
|
|
82
|
+
wst search "machine learning"
|
|
83
|
+
wst search --author "Knuth"
|
|
84
|
+
wst search --type textbook
|
|
85
|
+
wst search --subject "Mathematics"
|
|
86
|
+
|
|
87
|
+
# List all documents
|
|
88
|
+
wst list
|
|
89
|
+
wst list --type paper --sort year
|
|
90
|
+
|
|
91
|
+
# Show full details
|
|
92
|
+
wst show 1
|
|
93
|
+
wst show "Design Patterns"
|
|
94
|
+
|
|
95
|
+
# Interactive browser — fuzzy search, view and edit metadata
|
|
96
|
+
wst browse
|
|
97
|
+
|
|
98
|
+
# Edit a specific document
|
|
99
|
+
wst edit 1
|
|
100
|
+
wst edit "Player's Handbook"
|
|
101
|
+
|
|
102
|
+
# Backup to iCloud
|
|
103
|
+
wst backup icloud # interactive: all or select file
|
|
104
|
+
wst backup icloud 1 # backup specific file by ID
|
|
105
|
+
wst backup icloud "Player's Handbook" # backup by title
|
|
106
|
+
wst backup # interactive: choose provider
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Commands
|
|
110
|
+
|
|
111
|
+
| Command | Description |
|
|
112
|
+
|---------|-------------|
|
|
113
|
+
| `wst ingest [PATH] [--confirm] [--reprocess]` | Ingest PDFs from a path or the inbox, generate metadata with AI |
|
|
114
|
+
| `wst search <query> [--author] [--type] [--subject]` | Full-text search across the index |
|
|
115
|
+
| `wst list [--type] [--sort]` | List all documents in the library |
|
|
116
|
+
| `wst show <id-or-title>` | Show complete metadata for a document |
|
|
117
|
+
| `wst edit <id-or-title>` | Interactively edit metadata for a document |
|
|
118
|
+
| `wst browse` | Interactive TUI for browsing and editing documents |
|
|
119
|
+
| `wst backup [provider] [id-or-title]` | Back up files to a cloud provider (iCloud, future S3) |
|
|
120
|
+
|
|
121
|
+
## Library Structure
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
~/wst/
|
|
125
|
+
├── inbox/ # PDFs pending ingestion
|
|
126
|
+
└── library/
|
|
127
|
+
├── books/ # book, novel, textbook
|
|
128
|
+
├── papers/ # paper
|
|
129
|
+
├── notes/ # class-notes
|
|
130
|
+
├── exercises/ # exercises
|
|
131
|
+
├── guides/ # guide-theory, guide-practice
|
|
132
|
+
└── wst.db # SQLite index
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Documentation
|
|
136
|
+
|
|
137
|
+
See [docs/README.md](docs/README.md) for architecture details and diagrams.
|
|
138
|
+
|
|
139
|
+
## Requirements
|
|
140
|
+
|
|
141
|
+
- Python 3.11+
|
|
142
|
+
- `claude` CLI (authenticated) for AI metadata generation
|
|
143
|
+
- macOS, Windows, or Linux
|
|
144
|
+
|
|
145
|
+
## License
|
|
146
|
+
|
|
147
|
+
MIT with Commons Clause — free to use, modify, and distribute. Commercial sale rights reserved to the author. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "wst-library"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CLI tool for organizing books and PDFs with AI-powered metadata"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "LicenseRef-Proprietary"
|
|
7
|
+
license-files = ["LICENSE"]
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
authors = [{name = "cnexans"}]
|
|
10
|
+
keywords = ["pdf", "books", "library", "metadata", "cli", "organizer"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Environment :: Console",
|
|
14
|
+
"Intended Audience :: End Users/Desktop",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Utilities",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"click>=8.0",
|
|
24
|
+
"pymupdf>=1.24",
|
|
25
|
+
"pydantic>=2.0",
|
|
26
|
+
"InquirerPy>=0.3",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/cnexans/wst"
|
|
31
|
+
Repository = "https://github.com/cnexans/wst"
|
|
32
|
+
Issues = "https://github.com/cnexans/wst/issues"
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
wst = "wst.cli:cli"
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["setuptools>=68"]
|
|
39
|
+
build-backend = "setuptools.build_meta"
|
|
40
|
+
|
|
41
|
+
[project.optional-dependencies]
|
|
42
|
+
dev = [
|
|
43
|
+
"pytest>=8.0",
|
|
44
|
+
"ruff>=0.4",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
target-version = "py311"
|
|
52
|
+
line-length = 100
|
|
53
|
+
|
|
54
|
+
[tool.ruff.lint]
|
|
55
|
+
select = ["E", "F", "I", "W", "UP"]
|
|
56
|
+
|
|
57
|
+
[tool.pytest.ini_options]
|
|
58
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""wst — CLI tool for organizing books and PDFs."""
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
import subprocess
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
|
|
6
|
+
from wst.models import DocumentMetadata
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AIBackend(ABC):
    """Abstract interface for components that generate document metadata."""

    @abstractmethod
    def generate_metadata(
        self, existing_meta: dict, text_sample: str, filename: str
    ) -> DocumentMetadata:
        """Build a ``DocumentMetadata`` for one document.

        Args:
            existing_meta: Metadata already known for the document.
            text_sample: Text extracted from the document.
            filename: Name of the file being processed.

        Returns:
            The metadata produced by the concrete backend.
        """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ClaudeCLIBackend(AIBackend):
    """AI backend that generates metadata by invoking the ``claude`` CLI."""

    # Maximum number of characters of document text embedded in the prompt.
    MAX_TEXT_SAMPLE_CHARS = 8000

    def __init__(self, model: str = "sonnet"):
        """Store the model name passed to ``claude --model``."""
        self.model = model

    def generate_metadata(
        self, existing_meta: dict, text_sample: str, filename: str
    ) -> DocumentMetadata:
        """Ask the ``claude`` CLI for metadata and validate its answer.

        The prompt is sent on stdin. The CLI's stdout is parsed as a JSON
        wrapper whose ``result`` field is expected to carry the model's text,
        from which the metadata object is extracted.

        Args:
            existing_meta: Metadata already known for the document; falsy
                values are omitted from the prompt.
            text_sample: Text extracted from the document (truncated to
                ``MAX_TEXT_SAMPLE_CHARS`` characters in the prompt).
            filename: Name of the file being processed.

        Returns:
            The validated ``DocumentMetadata``.

        Raises:
            RuntimeError: If the CLI exits with a non-zero status.
            subprocess.TimeoutExpired: If the CLI runs longer than 180 seconds.
            ValueError: If stdout is not valid JSON or no JSON object can be
                extracted from the response.
        """
        schema = json.dumps(DocumentMetadata.model_json_schema())
        prompt = self._build_prompt(existing_meta, text_sample, filename, schema)

        result = subprocess.run(
            [
                "claude",
                "-p",
                "--model",
                self.model,
                "--output-format",
                "json",
                "--allowedTools",
                "WebSearch",
                "WebFetch",
            ],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=180,
        )

        if result.returncode != 0:
            raise RuntimeError(f"claude CLI failed: {result.stderr}")

        wrapper = json.loads(result.stdout)
        raw = wrapper.get("result", "")

        return DocumentMetadata.model_validate(self._extract_json(raw))

    @staticmethod
    def _extract_json(text: str) -> dict:
        """Extract the JSON object from an AI response.

        Despite the prompt asking for raw JSON, a reply may wrap the object in
        markdown fences or surround it with prose. Candidates are tried in this
        order: the whole text when it already starts with ``{``, the contents
        of the first fenced block, then the whole text. For each candidate the
        object is decoded starting at its first ``{``; anything after the end
        of the object is ignored.

        Args:
            text: Raw text returned by the model.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If no candidate contains a decodable JSON object.
        """
        text = text.strip()

        fenced = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
        fenced_body = [fenced.group(1)] if fenced else []
        if text.startswith("{"):
            candidates = [text, *fenced_body]
        else:
            # Prefer the fenced block so stray braces in prose are not picked up.
            candidates = [*fenced_body, text]

        decoder = json.JSONDecoder()
        for candidate in candidates:
            start = candidate.find("{")
            if start == -1:
                continue
            try:
                # raw_decode tolerates trailing text after the object.
                parsed, _ = decoder.raw_decode(candidate, start)
            except json.JSONDecodeError:
                continue
            return parsed

        raise ValueError(f"Could not extract JSON from AI response: {text[:200]}")

    def _build_prompt(
        self, existing_meta: dict, text_sample: str, filename: str, schema: str
    ) -> str:
        """Assemble the prompt sent to the model.

        Args:
            existing_meta: Known metadata; entries with falsy values are dropped.
            text_sample: Document text; truncated to ``MAX_TEXT_SAMPLE_CHARS``
                characters with a ``[...truncated]`` marker appended.
            filename: Name of the file being processed.
            schema: JSON schema (as a string) the answer must match.

        Returns:
            The complete prompt text.
        """
        meta_str = json.dumps({k: v for k, v in existing_meta.items() if v}, indent=2)
        limit = self.MAX_TEXT_SAMPLE_CHARS
        if len(text_sample) > limit:
            text_sample = text_sample[:limit] + "\n[...truncated]"

        return f"""Analyze this PDF and return ONLY a JSON object matching the schema below.
No explanation, no markdown, just the raw JSON.

## JSON Schema
{schema}

## Filename
{filename}

## Existing PDF metadata
{meta_str}

## Text from first pages
{text_sample}

## Field guidelines
- doc_type: one of book, novel, textbook, paper, class-notes, exercises,
guide-theory, guide-practice
- language: ISO 639-1 code (e.g. "en", "es")
- tags: relevant topics and keywords
- summary: 2-3 sentence description
- table_of_contents: chapter titles if visible, otherwise null
- subject: broad knowledge area (e.g. "Mathematics", "Computer Science")
- Use null for fields that cannot be determined
- Always provide title and author — infer from content if needed
- IMPORTANT: If year, publisher, or ISBN are missing from the PDF text,
use web search to find the correct publication year, publisher, and ISBN.
Search for the book title and author to find this information."""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_ai_backend(name: str, model: str = "sonnet") -> AIBackend:
    """Instantiate the AI backend registered under ``name``.

    Args:
        name: Registry key of the backend (currently only ``"claude"``).
        model: Model name forwarded to the backend constructor.

    Returns:
        A backend instance configured with ``model``.

    Raises:
        ValueError: If ``name`` is not a registered backend.
    """
    registry = {"claude": ClaudeCLIBackend}
    if name not in registry:
        raise ValueError(f"Unknown AI backend: {name}. Available: {', '.join(registry)}")
    backend_cls = registry[name]
    return backend_cls(model=model)
|