paper-manager 0.12.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -2
- package/dist/commands/literature.js +4 -0
- package/dist/dep/index.js +19 -0
- package/dist/extractor/markdown.js +1 -19
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -8,10 +8,11 @@ A CLI tool for managing academic papers with knowledge base and vector search su
|
|
|
8
8
|
## Features
|
|
9
9
|
|
|
10
10
|
- **Semantic search** — FAISS vector indexing with configurable embedding models, query your papers by meaning rather than keywords
|
|
11
|
+
- **Add papers by DOI** — automatically download Open Access PDFs via [Unpaywall API](https://unpaywall.org/products/api) with `--doi` ([integration guide](paper-cli/unpaywall.md))
|
|
11
12
|
- **PDF metadata extraction** — automatically extracts title, author, keywords, DOI, and more from PDF files
|
|
12
13
|
- **DOI deduplication** — detects duplicate papers by DOI before adding, with `--force` override
|
|
13
14
|
- **Multi-format support** — import from PDF, TXT, MD, TEX, and other text-based formats
|
|
14
|
-
- **PDF-to-Markdown conversion** — optional high-quality conversion via [opendataloader-pdf](https://github.com/
|
|
15
|
+
- **PDF-to-Markdown conversion** — optional high-quality conversion via [opendataloader-pdf](https://github.com/opendataloader-project/opendataloader-pdf) with image extraction ([integration guide](paper-cli/opendataloader-pdf.md))
|
|
15
16
|
- **Dual-scope data model** — user-level (`~/.paper-manager/`) for global collections and project-level (`./.paper-manager/`) for project-specific papers, with automatic scope resolution
|
|
16
17
|
- **DOI-to-BibTeX** — convert DOI to BibTeX citation in one command
|
|
17
18
|
- **Machine-readable output** — `--json` and `--jq` flags on all read commands for scripting and automation
|
|
@@ -49,6 +50,10 @@ paper kb create my-papers -d "My research papers"
|
|
|
49
50
|
# Add a paper (supports PDF, TXT, MD, TEX, etc.)
|
|
50
51
|
paper lit add <knowledge-base-id> ./paper.pdf
|
|
51
52
|
|
|
53
|
+
# Or add an Open Access paper by DOI
|
|
54
|
+
paper config set email '"you@example.com"' --user # one-time setup for Unpaywall API
|
|
55
|
+
paper lit add <knowledge-base-id> --doi 10.1038/nature12373
|
|
56
|
+
|
|
52
57
|
# Search across papers
|
|
53
58
|
paper kb query <knowledge-base-id> "attention mechanism"
|
|
54
59
|
```
|
|
@@ -78,7 +83,8 @@ paper kb query <id> <query-text> [--json] [--jq <expr>] # Query a knowledge bas
|
|
|
78
83
|
### Literature (`paper lit`)
|
|
79
84
|
|
|
80
85
|
```bash
|
|
81
|
-
paper lit add <kb-id> <file-path> [-f] # Add a literature (auto-extracts PDF metadata
|
|
86
|
+
paper lit add <kb-id> <file-path> [-f] # Add a literature from file (auto-extracts PDF metadata)
|
|
87
|
+
paper lit add <kb-id> --doi <doi> [-f] # Add an Open Access paper by DOI via Unpaywall
|
|
82
88
|
paper lit remove <kb-id> <id> # Remove a literature
|
|
83
89
|
paper lit update <kb-id> <id> [opts] # Update literature metadata
|
|
84
90
|
paper lit list <kb-id> [--json] [--jq <expr>] # List literatures
|
|
@@ -96,6 +102,72 @@ paper util doi2bib <doi> # Convert a DOI to BibTeX citation
|
|
|
96
102
|
paper util pdf-meta <file> [--json] [--jq <expr>] # Extract metadata from a PDF file
|
|
97
103
|
```
|
|
98
104
|
|
|
105
|
+
## Usage Scenarios
|
|
106
|
+
|
|
107
|
+
### Building a paper collection for a research project
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
# Initialize a project-scoped data directory (version-controllable)
|
|
111
|
+
paper config init
|
|
112
|
+
|
|
113
|
+
# Create a knowledge base for your topic
|
|
114
|
+
paper kb create "llm-agents" -d "Papers on LLM-based autonomous agents"
|
|
115
|
+
# Output: Knowledge base created: 9f3a...
|
|
116
|
+
|
|
117
|
+
# Add papers — by file or by DOI
|
|
118
|
+
paper lit add 9f3a ./downloaded-paper.pdf
|
|
119
|
+
paper lit add 9f3a --doi 10.48550/arXiv.2305.10601
|
|
120
|
+
|
|
121
|
+
# Ask questions across all your papers
|
|
122
|
+
paper kb query 9f3a "how do agents handle long-term memory?"
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Quick-adding Open Access papers you find online
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# Spot a DOI in a reference list? One command to ingest it.
|
|
129
|
+
paper lit add <kb-id> --doi 10.1038/nature12373
|
|
130
|
+
|
|
131
|
+
# Unpaywall checks OA status, downloads the PDF, extracts metadata,
|
|
132
|
+
# and builds a vector index — all in one step.
|
|
133
|
+
# If the paper is not OA, you'll get a clear error with instructions.
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Generating BibTeX citations for your bibliography
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
paper util doi2bib 10.1145/3586183.3606763
|
|
140
|
+
# @inproceedings{...}
|
|
141
|
+
|
|
142
|
+
# Combine with jq to batch-extract DOIs from your knowledge base
|
|
143
|
+
paper lit list <kb-id> --jq '.[].doi | select(. != null)'
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Scripting with JSON output
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Export all papers in a knowledge base as JSON
|
|
150
|
+
paper lit list <kb-id> --json > papers.json
|
|
151
|
+
|
|
152
|
+
# Filter with jq expressions inline
|
|
153
|
+
paper lit list <kb-id> --jq '[.[] | {title, doi, author}]'
|
|
154
|
+
|
|
155
|
+
# Find papers by a specific author
|
|
156
|
+
paper lit search <kb-id> -a "Vaswani" --json
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Annotating papers for a literature review
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Attach notes to track your reading progress
|
|
163
|
+
paper lit note set <lit-id> status "read"
|
|
164
|
+
paper lit note set <lit-id> relevance "high"
|
|
165
|
+
paper lit note set <lit-id> summary "Proposes transformer architecture..."
|
|
166
|
+
|
|
167
|
+
# Review all notes
|
|
168
|
+
paper lit note list <lit-id>
|
|
169
|
+
```
|
|
170
|
+
|
|
99
171
|
## Configuration
|
|
100
172
|
|
|
101
173
|
See [Configuration Reference](docs/configuration.md) for all available config fields and detailed usage.
|
|
package/dist/commands/literature.js
CHANGED
|
@@ -9,6 +9,7 @@ import * as projectKb from "../db/project/knowledge-bases.js";
|
|
|
9
9
|
import * as projectLit from "../db/project/literatures.js";
|
|
10
10
|
import * as userKb from "../db/user/knowledge-bases.js";
|
|
11
11
|
import * as userLit from "../db/user/literatures.js";
|
|
12
|
+
import { isHybridBackendAvailable } from "../dep/index.js";
|
|
12
13
|
import { extractContent, extractPdfMetadata } from "../extractor/index.js";
|
|
13
14
|
import { convertPdfToMarkdown, isOpendataLoaderAvailable, removeImageDir, saveConvertResult, } from "../extractor/markdown.js";
|
|
14
15
|
import { log } from "../logger.js";
|
|
@@ -217,6 +218,9 @@ export function createLiteratureCommand() {
|
|
|
217
218
|
fs.copyFileSync(absolutePath, path.join(filesDir, `${literature.id}${ext}`));
|
|
218
219
|
// Convert PDF to Markdown if opendataloader is available
|
|
219
220
|
if (isPdf && (await isOpendataLoaderAvailable())) {
|
|
221
|
+
if (!(await isHybridBackendAvailable())) {
|
|
222
|
+
log.step("Hybrid backend (localhost:5002) is not running; using basic conversion. Start the backend for better quality.");
|
|
223
|
+
}
|
|
220
224
|
const result = await convertPdfToMarkdown(absolutePath);
|
|
221
225
|
if (result) {
|
|
222
226
|
saveConvertResult(filesDir, literature.id, result);
|
|
package/dist/dep/index.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { request } from "node:http";
|
|
2
|
+
const HYBRID_BACKEND_URL = "http://localhost:5002";
|
|
3
|
+
const HYBRID_PROBE_TIMEOUT_MS = 1500;
|
|
4
|
+
/** Check if the opendataloader hybrid backend is reachable at localhost:5002. */
|
|
5
|
+
export function isHybridBackendAvailable() {
|
|
6
|
+
return new Promise((resolve) => {
|
|
7
|
+
const req = request(HYBRID_BACKEND_URL, { method: "GET", timeout: HYBRID_PROBE_TIMEOUT_MS }, (res) => {
|
|
8
|
+
// Any response means the server is running
|
|
9
|
+
res.resume();
|
|
10
|
+
resolve(true);
|
|
11
|
+
});
|
|
12
|
+
req.on("error", () => resolve(false));
|
|
13
|
+
req.on("timeout", () => {
|
|
14
|
+
req.destroy();
|
|
15
|
+
resolve(false);
|
|
16
|
+
});
|
|
17
|
+
req.end();
|
|
18
|
+
});
|
|
19
|
+
}
|
|
package/dist/extractor/markdown.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { execFile } from "node:child_process";
|
|
2
2
|
import { existsSync, mkdirSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { request } from "node:http";
|
|
4
3
|
import { tmpdir } from "node:os";
|
|
5
4
|
import * as path from "node:path";
|
|
5
|
+
import { isHybridBackendAvailable } from "../dep/index.js";
|
|
6
6
|
/**
|
|
7
7
|
* Check whether opendataloader-pdf is available (package installed + Java runtime).
|
|
8
8
|
* Result is cached after the first call.
|
|
@@ -124,24 +124,6 @@ export async function checkOpendataLoaderStatus() {
|
|
|
124
124
|
hybridBackendAvailable,
|
|
125
125
|
};
|
|
126
126
|
}
|
|
127
|
-
const HYBRID_BACKEND_URL = "http://localhost:5002";
|
|
128
|
-
const HYBRID_PROBE_TIMEOUT_MS = 1500;
|
|
129
|
-
/** Check if the opendataloader hybrid backend is reachable at localhost:5002. */
|
|
130
|
-
function isHybridBackendAvailable() {
|
|
131
|
-
return new Promise((resolve) => {
|
|
132
|
-
const req = request(HYBRID_BACKEND_URL, { method: "GET", timeout: HYBRID_PROBE_TIMEOUT_MS }, (res) => {
|
|
133
|
-
// Any response means the server is running
|
|
134
|
-
res.resume();
|
|
135
|
-
resolve(true);
|
|
136
|
-
});
|
|
137
|
-
req.on("error", () => resolve(false));
|
|
138
|
-
req.on("timeout", () => {
|
|
139
|
-
req.destroy();
|
|
140
|
-
resolve(false);
|
|
141
|
-
});
|
|
142
|
-
req.end();
|
|
143
|
-
});
|
|
144
|
-
}
|
|
145
127
|
// execFile is safe — arguments are passed as an array, no shell interpolation.
|
|
146
128
|
function getJavaVersion() {
|
|
147
129
|
return new Promise((resolve) => {
|