paper-manager 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,10 +8,11 @@ A CLI tool for managing academic papers with knowledge base and vector search su
8
8
  ## Features
9
9
 
10
10
  - **Semantic search** — FAISS vector indexing with configurable embedding models, query your papers by meaning rather than keywords
11
+ - **Add papers by DOI** — automatically download Open Access PDFs via the [Unpaywall API](https://unpaywall.org/products/api) with `--doi` ([integration guide](paper-cli/unpaywall.md))
11
12
  - **PDF metadata extraction** — automatically extracts title, author, keywords, DOI, and more from PDF files
12
13
  - **DOI deduplication** — detects duplicate papers by DOI before adding, with `--force` override
13
14
  - **Multi-format support** — import from PDF, TXT, MD, TEX, and other text-based formats
14
- - **PDF-to-Markdown conversion** — optional high-quality conversion via [opendataloader-pdf](https://github.com/nicobailon/opendataloader-pdf) with image extraction
15
+ - **PDF-to-Markdown conversion** — optional high-quality conversion via [opendataloader-pdf](https://github.com/opendataloader-project/opendataloader-pdf) with image extraction ([integration guide](paper-cli/opendataloader-pdf.md))
15
16
  - **Dual-scope data model** — user-level (`~/.paper-manager/`) for global collections and project-level (`./.paper-manager/`) for project-specific papers, with automatic scope resolution
16
17
  - **DOI-to-BibTeX** — convert DOI to BibTeX citation in one command
17
18
  - **Machine-readable output** — `--json` and `--jq` flags on all read commands for scripting and automation
@@ -49,6 +50,10 @@ paper kb create my-papers -d "My research papers"
49
50
  # Add a paper (supports PDF, TXT, MD, TEX, etc.)
50
51
  paper lit add <knowledge-base-id> ./paper.pdf
51
52
 
53
+ # Or add an Open Access paper by DOI
54
+ paper config set email '"you@example.com"' --user # one-time setup for the Unpaywall API
55
+ paper lit add <knowledge-base-id> --doi 10.1038/nature12373
56
+
52
57
  # Search across papers
53
58
  paper kb query <knowledge-base-id> "attention mechanism"
54
59
  ```
@@ -78,7 +83,8 @@ paper kb query <id> <query-text> [--json] [--jq <expr>] # Query a knowledge bas
78
83
  ### Literature (`paper lit`)
79
84
 
80
85
  ```bash
81
- paper lit add <kb-id> <file-path> [-f] # Add a literature (auto-extracts PDF metadata, rejects duplicate DOI)
86
+ paper lit add <kb-id> <file-path> [-f] # Add a literature from file (auto-extracts PDF metadata)
87
+ paper lit add <kb-id> --doi <doi> [-f] # Add an Open Access paper by DOI via Unpaywall
82
88
  paper lit remove <kb-id> <id> # Remove a literature
83
89
  paper lit update <kb-id> <id> [opts] # Update literature metadata
84
90
  paper lit list <kb-id> [--json] [--jq <expr>] # List literatures
@@ -96,6 +102,72 @@ paper util doi2bib <doi> # Convert a DOI to BibTeX citation
96
102
  paper util pdf-meta <file> [--json] [--jq <expr>] # Extract metadata from a PDF file
97
103
  ```
98
104
 
105
+ ## Usage Scenarios
106
+
107
+ ### Building a paper collection for a research project
108
+
109
+ ```bash
110
+ # Initialize a project-scoped data directory (version-controllable)
111
+ paper config init
112
+
113
+ # Create a knowledge base for your topic
114
+ paper kb create "llm-agents" -d "Papers on LLM-based autonomous agents"
115
+ # Output: Knowledge base created: 9f3a...
116
+
117
+ # Add papers — by file or by DOI
118
+ paper lit add 9f3a ./downloaded-paper.pdf
119
+ paper lit add 9f3a --doi 10.48550/arXiv.2305.10601
120
+
121
+ # Ask questions across all your papers
122
+ paper kb query 9f3a "how do agents handle long-term memory?"
123
+ ```
124
+
125
+ ### Quick-adding Open Access papers you find online
126
+
127
+ ```bash
128
+ # Spot a DOI in a reference list? One command to ingest it.
129
+ paper lit add <kb-id> --doi 10.1038/nature12373
130
+
131
+ # Unpaywall checks OA status, downloads the PDF, extracts metadata,
132
+ # and builds a vector index — all in one step.
133
+ # If the paper is not OA, you'll get a clear error with instructions.
134
+ ```
135
+
136
+ ### Generating BibTeX citations for your bibliography
137
+
138
+ ```bash
139
+ paper util doi2bib 10.1145/3586183.3606763
140
+ # @inproceedings{...}
141
+
142
+ # Combine with jq to batch-extract DOIs from your knowledge base
143
+ paper lit list <kb-id> --jq '.[].doi | select(. != null)'
144
+ ```
145
+
146
+ ### Scripting with JSON output
147
+
148
+ ```bash
149
+ # Export all papers in a knowledge base as JSON
150
+ paper lit list <kb-id> --json > papers.json
151
+
152
+ # Filter with jq expressions inline
153
+ paper lit list <kb-id> --jq '[.[] | {title, doi, author}]'
154
+
155
+ # Find papers by a specific author
156
+ paper lit search <kb-id> -a "Vaswani" --json
157
+ ```
158
+
159
+ ### Annotating papers for a literature review
160
+
161
+ ```bash
162
+ # Attach notes to track your reading progress
163
+ paper lit note set <lit-id> status "read"
164
+ paper lit note set <lit-id> relevance "high"
165
+ paper lit note set <lit-id> summary "Proposes transformer architecture..."
166
+
167
+ # Review all notes
168
+ paper lit note list <lit-id>
169
+ ```
170
+
99
171
  ## Configuration
100
172
 
101
173
  See [Configuration Reference](docs/configuration.md) for all available config fields and detailed usage.
@@ -9,6 +9,7 @@ import * as projectKb from "../db/project/knowledge-bases.js";
9
9
  import * as projectLit from "../db/project/literatures.js";
10
10
  import * as userKb from "../db/user/knowledge-bases.js";
11
11
  import * as userLit from "../db/user/literatures.js";
12
+ import { isHybridBackendAvailable } from "../dep/index.js";
12
13
  import { extractContent, extractPdfMetadata } from "../extractor/index.js";
13
14
  import { convertPdfToMarkdown, isOpendataLoaderAvailable, removeImageDir, saveConvertResult, } from "../extractor/markdown.js";
14
15
  import { log } from "../logger.js";
@@ -217,6 +218,9 @@ export function createLiteratureCommand() {
217
218
  fs.copyFileSync(absolutePath, path.join(filesDir, `${literature.id}${ext}`));
218
219
  // Convert PDF to Markdown if opendataloader is available
219
220
  if (isPdf && (await isOpendataLoaderAvailable())) {
221
+ if (!(await isHybridBackendAvailable())) {
222
+ log.step("Hybrid backend (localhost:5002) is not running; using basic conversion. Start the backend for better quality.");
223
+ }
220
224
  const result = await convertPdfToMarkdown(absolutePath);
221
225
  if (result) {
222
226
  saveConvertResult(filesDir, literature.id, result);
@@ -0,0 +1,19 @@
1
+ import { request } from "node:http";
2
+ const HYBRID_BACKEND_URL = "http://localhost:5002";
3
+ const HYBRID_PROBE_TIMEOUT_MS = 1500;
4
+ /** Check if the opendataloader hybrid backend is reachable at localhost:5002. */
5
+ export function isHybridBackendAvailable() {
6
+ return new Promise((resolve) => {
7
+ const req = request(HYBRID_BACKEND_URL, { method: "GET", timeout: HYBRID_PROBE_TIMEOUT_MS }, (res) => {
8
+ // Any response means the server is running
9
+ res.resume();
10
+ resolve(true);
11
+ });
12
+ req.on("error", () => resolve(false));
13
+ req.on("timeout", () => {
14
+ req.destroy();
15
+ resolve(false);
16
+ });
17
+ req.end();
18
+ });
19
+ }
@@ -1,8 +1,8 @@
1
1
  import { execFile } from "node:child_process";
2
2
  import { existsSync, mkdirSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
3
- import { request } from "node:http";
4
3
  import { tmpdir } from "node:os";
5
4
  import * as path from "node:path";
5
+ import { isHybridBackendAvailable } from "../dep/index.js";
6
6
  /**
7
7
  * Check whether opendataloader-pdf is available (package installed + Java runtime).
8
8
  * Result is cached after the first call.
@@ -124,24 +124,6 @@ export async function checkOpendataLoaderStatus() {
124
124
  hybridBackendAvailable,
125
125
  };
126
126
  }
127
- const HYBRID_BACKEND_URL = "http://localhost:5002";
128
- const HYBRID_PROBE_TIMEOUT_MS = 1500;
129
- /** Check if the opendataloader hybrid backend is reachable at localhost:5002. */
130
- function isHybridBackendAvailable() {
131
- return new Promise((resolve) => {
132
- const req = request(HYBRID_BACKEND_URL, { method: "GET", timeout: HYBRID_PROBE_TIMEOUT_MS }, (res) => {
133
- // Any response means the server is running
134
- res.resume();
135
- resolve(true);
136
- });
137
- req.on("error", () => resolve(false));
138
- req.on("timeout", () => {
139
- req.destroy();
140
- resolve(false);
141
- });
142
- req.end();
143
- });
144
- }
145
127
  // execFile is safe — arguments are passed as an array, no shell interpolation.
146
128
  function getJavaVersion() {
147
129
  return new Promise((resolve) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "paper-manager",
3
- "version": "0.12.0",
3
+ "version": "0.12.1",
4
4
  "description": "A paper management system.",
5
5
  "keywords": [],
6
6
  "homepage": "https://github.com/EurFelux/paper-manager",