paper-manager 0.8.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/embed.js +0 -1
- package/dist/ai/index.js +0 -1
- package/dist/ai/provider.js +0 -1
- package/dist/commands/config.js +0 -1
- package/dist/commands/dep.js +48 -0
- package/dist/commands/knowledge-base.js +25 -3
- package/dist/commands/literature.js +98 -19
- package/dist/commands/util.js +0 -1
- package/dist/config/index.js +0 -1
- package/dist/config/init.js +0 -1
- package/dist/db/index.js +0 -1
- package/dist/db/operations/knowledge-bases.js +0 -1
- package/dist/db/operations/literatures.js +0 -1
- package/dist/db/project/knowledge-bases.js +0 -1
- package/dist/db/project/literatures.js +0 -1
- package/dist/db/schema.js +0 -1
- package/dist/db/test-utils.js +0 -1
- package/dist/db/user/knowledge-bases.js +0 -1
- package/dist/db/user/literatures.js +0 -1
- package/dist/extractor/index.js +0 -1
- package/dist/extractor/markdown.js +88 -0
- package/dist/extractor/pdf.js +23 -29
- package/dist/extractor/text.js +2 -4
- package/dist/index.js +2 -1
- package/dist/lifecycle.js +0 -1
- package/dist/logger.js +0 -1
- package/dist/migrations.js +0 -1
- package/dist/text-splitter.js +56 -0
- package/dist/types/index.js +0 -1
- package/dist/vector-store/embeddings.js +0 -1
- package/dist/vector-store/index.js +76 -12
- package/package.json +5 -6
- package/dist/ai/embed.d.ts +0 -3
- package/dist/ai/embed.js.map +0 -1
- package/dist/ai/embed.test.d.ts +0 -1
- package/dist/ai/embed.test.js +0 -93
- package/dist/ai/embed.test.js.map +0 -1
- package/dist/ai/index.d.ts +0 -2
- package/dist/ai/index.js.map +0 -1
- package/dist/ai/provider.d.ts +0 -3
- package/dist/ai/provider.js.map +0 -1
- package/dist/commands/config.d.ts +0 -2
- package/dist/commands/config.js.map +0 -1
- package/dist/commands/knowledge-base.d.ts +0 -2
- package/dist/commands/knowledge-base.js.map +0 -1
- package/dist/commands/literature.d.ts +0 -2
- package/dist/commands/literature.js.map +0 -1
- package/dist/commands/util.d.ts +0 -2
- package/dist/commands/util.js.map +0 -1
- package/dist/config/index.d.ts +0 -39
- package/dist/config/index.js.map +0 -1
- package/dist/config/index.test.d.ts +0 -1
- package/dist/config/index.test.js +0 -143
- package/dist/config/index.test.js.map +0 -1
- package/dist/config/init.d.ts +0 -10
- package/dist/config/init.js.map +0 -1
- package/dist/config/init.test.d.ts +0 -1
- package/dist/config/init.test.js +0 -61
- package/dist/config/init.test.js.map +0 -1
- package/dist/db/index.d.ts +0 -7
- package/dist/db/index.js.map +0 -1
- package/dist/db/index.test.d.ts +0 -1
- package/dist/db/index.test.js +0 -32
- package/dist/db/index.test.js.map +0 -1
- package/dist/db/operations/knowledge-bases.d.ts +0 -11
- package/dist/db/operations/knowledge-bases.js.map +0 -1
- package/dist/db/operations/knowledge-bases.test.d.ts +0 -1
- package/dist/db/operations/knowledge-bases.test.js +0 -71
- package/dist/db/operations/knowledge-bases.test.js.map +0 -1
- package/dist/db/operations/literatures.d.ts +0 -16
- package/dist/db/operations/literatures.js.map +0 -1
- package/dist/db/operations/literatures.test.d.ts +0 -1
- package/dist/db/operations/literatures.test.js +0 -156
- package/dist/db/operations/literatures.test.js.map +0 -1
- package/dist/db/project/knowledge-bases.d.ts +0 -10
- package/dist/db/project/knowledge-bases.js.map +0 -1
- package/dist/db/project/literatures.d.ts +0 -10
- package/dist/db/project/literatures.js.map +0 -1
- package/dist/db/schema.d.ts +0 -372
- package/dist/db/schema.js.map +0 -1
- package/dist/db/test-utils.d.ts +0 -6
- package/dist/db/test-utils.js.map +0 -1
- package/dist/db/user/knowledge-bases.d.ts +0 -10
- package/dist/db/user/knowledge-bases.js.map +0 -1
- package/dist/db/user/literatures.d.ts +0 -10
- package/dist/db/user/literatures.js.map +0 -1
- package/dist/extractor/index.d.ts +0 -6
- package/dist/extractor/index.js.map +0 -1
- package/dist/extractor/pdf.d.ts +0 -13
- package/dist/extractor/pdf.js.map +0 -1
- package/dist/extractor/pdf.test.d.ts +0 -1
- package/dist/extractor/pdf.test.js +0 -106
- package/dist/extractor/pdf.test.js.map +0 -1
- package/dist/extractor/text.d.ts +0 -2
- package/dist/extractor/text.js.map +0 -1
- package/dist/index.d.ts +0 -2
- package/dist/index.js.map +0 -1
- package/dist/lifecycle.d.ts +0 -1
- package/dist/lifecycle.js.map +0 -1
- package/dist/logger.d.ts +0 -24
- package/dist/logger.js.map +0 -1
- package/dist/migrations.d.ts +0 -5
- package/dist/migrations.js.map +0 -1
- package/dist/pdf/extractor.d.ts +0 -2
- package/dist/pdf/extractor.js +0 -18
- package/dist/pdf/extractor.js.map +0 -1
- package/dist/types/index.d.ts +0 -61
- package/dist/types/index.js.map +0 -1
- package/dist/types/index.test.d.ts +0 -1
- package/dist/types/index.test.js +0 -100
- package/dist/types/index.test.js.map +0 -1
- package/dist/vector-store/embeddings.d.ts +0 -8
- package/dist/vector-store/embeddings.js.map +0 -1
- package/dist/vector-store/index.d.ts +0 -6
- package/dist/vector-store/index.js.map +0 -1
package/dist/ai/embed.js
CHANGED
package/dist/ai/index.js
CHANGED
package/dist/ai/provider.js
CHANGED
package/dist/commands/config.js
CHANGED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import chalk from "chalk";
|
|
2
|
+
import { Command } from "commander";
|
|
3
|
+
import { checkOpendataLoaderStatus } from "../extractor/markdown.js";
|
|
4
|
+
import { log } from "../logger.js";
|
|
5
|
+
const KNOWN_DEPS = new Set(["opendataloader"]);
|
|
6
|
+
/**
 * Build the `dep` command group used to inspect external dependencies.
 *
 * Registers a `check <dep>` subcommand. A name that is not in KNOWN_DEPS is
 * reported together with the list of known names and the process exits with
 * status 1; otherwise the matching availability check is run.
 *
 * @returns {Command} The configured `dep` command.
 */
export function createDepCommand() {
    const depCommand = new Command("dep").description("Manage external dependencies");
    // Handler for `dep check <dep>`; receives the positional dependency name.
    const runCheck = async (depName) => {
        const isKnown = KNOWN_DEPS.has(depName);
        if (!isKnown) {
            log.error(`Unknown dependency: ${depName}`);
            log.step(`Available: ${[...KNOWN_DEPS].join(", ")}`);
            process.exit(1);
        }
        if (depName === "opendataloader") {
            await checkOpendataLoader();
        }
    };
    depCommand
        .command("check <dep>")
        .description("Check if an external dependency is available")
        .action(runCheck);
    return depCommand;
}
|
|
23
|
+
/**
 * Print a status report for the opendataloader-pdf integration.
 *
 * Shows one line for the `@opendataloader/pdf` package and one for the Java
 * runtime, followed by either a success message or install hints for whatever
 * is missing.
 *
 * When something is missing, `process.exitCode` is set to 1 so that scripts
 * calling `dep check` can detect the failure. The exit code is set rather than
 * calling `process.exit()` so pending log output is not cut short.
 *
 * @returns {Promise<void>}
 */
async function checkOpendataLoader() {
    log.info("Checking opendataloader-pdf...");
    const status = await checkOpendataLoaderStatus();
    const mark = (ok) => (ok ? chalk.green("✔") : chalk.red("✖"));
    const pkgIcon = mark(status.packageInstalled);
    const javaIcon = mark(status.javaAvailable);
    log.plain(` ${pkgIcon} @opendataloader/pdf package`);
    if (status.javaAvailable) {
        log.plain(` ${javaIcon} Java runtime (${status.javaVersion})`);
    }
    else {
        log.plain(` ${javaIcon} Java runtime (not found)`);
    }
    log.newline();
    if (status.packageInstalled && status.javaAvailable) {
        log.success("opendataloader-pdf is ready.");
        return;
    }
    log.error("opendataloader-pdf is not available.");
    if (!status.packageInstalled) {
        log.step("Install: pnpm add @opendataloader/pdf");
    }
    if (!status.javaAvailable) {
        log.step("Install Java 11+: https://adoptium.net/");
    }
    // Signal the failed check to the calling shell without aborting output.
    process.exitCode = 1;
}
|
|
@@ -57,6 +57,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
57
57
|
.description("List knowledge bases")
|
|
58
58
|
.option("--user", "List user knowledge bases only")
|
|
59
59
|
.option("--all", "List all knowledge bases (default)")
|
|
60
|
+
.option("--json", "Output as JSON")
|
|
60
61
|
.action((options) => {
|
|
61
62
|
let results = [];
|
|
62
63
|
if (options.user) {
|
|
@@ -68,7 +69,16 @@ export function createKnowledgeBaseCommand() {
|
|
|
68
69
|
results = [...projectKbs, ...userKbs];
|
|
69
70
|
}
|
|
70
71
|
if (results.length === 0) {
|
|
71
|
-
|
|
72
|
+
if (options.json) {
|
|
73
|
+
log.plain("[]");
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
log.info("No knowledge bases found.");
|
|
77
|
+
}
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
if (options.json) {
|
|
81
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
72
82
|
return;
|
|
73
83
|
}
|
|
74
84
|
for (const kb of results) {
|
|
@@ -151,6 +161,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
151
161
|
kb.command("query <id> <query-text>")
|
|
152
162
|
.description("Query a knowledge base")
|
|
153
163
|
.option("-k, --top-k <number>", "Number of results", "5")
|
|
164
|
+
.option("--json", "Output as JSON")
|
|
154
165
|
.action(async (id, queryText, options) => {
|
|
155
166
|
const resolved = resolveKnowledgeBase(id);
|
|
156
167
|
if (!resolved) {
|
|
@@ -168,7 +179,19 @@ export function createKnowledgeBaseCommand() {
|
|
|
168
179
|
const k = parseInt(options.topK, 10);
|
|
169
180
|
const results = await queryVectorStore(modelConfig, vectorDir, queryText, k);
|
|
170
181
|
if (results.length === 0) {
|
|
171
|
-
|
|
182
|
+
if (options.json) {
|
|
183
|
+
log.plain("[]");
|
|
184
|
+
}
|
|
185
|
+
else {
|
|
186
|
+
log.info("No results found.");
|
|
187
|
+
}
|
|
188
|
+
return;
|
|
189
|
+
}
|
|
190
|
+
if (options.json) {
|
|
191
|
+
const output = results
|
|
192
|
+
.filter((doc) => doc != null)
|
|
193
|
+
.map((doc) => ({ pageContent: doc.pageContent, metadata: doc.metadata }));
|
|
194
|
+
log.plain(JSON.stringify(output, null, 2));
|
|
172
195
|
return;
|
|
173
196
|
}
|
|
174
197
|
for (let i = 0; i < results.length; i++) {
|
|
@@ -185,4 +208,3 @@ export function createKnowledgeBaseCommand() {
|
|
|
185
208
|
});
|
|
186
209
|
return kb;
|
|
187
210
|
}
|
|
188
|
-
//# sourceMappingURL=knowledge-base.js.map
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import * as path from "node:path";
|
|
3
|
-
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
|
|
4
3
|
import chalk from "chalk";
|
|
5
4
|
import cliProgress from "cli-progress";
|
|
6
5
|
import { Command } from "commander";
|
|
@@ -10,8 +9,10 @@ import * as projectLit from "../db/project/literatures.js";
|
|
|
10
9
|
import * as userKb from "../db/user/knowledge-bases.js";
|
|
11
10
|
import * as userLit from "../db/user/literatures.js";
|
|
12
11
|
import { extractContent, extractPdfMetadata } from "../extractor/index.js";
|
|
12
|
+
import { convertPdfToMarkdown, isOpendataLoaderAvailable } from "../extractor/markdown.js";
|
|
13
13
|
import { log } from "../logger.js";
|
|
14
|
-
import {
|
|
14
|
+
import { splitDocuments } from "../text-splitter.js";
|
|
15
|
+
import { addDocuments, createVectorStore } from "../vector-store/index.js";
|
|
15
16
|
function resolveKnowledgeBase(id) {
|
|
16
17
|
const pkb = projectKb.getKnowledgeBase(id);
|
|
17
18
|
if (pkb)
|
|
@@ -94,13 +95,17 @@ export function createLiteratureCommand() {
|
|
|
94
95
|
const ext = path.extname(litPath);
|
|
95
96
|
fs.mkdirSync(filesDir, { recursive: true });
|
|
96
97
|
fs.copyFileSync(absolutePath, path.join(filesDir, `${literature.id}${ext}`));
|
|
98
|
+
// Convert PDF to Markdown if opendataloader is available
|
|
99
|
+
if (isPdf && (await isOpendataLoaderAvailable())) {
|
|
100
|
+
const markdown = await convertPdfToMarkdown(absolutePath);
|
|
101
|
+
if (markdown) {
|
|
102
|
+
fs.writeFileSync(path.join(filesDir, `${literature.id}.md`), markdown, "utf-8");
|
|
103
|
+
log.step("Converted to Markdown via opendataloader-pdf.");
|
|
104
|
+
}
|
|
105
|
+
}
|
|
97
106
|
// Split text and add to vector store
|
|
98
107
|
log.info("Splitting text...");
|
|
99
|
-
const
|
|
100
|
-
chunkSize: 1000,
|
|
101
|
-
chunkOverlap: 200,
|
|
102
|
-
});
|
|
103
|
-
const splitDocs = await splitter.splitDocuments(docs);
|
|
108
|
+
const splitDocs = splitDocuments(docs, { chunkSize: 1000, chunkOverlap: 200 });
|
|
104
109
|
log.step(`Created ${String(splitDocs.length)} chunks.`);
|
|
105
110
|
// Add literature ID metadata to each chunk
|
|
106
111
|
for (const doc of splitDocs) {
|
|
@@ -115,12 +120,9 @@ export function createLiteratureCommand() {
|
|
|
115
120
|
const hasIndex = fs.existsSync(path.join(vectorDir, "faiss.index")) &&
|
|
116
121
|
fs.existsSync(path.join(vectorDir, "docstore.json"));
|
|
117
122
|
if (hasIndex) {
|
|
118
|
-
|
|
119
|
-
await store.addDocuments(splitDocs);
|
|
120
|
-
await store.save(vectorDir);
|
|
123
|
+
await addDocuments(splitDocs, modelConfig, vectorDir);
|
|
121
124
|
}
|
|
122
125
|
else {
|
|
123
|
-
fs.mkdirSync(vectorDir, { recursive: true });
|
|
124
126
|
await createVectorStore(splitDocs, modelConfig, vectorDir);
|
|
125
127
|
}
|
|
126
128
|
bar.update(splitDocs.length);
|
|
@@ -136,6 +138,40 @@ export function createLiteratureCommand() {
|
|
|
136
138
|
if (literature.keywords.length > 0)
|
|
137
139
|
log.label("Keywords:", literature.keywords.join(", "));
|
|
138
140
|
});
|
|
141
|
+
// ─── lit convert ────────────────────────────────────────────
|
|
142
|
+
lit
|
|
143
|
+
.command("convert <id>")
|
|
144
|
+
.description("Convert an existing literature PDF to Markdown via opendataloader-pdf")
|
|
145
|
+
.action(async (id) => {
|
|
146
|
+
const found = findLiteratureWithScope(id);
|
|
147
|
+
if (!found) {
|
|
148
|
+
log.error(`Literature not found: ${id}`);
|
|
149
|
+
process.exit(1);
|
|
150
|
+
}
|
|
151
|
+
const filesDir = getFilesDir(getBaseDir(found.scope));
|
|
152
|
+
const pdfFile = findLiteratureFiles(filesDir, id).find((f) => f.endsWith(".pdf"));
|
|
153
|
+
if (!pdfFile) {
|
|
154
|
+
log.error(`No PDF file found for literature: ${id}`);
|
|
155
|
+
process.exit(1);
|
|
156
|
+
}
|
|
157
|
+
const mdPath = path.join(filesDir, `${id}.md`);
|
|
158
|
+
if (fs.existsSync(mdPath)) {
|
|
159
|
+
log.error("Markdown file already exists. Delete it first to reconvert.");
|
|
160
|
+
process.exit(1);
|
|
161
|
+
}
|
|
162
|
+
if (!(await isOpendataLoaderAvailable())) {
|
|
163
|
+
log.error("opendataloader-pdf is not available. Run `paper dep check opendataloader` for details.");
|
|
164
|
+
process.exit(1);
|
|
165
|
+
}
|
|
166
|
+
log.info("Converting PDF to Markdown...");
|
|
167
|
+
const markdown = await convertPdfToMarkdown(path.join(filesDir, pdfFile));
|
|
168
|
+
if (!markdown) {
|
|
169
|
+
log.error("Conversion failed.");
|
|
170
|
+
process.exit(1);
|
|
171
|
+
}
|
|
172
|
+
fs.writeFileSync(mdPath, markdown, "utf-8");
|
|
173
|
+
log.success(`Markdown saved: ${id}.md`);
|
|
174
|
+
});
|
|
139
175
|
// ─── lit remove ────────────────────────────────────────────
|
|
140
176
|
lit
|
|
141
177
|
.command("remove <knowledge-base-id> <id>")
|
|
@@ -211,7 +247,8 @@ export function createLiteratureCommand() {
|
|
|
211
247
|
lit
|
|
212
248
|
.command("list <knowledge-base-id>")
|
|
213
249
|
.description("List literatures in a knowledge base")
|
|
214
|
-
.
|
|
250
|
+
.option("--json", "Output as JSON")
|
|
251
|
+
.action((kbId, options) => {
|
|
215
252
|
const resolved = resolveKnowledgeBase(kbId);
|
|
216
253
|
if (!resolved) {
|
|
217
254
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -220,14 +257,26 @@ export function createLiteratureCommand() {
|
|
|
220
257
|
const litOps = getLitOps(resolved.scope);
|
|
221
258
|
const literatures = litOps.listLiteratures(kbId);
|
|
222
259
|
if (literatures.length === 0) {
|
|
223
|
-
|
|
260
|
+
if (options.json) {
|
|
261
|
+
log.plain("[]");
|
|
262
|
+
}
|
|
263
|
+
else {
|
|
264
|
+
log.info("No literatures found.");
|
|
265
|
+
}
|
|
266
|
+
return;
|
|
267
|
+
}
|
|
268
|
+
if (options.json) {
|
|
269
|
+
log.plain(JSON.stringify(literatures, null, 2));
|
|
224
270
|
return;
|
|
225
271
|
}
|
|
272
|
+
const filesDir = getFilesDir(getBaseDir(resolved.scope));
|
|
226
273
|
for (const l of literatures) {
|
|
227
274
|
log.header(l.id);
|
|
228
275
|
log.label("Title:", l.title);
|
|
229
276
|
if (l.author)
|
|
230
277
|
log.label("Author:", l.author);
|
|
278
|
+
const files = findLiteratureFiles(filesDir, l.id);
|
|
279
|
+
log.label("Files:", files.length > 0 ? files.join(", ") : "(none)");
|
|
231
280
|
log.label("Created:", l.createdAt.toISOString());
|
|
232
281
|
log.newline();
|
|
233
282
|
}
|
|
@@ -241,6 +290,7 @@ export function createLiteratureCommand() {
|
|
|
241
290
|
.option("-a, --author <author>", "Author substring")
|
|
242
291
|
.option("-k, --keyword <keyword>", "Keyword substring")
|
|
243
292
|
.option("--doi <doi>", "DOI substring")
|
|
293
|
+
.option("--json", "Output as JSON")
|
|
244
294
|
.action((kbId, options) => {
|
|
245
295
|
const resolved = resolveKnowledgeBase(kbId);
|
|
246
296
|
if (!resolved) {
|
|
@@ -262,7 +312,16 @@ export function createLiteratureCommand() {
|
|
|
262
312
|
doi: options.doi,
|
|
263
313
|
});
|
|
264
314
|
if (results.length === 0) {
|
|
265
|
-
|
|
315
|
+
if (options.json) {
|
|
316
|
+
log.plain("[]");
|
|
317
|
+
}
|
|
318
|
+
else {
|
|
319
|
+
log.info("No literatures found.");
|
|
320
|
+
}
|
|
321
|
+
return;
|
|
322
|
+
}
|
|
323
|
+
if (options.json) {
|
|
324
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
266
325
|
return;
|
|
267
326
|
}
|
|
268
327
|
for (const l of results) {
|
|
@@ -283,7 +342,8 @@ export function createLiteratureCommand() {
|
|
|
283
342
|
lit
|
|
284
343
|
.command("show <knowledge-base-id> <id>")
|
|
285
344
|
.description("Show literature details")
|
|
286
|
-
.
|
|
345
|
+
.option("--json", "Output as JSON")
|
|
346
|
+
.action((kbId, id, options) => {
|
|
287
347
|
const resolved = resolveKnowledgeBase(kbId);
|
|
288
348
|
if (!resolved) {
|
|
289
349
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -295,19 +355,29 @@ export function createLiteratureCommand() {
|
|
|
295
355
|
log.error(`Literature not found: ${id}`);
|
|
296
356
|
process.exit(1);
|
|
297
357
|
}
|
|
298
|
-
|
|
358
|
+
if (options.json) {
|
|
359
|
+
log.plain(JSON.stringify(literature, null, 2));
|
|
360
|
+
return;
|
|
361
|
+
}
|
|
362
|
+
const filesDir = getFilesDir(getBaseDir(resolved.scope));
|
|
363
|
+
printLiterature(literature, filesDir);
|
|
299
364
|
});
|
|
300
365
|
// ─── lit note ──────────────────────────────────────────────
|
|
301
366
|
const note = lit.command("note").description("Manage literature notes");
|
|
302
367
|
note
|
|
303
368
|
.command("list <literature-id>")
|
|
304
369
|
.description("List all notes for a literature")
|
|
305
|
-
.
|
|
370
|
+
.option("--json", "Output as JSON")
|
|
371
|
+
.action((litId, options) => {
|
|
306
372
|
const literature = findLiterature(litId);
|
|
307
373
|
if (!literature) {
|
|
308
374
|
log.error(`Literature not found: ${litId}`);
|
|
309
375
|
process.exit(1);
|
|
310
376
|
}
|
|
377
|
+
if (options.json) {
|
|
378
|
+
log.plain(JSON.stringify(literature.notes, null, 2));
|
|
379
|
+
return;
|
|
380
|
+
}
|
|
311
381
|
const entries = Object.entries(literature.notes);
|
|
312
382
|
if (entries.length === 0) {
|
|
313
383
|
log.info("No notes found.");
|
|
@@ -363,7 +433,15 @@ function findLiteratureWithScope(id) {
|
|
|
363
433
|
return { literature: uLit, scope: "user" };
|
|
364
434
|
return null;
|
|
365
435
|
}
|
|
366
|
-
function
|
|
436
|
+
/**
 * List the file names in `filesDir` that belong to a literature entry.
 *
 * A file belongs to the entry when its name starts with `<id>.`, so every
 * stored extension for the same id is returned.
 *
 * @param {string} filesDir - Directory to scan.
 * @param {string} id - Literature id used as the file-name prefix.
 * @returns {string[]} Matching file names in sorted order; empty when the
 *   directory does not exist.
 */
function findLiteratureFiles(filesDir, id) {
    if (fs.existsSync(filesDir)) {
        const prefix = `${id}.`;
        const matches = [];
        for (const entry of fs.readdirSync(filesDir)) {
            if (entry.startsWith(prefix)) {
                matches.push(entry);
            }
        }
        return matches.sort();
    }
    return [];
}
|
|
444
|
+
function printLiterature(lit, filesDir) {
|
|
367
445
|
log.header(lit.id);
|
|
368
446
|
log.label("Title:", lit.title);
|
|
369
447
|
if (lit.titleTranslation)
|
|
@@ -382,6 +460,8 @@ function printLiterature(lit) {
|
|
|
382
460
|
log.label("DOI:", lit.doi);
|
|
383
461
|
if (lit.knowledgeBaseId)
|
|
384
462
|
log.label("Knowledge Base:", lit.knowledgeBaseId);
|
|
463
|
+
const files = findLiteratureFiles(filesDir, lit.id);
|
|
464
|
+
log.label("Files:", files.length > 0 ? files.join(", ") : "(none)");
|
|
385
465
|
log.label("Created:", lit.createdAt.toISOString());
|
|
386
466
|
log.label("Updated:", lit.updatedAt.toISOString());
|
|
387
467
|
const noteEntries = Object.entries(lit.notes);
|
|
@@ -392,4 +472,3 @@ function printLiterature(lit) {
|
|
|
392
472
|
}
|
|
393
473
|
}
|
|
394
474
|
}
|
|
395
|
-
//# sourceMappingURL=literature.js.map
|
package/dist/commands/util.js
CHANGED
package/dist/config/index.js
CHANGED
package/dist/config/init.js
CHANGED
package/dist/db/index.js
CHANGED
package/dist/db/schema.js
CHANGED
package/dist/db/test-utils.js
CHANGED
package/dist/extractor/index.js
CHANGED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { execFile } from "node:child_process";
|
|
2
|
+
import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import * as path from "node:path";
|
|
5
|
+
/**
 * Report whether opendataloader-pdf can be used (package installed and a
 * Java runtime present).
 *
 * The answer is computed on the first call and served from the module-level
 * cache on later calls.
 *
 * @returns {Promise<boolean>} True when both requirements are met.
 */
export async function isOpendataLoaderAvailable() {
    if (cachedAvailability === undefined) {
        cachedAvailability = await detectAvailability();
    }
    return cachedAvailability;
}
|
|
15
|
+
/**
 * Convert a PDF file to Markdown using opendataloader-pdf.
 *
 * The converter writes into a scratch directory under the OS temp dir. The
 * first `.md` file found there is read back, and the directory is always
 * removed afterwards.
 *
 * The scratch directory is created with `mkdtempSync`, so each call gets its
 * own uniquely named directory. A timestamp-based name could be shared by two
 * conversions started in the same millisecond, letting one call read or
 * delete the other's output.
 *
 * @param {string} pdfPath - Path of the PDF to convert.
 * @returns {Promise<string | null>} The Markdown content, or null when the
 *   package cannot be loaded, the conversion throws, or no `.md` file was
 *   produced.
 */
export async function convertPdfToMarkdown(pdfPath) {
    const outDir = mkdtempSync(path.join(tmpdir(), "odl-"));
    try {
        const { convert } = await import("@opendataloader/pdf");
        await convert([pdfPath], {
            outputDir: outDir,
            format: "markdown",
            quiet: true,
        });
        const mdFile = readdirSync(outDir).find((f) => f.endsWith(".md"));
        if (!mdFile)
            return null;
        return readFileSync(path.join(outDir, mdFile), "utf-8");
    }
    catch {
        // Best effort by contract: any failure is reported to the caller as null.
        return null;
    }
    finally {
        rmSync(outDir, { recursive: true, force: true });
    }
}
|
|
41
|
+
// ─── Internal ────────────────────────────────────────────
// Cached answer for isOpendataLoaderAvailable(); undefined until first computed.
let cachedAvailability;
/**
 * Run the package check and the Java check in parallel.
 *
 * @returns {Promise<boolean>} True only when both checks succeed.
 */
async function detectAvailability() {
    const outcomes = await Promise.all([checkPackage(), checkJava()]);
    return outcomes.every(Boolean);
}
|
|
47
|
+
/**
 * Probe whether `@opendataloader/pdf` can be imported.
 *
 * @returns {Promise<boolean>} True when the dynamic import succeeds, false
 *   when it rejects for any reason.
 */
async function checkPackage() {
    return import("@opendataloader/pdf").then(
        () => true,
        () => false,
    );
}
|
|
56
|
+
// execFile is safe — arguments are passed as an array, no shell interpolation.
/**
 * Probe for a Java runtime by running `java -version`.
 *
 * @returns {Promise<boolean>} True when the command completes without an
 *   error, false otherwise. Never rejects.
 */
async function checkJava() {
    return new Promise((resolve) => {
        const onDone = (error) => {
            resolve(error ? false : true);
        };
        execFile("java", ["-version"], onDone);
    });
}
|
|
64
|
+
/**
 * Detailed availability check for the `dep check` command.
 *
 * Runs the package probe and the Java version lookup in parallel.
 *
 * @returns {Promise<{packageInstalled: boolean, javaAvailable: boolean, javaVersion: (string|null)}>}
 *   `javaAvailable` is true exactly when a Java version was obtained.
 */
export async function checkOpendataLoaderStatus() {
    const pending = [checkPackage(), getJavaVersion()];
    const [packageInstalled, javaVersion] = await Promise.all(pending);
    const javaAvailable = javaVersion !== null;
    return { packageInstalled, javaAvailable, javaVersion };
}
|
|
75
|
+
// execFile is safe — arguments are passed as an array, no shell interpolation.
/**
 * Look up the installed Java version by running `java -version`.
 *
 * Null is returned only when the command itself fails. If the command
 * succeeds but its banner has no `version "<x>"` token, the string
 * "unknown" is returned, so callers that treat null as "Java not found"
 * still see the runtime as present. This matches the file's plain
 * availability probe, which only looks at the exit status.
 *
 * @returns {Promise<string | null>} The quoted version string, "unknown"
 *   when the banner could not be parsed, or null when `java` could not be
 *   run. Never rejects.
 */
function getJavaVersion() {
    return new Promise((resolve) => {
        execFile("java", ["-version"], (error, _stdout, stderr) => {
            if (error) {
                resolve(null);
                return;
            }
            // Java prints version to stderr
            const match = /version\s+"([^"]+)"/.exec(stderr);
            resolve(match?.[1] ?? "unknown");
        });
    });
}
|
package/dist/extractor/pdf.js
CHANGED
|
@@ -1,44 +1,43 @@
|
|
|
1
1
|
import { readFile } from "node:fs/promises";
|
|
2
|
-
import {
|
|
3
|
-
import { PDFParse } from "pdf-parse";
|
|
2
|
+
import { extractText, getMeta } from "unpdf";
|
|
4
3
|
export async function extractPdfContent(pdfPath) {
|
|
5
|
-
const data = await readFile(pdfPath);
|
|
6
|
-
const
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
return result.pages.map((page) => new Document({
|
|
10
|
-
pageContent: page.text,
|
|
4
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
5
|
+
const result = await extractText(data, { mergePages: false });
|
|
6
|
+
return result.text.map((pageText, i) => ({
|
|
7
|
+
pageContent: pageText,
|
|
11
8
|
metadata: {
|
|
12
9
|
source: pdfPath,
|
|
13
|
-
pdf: { totalPages: result.
|
|
14
|
-
loc: { pageNumber:
|
|
10
|
+
pdf: { totalPages: result.totalPages },
|
|
11
|
+
loc: { pageNumber: i + 1 },
|
|
15
12
|
},
|
|
16
13
|
}));
|
|
17
14
|
}
|
|
18
15
|
export async function extractPdfMetadata(pdfPath) {
|
|
19
|
-
const data = await readFile(pdfPath);
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
const
|
|
29
|
-
const creationDate = parsePdfDate(nonEmptyStringOrNull(info?.["CreationDate"]));
|
|
30
|
-
const modDate = parsePdfDate(nonEmptyStringOrNull(info?.["ModDate"]));
|
|
31
|
-
const rawKeywords = nonEmptyStringOrNull(info?.["Keywords"]);
|
|
16
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
17
|
+
const { info } = await getMeta(data);
|
|
18
|
+
const custom = getRecord(info["Custom"]);
|
|
19
|
+
const title = nonEmptyStringOrNull(info["Title"]);
|
|
20
|
+
const author = nonEmptyStringOrNull(info["Author"]);
|
|
21
|
+
const subject = nonEmptyStringOrNull(info["Subject"]);
|
|
22
|
+
const creator = nonEmptyStringOrNull(info["Creator"]);
|
|
23
|
+
const creationDate = parsePdfDate(nonEmptyStringOrNull(info["CreationDate"]));
|
|
24
|
+
const modDate = parsePdfDate(nonEmptyStringOrNull(info["ModDate"]));
|
|
25
|
+
const rawKeywords = nonEmptyStringOrNull(info["Keywords"]);
|
|
32
26
|
const keywords = rawKeywords
|
|
33
27
|
? rawKeywords
|
|
34
28
|
.split(/[,;]/)
|
|
35
29
|
.map((k) => k.trim())
|
|
36
30
|
.filter(Boolean)
|
|
37
31
|
: [];
|
|
38
|
-
// DOI can appear in Custom fields (case-insensitive lookup)
|
|
39
32
|
const doi = findCustomField(custom, "doi");
|
|
40
33
|
return { title, author, subject, keywords, doi, creator, creationDate, modDate };
|
|
41
34
|
}
|
|
35
|
+
/**
 * Return `value` when it is a non-null, non-array object.
 *
 * @param {unknown} value - Candidate value.
 * @returns {object | undefined} The same object reference, or undefined for
 *   primitives, null, arrays and functions.
 */
function getRecord(value) {
    const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
    return isRecord ? value : undefined;
}
|
|
42
41
|
function nonEmptyStringOrNull(value) {
|
|
43
42
|
if (typeof value === "string" && value.trim().length > 0) {
|
|
44
43
|
return value.trim();
|
|
@@ -53,7 +52,6 @@ function parsePdfDate(value) {
|
|
|
53
52
|
if (!value)
|
|
54
53
|
return null;
|
|
55
54
|
const cleaned = value.replace(/^D:/, "");
|
|
56
|
-
// Extract components: YYYY[MM[DD[HH[mm[SS]]]]]
|
|
57
55
|
const match = /^(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/.exec(cleaned);
|
|
58
56
|
if (!match)
|
|
59
57
|
return null;
|
|
@@ -66,9 +64,6 @@ function parsePdfDate(value) {
|
|
|
66
64
|
const date = new Date(`${year}-${month}-${day}T${hour}:${min}:${sec}Z`);
|
|
67
65
|
return Number.isNaN(date.getTime()) ? null : date;
|
|
68
66
|
}
|
|
69
|
-
/**
|
|
70
|
-
* Case-insensitive lookup in the Custom fields dictionary.
|
|
71
|
-
*/
|
|
72
67
|
function findCustomField(custom, key) {
|
|
73
68
|
if (!custom)
|
|
74
69
|
return null;
|
|
@@ -80,4 +75,3 @@ function findCustomField(custom, key) {
|
|
|
80
75
|
}
|
|
81
76
|
return null;
|
|
82
77
|
}
|
|
83
|
-
//# sourceMappingURL=pdf.js.map
|
package/dist/extractor/text.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { readFile, stat } from "node:fs/promises";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
2
|
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10 MB
|
|
4
3
|
export async function extractTextContent(filePath) {
|
|
5
4
|
const fileStats = await stat(filePath);
|
|
@@ -8,10 +7,9 @@ export async function extractTextContent(filePath) {
|
|
|
8
7
|
}
|
|
9
8
|
const content = await readFile(filePath, "utf-8");
|
|
10
9
|
return [
|
|
11
|
-
|
|
10
|
+
{
|
|
12
11
|
pageContent: content,
|
|
13
12
|
metadata: { source: filePath },
|
|
14
|
-
}
|
|
13
|
+
},
|
|
15
14
|
];
|
|
16
15
|
}
|
|
17
|
-
//# sourceMappingURL=text.js.map
|
package/dist/index.js
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import { Command } from "commander";
|
|
4
4
|
import { createConfigCommand } from "./commands/config.js";
|
|
5
|
+
import { createDepCommand } from "./commands/dep.js";
|
|
5
6
|
import { createKnowledgeBaseCommand } from "./commands/knowledge-base.js";
|
|
6
7
|
import { createLiteratureCommand } from "./commands/literature.js";
|
|
7
8
|
import { createUtilCommand } from "./commands/util.js";
|
|
@@ -14,8 +15,8 @@ program.hook("preAction", () => {
|
|
|
14
15
|
startup();
|
|
15
16
|
});
|
|
16
17
|
program.addCommand(createConfigCommand());
|
|
18
|
+
program.addCommand(createDepCommand());
|
|
17
19
|
program.addCommand(createKnowledgeBaseCommand());
|
|
18
20
|
program.addCommand(createLiteratureCommand());
|
|
19
21
|
program.addCommand(createUtilCommand());
|
|
20
22
|
program.parse();
|
|
21
|
-
//# sourceMappingURL=index.js.map
|
package/dist/lifecycle.js
CHANGED