paper-manager 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/embed.js +0 -1
- package/dist/ai/index.js +0 -1
- package/dist/ai/provider.js +0 -1
- package/dist/commands/config.js +0 -1
- package/dist/commands/knowledge-base.js +25 -3
- package/dist/commands/literature.js +39 -17
- package/dist/commands/util.js +0 -1
- package/dist/config/index.js +0 -1
- package/dist/config/init.js +0 -1
- package/dist/db/index.js +0 -1
- package/dist/db/operations/knowledge-bases.js +0 -1
- package/dist/db/operations/literatures.js +0 -1
- package/dist/db/project/knowledge-bases.js +0 -1
- package/dist/db/project/literatures.js +0 -1
- package/dist/db/schema.js +0 -1
- package/dist/db/test-utils.js +0 -1
- package/dist/db/user/knowledge-bases.js +0 -1
- package/dist/db/user/literatures.js +0 -1
- package/dist/extractor/index.js +0 -1
- package/dist/extractor/pdf.js +23 -29
- package/dist/extractor/text.js +2 -4
- package/dist/index.js +0 -1
- package/dist/lifecycle.js +0 -1
- package/dist/logger.js +0 -1
- package/dist/migrations.js +0 -1
- package/dist/text-splitter.js +56 -0
- package/dist/types/index.js +0 -1
- package/dist/vector-store/embeddings.js +0 -1
- package/dist/vector-store/index.js +76 -12
- package/package.json +2 -6
- package/dist/ai/embed.d.ts +0 -3
- package/dist/ai/embed.js.map +0 -1
- package/dist/ai/embed.test.d.ts +0 -1
- package/dist/ai/embed.test.js +0 -93
- package/dist/ai/embed.test.js.map +0 -1
- package/dist/ai/index.d.ts +0 -2
- package/dist/ai/index.js.map +0 -1
- package/dist/ai/provider.d.ts +0 -3
- package/dist/ai/provider.js.map +0 -1
- package/dist/commands/config.d.ts +0 -2
- package/dist/commands/config.js.map +0 -1
- package/dist/commands/knowledge-base.d.ts +0 -2
- package/dist/commands/knowledge-base.js.map +0 -1
- package/dist/commands/literature.d.ts +0 -2
- package/dist/commands/literature.js.map +0 -1
- package/dist/commands/util.d.ts +0 -2
- package/dist/commands/util.js.map +0 -1
- package/dist/config/index.d.ts +0 -39
- package/dist/config/index.js.map +0 -1
- package/dist/config/index.test.d.ts +0 -1
- package/dist/config/index.test.js +0 -143
- package/dist/config/index.test.js.map +0 -1
- package/dist/config/init.d.ts +0 -10
- package/dist/config/init.js.map +0 -1
- package/dist/config/init.test.d.ts +0 -1
- package/dist/config/init.test.js +0 -61
- package/dist/config/init.test.js.map +0 -1
- package/dist/db/index.d.ts +0 -7
- package/dist/db/index.js.map +0 -1
- package/dist/db/index.test.d.ts +0 -1
- package/dist/db/index.test.js +0 -32
- package/dist/db/index.test.js.map +0 -1
- package/dist/db/operations/knowledge-bases.d.ts +0 -11
- package/dist/db/operations/knowledge-bases.js.map +0 -1
- package/dist/db/operations/knowledge-bases.test.d.ts +0 -1
- package/dist/db/operations/knowledge-bases.test.js +0 -71
- package/dist/db/operations/knowledge-bases.test.js.map +0 -1
- package/dist/db/operations/literatures.d.ts +0 -16
- package/dist/db/operations/literatures.js.map +0 -1
- package/dist/db/operations/literatures.test.d.ts +0 -1
- package/dist/db/operations/literatures.test.js +0 -156
- package/dist/db/operations/literatures.test.js.map +0 -1
- package/dist/db/project/knowledge-bases.d.ts +0 -10
- package/dist/db/project/knowledge-bases.js.map +0 -1
- package/dist/db/project/literatures.d.ts +0 -10
- package/dist/db/project/literatures.js.map +0 -1
- package/dist/db/schema.d.ts +0 -372
- package/dist/db/schema.js.map +0 -1
- package/dist/db/test-utils.d.ts +0 -6
- package/dist/db/test-utils.js.map +0 -1
- package/dist/db/user/knowledge-bases.d.ts +0 -10
- package/dist/db/user/knowledge-bases.js.map +0 -1
- package/dist/db/user/literatures.d.ts +0 -10
- package/dist/db/user/literatures.js.map +0 -1
- package/dist/extractor/index.d.ts +0 -6
- package/dist/extractor/index.js.map +0 -1
- package/dist/extractor/pdf.d.ts +0 -13
- package/dist/extractor/pdf.js.map +0 -1
- package/dist/extractor/pdf.test.d.ts +0 -1
- package/dist/extractor/pdf.test.js +0 -106
- package/dist/extractor/pdf.test.js.map +0 -1
- package/dist/extractor/text.d.ts +0 -2
- package/dist/extractor/text.js.map +0 -1
- package/dist/index.d.ts +0 -2
- package/dist/index.js.map +0 -1
- package/dist/lifecycle.d.ts +0 -1
- package/dist/lifecycle.js.map +0 -1
- package/dist/logger.d.ts +0 -24
- package/dist/logger.js.map +0 -1
- package/dist/migrations.d.ts +0 -5
- package/dist/migrations.js.map +0 -1
- package/dist/pdf/extractor.d.ts +0 -2
- package/dist/pdf/extractor.js +0 -18
- package/dist/pdf/extractor.js.map +0 -1
- package/dist/types/index.d.ts +0 -61
- package/dist/types/index.js.map +0 -1
- package/dist/types/index.test.d.ts +0 -1
- package/dist/types/index.test.js +0 -100
- package/dist/types/index.test.js.map +0 -1
- package/dist/vector-store/embeddings.d.ts +0 -8
- package/dist/vector-store/embeddings.js.map +0 -1
- package/dist/vector-store/index.d.ts +0 -6
- package/dist/vector-store/index.js.map +0 -1
package/dist/ai/embed.js
CHANGED
package/dist/ai/index.js
CHANGED
package/dist/ai/provider.js
CHANGED
package/dist/commands/config.js
CHANGED
package/dist/commands/knowledge-base.js
CHANGED
|
@@ -57,6 +57,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
57
57
|
.description("List knowledge bases")
|
|
58
58
|
.option("--user", "List user knowledge bases only")
|
|
59
59
|
.option("--all", "List all knowledge bases (default)")
|
|
60
|
+
.option("--json", "Output as JSON")
|
|
60
61
|
.action((options) => {
|
|
61
62
|
let results = [];
|
|
62
63
|
if (options.user) {
|
|
@@ -68,7 +69,16 @@ export function createKnowledgeBaseCommand() {
|
|
|
68
69
|
results = [...projectKbs, ...userKbs];
|
|
69
70
|
}
|
|
70
71
|
if (results.length === 0) {
|
|
71
|
-
|
|
72
|
+
if (options.json) {
|
|
73
|
+
log.plain("[]");
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
log.info("No knowledge bases found.");
|
|
77
|
+
}
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
if (options.json) {
|
|
81
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
72
82
|
return;
|
|
73
83
|
}
|
|
74
84
|
for (const kb of results) {
|
|
@@ -151,6 +161,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
151
161
|
kb.command("query <id> <query-text>")
|
|
152
162
|
.description("Query a knowledge base")
|
|
153
163
|
.option("-k, --top-k <number>", "Number of results", "5")
|
|
164
|
+
.option("--json", "Output as JSON")
|
|
154
165
|
.action(async (id, queryText, options) => {
|
|
155
166
|
const resolved = resolveKnowledgeBase(id);
|
|
156
167
|
if (!resolved) {
|
|
@@ -168,7 +179,19 @@ export function createKnowledgeBaseCommand() {
|
|
|
168
179
|
const k = parseInt(options.topK, 10);
|
|
169
180
|
const results = await queryVectorStore(modelConfig, vectorDir, queryText, k);
|
|
170
181
|
if (results.length === 0) {
|
|
171
|
-
|
|
182
|
+
if (options.json) {
|
|
183
|
+
log.plain("[]");
|
|
184
|
+
}
|
|
185
|
+
else {
|
|
186
|
+
log.info("No results found.");
|
|
187
|
+
}
|
|
188
|
+
return;
|
|
189
|
+
}
|
|
190
|
+
if (options.json) {
|
|
191
|
+
const output = results
|
|
192
|
+
.filter((doc) => doc != null)
|
|
193
|
+
.map((doc) => ({ pageContent: doc.pageContent, metadata: doc.metadata }));
|
|
194
|
+
log.plain(JSON.stringify(output, null, 2));
|
|
172
195
|
return;
|
|
173
196
|
}
|
|
174
197
|
for (let i = 0; i < results.length; i++) {
|
|
@@ -185,4 +208,3 @@ export function createKnowledgeBaseCommand() {
|
|
|
185
208
|
});
|
|
186
209
|
return kb;
|
|
187
210
|
}
|
|
188
|
-
//# sourceMappingURL=knowledge-base.js.map
|
package/dist/commands/literature.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import * as path from "node:path";
|
|
3
|
-
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
|
|
4
3
|
import chalk from "chalk";
|
|
5
4
|
import cliProgress from "cli-progress";
|
|
6
5
|
import { Command } from "commander";
|
|
@@ -11,7 +10,8 @@ import * as userKb from "../db/user/knowledge-bases.js";
|
|
|
11
10
|
import * as userLit from "../db/user/literatures.js";
|
|
12
11
|
import { extractContent, extractPdfMetadata } from "../extractor/index.js";
|
|
13
12
|
import { log } from "../logger.js";
|
|
14
|
-
import {
|
|
13
|
+
import { splitDocuments } from "../text-splitter.js";
|
|
14
|
+
import { addDocuments, createVectorStore } from "../vector-store/index.js";
|
|
15
15
|
function resolveKnowledgeBase(id) {
|
|
16
16
|
const pkb = projectKb.getKnowledgeBase(id);
|
|
17
17
|
if (pkb)
|
|
@@ -96,11 +96,7 @@ export function createLiteratureCommand() {
|
|
|
96
96
|
fs.copyFileSync(absolutePath, path.join(filesDir, `${literature.id}${ext}`));
|
|
97
97
|
// Split text and add to vector store
|
|
98
98
|
log.info("Splitting text...");
|
|
99
|
-
const splitter = new RecursiveCharacterTextSplitter({
|
|
100
|
-
chunkSize: 1000,
|
|
101
|
-
chunkOverlap: 200,
|
|
102
|
-
});
|
|
103
|
-
const splitDocs = await splitter.splitDocuments(docs);
|
|
99
|
+
const splitDocs = splitDocuments(docs, { chunkSize: 1000, chunkOverlap: 200 });
|
|
104
100
|
log.step(`Created ${String(splitDocs.length)} chunks.`);
|
|
105
101
|
// Add literature ID metadata to each chunk
|
|
106
102
|
for (const doc of splitDocs) {
|
|
@@ -115,12 +111,9 @@ export function createLiteratureCommand() {
|
|
|
115
111
|
const hasIndex = fs.existsSync(path.join(vectorDir, "faiss.index")) &&
|
|
116
112
|
fs.existsSync(path.join(vectorDir, "docstore.json"));
|
|
117
113
|
if (hasIndex) {
|
|
118
|
-
|
|
119
|
-
await store.addDocuments(splitDocs);
|
|
120
|
-
await store.save(vectorDir);
|
|
114
|
+
await addDocuments(splitDocs, modelConfig, vectorDir);
|
|
121
115
|
}
|
|
122
116
|
else {
|
|
123
|
-
fs.mkdirSync(vectorDir, { recursive: true });
|
|
124
117
|
await createVectorStore(splitDocs, modelConfig, vectorDir);
|
|
125
118
|
}
|
|
126
119
|
bar.update(splitDocs.length);
|
|
@@ -211,7 +204,8 @@ export function createLiteratureCommand() {
|
|
|
211
204
|
lit
|
|
212
205
|
.command("list <knowledge-base-id>")
|
|
213
206
|
.description("List literatures in a knowledge base")
|
|
214
|
-
.action((kbId) => {
|
|
207
|
+
.option("--json", "Output as JSON")
|
|
208
|
+
.action((kbId, options) => {
|
|
215
209
|
const resolved = resolveKnowledgeBase(kbId);
|
|
216
210
|
if (!resolved) {
|
|
217
211
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -220,7 +214,16 @@ export function createLiteratureCommand() {
|
|
|
220
214
|
const litOps = getLitOps(resolved.scope);
|
|
221
215
|
const literatures = litOps.listLiteratures(kbId);
|
|
222
216
|
if (literatures.length === 0) {
|
|
223
|
-
|
|
217
|
+
if (options.json) {
|
|
218
|
+
log.plain("[]");
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
log.info("No literatures found.");
|
|
222
|
+
}
|
|
223
|
+
return;
|
|
224
|
+
}
|
|
225
|
+
if (options.json) {
|
|
226
|
+
log.plain(JSON.stringify(literatures, null, 2));
|
|
224
227
|
return;
|
|
225
228
|
}
|
|
226
229
|
for (const l of literatures) {
|
|
@@ -241,6 +244,7 @@ export function createLiteratureCommand() {
|
|
|
241
244
|
.option("-a, --author <author>", "Author substring")
|
|
242
245
|
.option("-k, --keyword <keyword>", "Keyword substring")
|
|
243
246
|
.option("--doi <doi>", "DOI substring")
|
|
247
|
+
.option("--json", "Output as JSON")
|
|
244
248
|
.action((kbId, options) => {
|
|
245
249
|
const resolved = resolveKnowledgeBase(kbId);
|
|
246
250
|
if (!resolved) {
|
|
@@ -262,7 +266,16 @@ export function createLiteratureCommand() {
|
|
|
262
266
|
doi: options.doi,
|
|
263
267
|
});
|
|
264
268
|
if (results.length === 0) {
|
|
265
|
-
|
|
269
|
+
if (options.json) {
|
|
270
|
+
log.plain("[]");
|
|
271
|
+
}
|
|
272
|
+
else {
|
|
273
|
+
log.info("No literatures found.");
|
|
274
|
+
}
|
|
275
|
+
return;
|
|
276
|
+
}
|
|
277
|
+
if (options.json) {
|
|
278
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
266
279
|
return;
|
|
267
280
|
}
|
|
268
281
|
for (const l of results) {
|
|
@@ -283,7 +296,8 @@ export function createLiteratureCommand() {
|
|
|
283
296
|
lit
|
|
284
297
|
.command("show <knowledge-base-id> <id>")
|
|
285
298
|
.description("Show literature details")
|
|
286
|
-
.action((kbId, id) => {
|
|
299
|
+
.option("--json", "Output as JSON")
|
|
300
|
+
.action((kbId, id, options) => {
|
|
287
301
|
const resolved = resolveKnowledgeBase(kbId);
|
|
288
302
|
if (!resolved) {
|
|
289
303
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -295,6 +309,10 @@ export function createLiteratureCommand() {
|
|
|
295
309
|
log.error(`Literature not found: ${id}`);
|
|
296
310
|
process.exit(1);
|
|
297
311
|
}
|
|
312
|
+
if (options.json) {
|
|
313
|
+
log.plain(JSON.stringify(literature, null, 2));
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
298
316
|
printLiterature(literature);
|
|
299
317
|
});
|
|
300
318
|
// ─── lit note ──────────────────────────────────────────────
|
|
@@ -302,12 +320,17 @@ export function createLiteratureCommand() {
|
|
|
302
320
|
note
|
|
303
321
|
.command("list <literature-id>")
|
|
304
322
|
.description("List all notes for a literature")
|
|
305
|
-
.action((litId) => {
|
|
323
|
+
.option("--json", "Output as JSON")
|
|
324
|
+
.action((litId, options) => {
|
|
306
325
|
const literature = findLiterature(litId);
|
|
307
326
|
if (!literature) {
|
|
308
327
|
log.error(`Literature not found: ${litId}`);
|
|
309
328
|
process.exit(1);
|
|
310
329
|
}
|
|
330
|
+
if (options.json) {
|
|
331
|
+
log.plain(JSON.stringify(literature.notes, null, 2));
|
|
332
|
+
return;
|
|
333
|
+
}
|
|
311
334
|
const entries = Object.entries(literature.notes);
|
|
312
335
|
if (entries.length === 0) {
|
|
313
336
|
log.info("No notes found.");
|
|
@@ -392,4 +415,3 @@ function printLiterature(lit) {
|
|
|
392
415
|
}
|
|
393
416
|
}
|
|
394
417
|
}
|
|
395
|
-
//# sourceMappingURL=literature.js.map
|
package/dist/commands/util.js
CHANGED
package/dist/config/index.js
CHANGED
package/dist/config/init.js
CHANGED
package/dist/db/index.js
CHANGED
package/dist/db/schema.js
CHANGED
package/dist/db/test-utils.js
CHANGED
package/dist/extractor/index.js
CHANGED
package/dist/extractor/pdf.js
CHANGED
|
@@ -1,44 +1,43 @@
|
|
|
1
1
|
import { readFile } from "node:fs/promises";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
|
-
import { PDFParse } from "pdf-parse";
|
|
2
|
+
import { extractText, getMeta } from "unpdf";
|
|
4
3
|
export async function extractPdfContent(pdfPath) {
|
|
5
|
-
const data = await readFile(pdfPath);
|
|
6
|
-
const
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
return result.pages.map((page) => new Document({
|
|
10
|
-
pageContent: page.text,
|
|
4
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
5
|
+
const result = await extractText(data, { mergePages: false });
|
|
6
|
+
return result.text.map((pageText, i) => ({
|
|
7
|
+
pageContent: pageText,
|
|
11
8
|
metadata: {
|
|
12
9
|
source: pdfPath,
|
|
13
|
-
pdf: { totalPages: result.
|
|
14
|
-
loc: { pageNumber:
|
|
10
|
+
pdf: { totalPages: result.totalPages },
|
|
11
|
+
loc: { pageNumber: i + 1 },
|
|
15
12
|
},
|
|
16
13
|
}));
|
|
17
14
|
}
|
|
18
15
|
export async function extractPdfMetadata(pdfPath) {
|
|
19
|
-
const data = await readFile(pdfPath);
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
const
|
|
29
|
-
const creationDate = parsePdfDate(nonEmptyStringOrNull(info?.["CreationDate"]));
|
|
30
|
-
const modDate = parsePdfDate(nonEmptyStringOrNull(info?.["ModDate"]));
|
|
31
|
-
const rawKeywords = nonEmptyStringOrNull(info?.["Keywords"]);
|
|
16
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
17
|
+
const { info } = await getMeta(data);
|
|
18
|
+
const custom = getRecord(info["Custom"]);
|
|
19
|
+
const title = nonEmptyStringOrNull(info["Title"]);
|
|
20
|
+
const author = nonEmptyStringOrNull(info["Author"]);
|
|
21
|
+
const subject = nonEmptyStringOrNull(info["Subject"]);
|
|
22
|
+
const creator = nonEmptyStringOrNull(info["Creator"]);
|
|
23
|
+
const creationDate = parsePdfDate(nonEmptyStringOrNull(info["CreationDate"]));
|
|
24
|
+
const modDate = parsePdfDate(nonEmptyStringOrNull(info["ModDate"]));
|
|
25
|
+
const rawKeywords = nonEmptyStringOrNull(info["Keywords"]);
|
|
32
26
|
const keywords = rawKeywords
|
|
33
27
|
? rawKeywords
|
|
34
28
|
.split(/[,;]/)
|
|
35
29
|
.map((k) => k.trim())
|
|
36
30
|
.filter(Boolean)
|
|
37
31
|
: [];
|
|
38
|
-
// DOI can appear in Custom fields (case-insensitive lookup)
|
|
39
32
|
const doi = findCustomField(custom, "doi");
|
|
40
33
|
return { title, author, subject, keywords, doi, creator, creationDate, modDate };
|
|
41
34
|
}
|
|
35
|
+
function getRecord(value) {
|
|
36
|
+
if (typeof value === "object" && value !== null && !Array.isArray(value)) {
|
|
37
|
+
return value;
|
|
38
|
+
}
|
|
39
|
+
return undefined;
|
|
40
|
+
}
|
|
42
41
|
function nonEmptyStringOrNull(value) {
|
|
43
42
|
if (typeof value === "string" && value.trim().length > 0) {
|
|
44
43
|
return value.trim();
|
|
@@ -53,7 +52,6 @@ function parsePdfDate(value) {
|
|
|
53
52
|
if (!value)
|
|
54
53
|
return null;
|
|
55
54
|
const cleaned = value.replace(/^D:/, "");
|
|
56
|
-
// Extract components: YYYY[MM[DD[HH[mm[SS]]]]]
|
|
57
55
|
const match = /^(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/.exec(cleaned);
|
|
58
56
|
if (!match)
|
|
59
57
|
return null;
|
|
@@ -66,9 +64,6 @@ function parsePdfDate(value) {
|
|
|
66
64
|
const date = new Date(`${year}-${month}-${day}T${hour}:${min}:${sec}Z`);
|
|
67
65
|
return Number.isNaN(date.getTime()) ? null : date;
|
|
68
66
|
}
|
|
69
|
-
/**
|
|
70
|
-
* Case-insensitive lookup in the Custom fields dictionary.
|
|
71
|
-
*/
|
|
72
67
|
function findCustomField(custom, key) {
|
|
73
68
|
if (!custom)
|
|
74
69
|
return null;
|
|
@@ -80,4 +75,3 @@ function findCustomField(custom, key) {
|
|
|
80
75
|
}
|
|
81
76
|
return null;
|
|
82
77
|
}
|
|
83
|
-
//# sourceMappingURL=pdf.js.map
|
package/dist/extractor/text.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { readFile, stat } from "node:fs/promises";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
2
|
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10 MB
|
|
4
3
|
export async function extractTextContent(filePath) {
|
|
5
4
|
const fileStats = await stat(filePath);
|
|
@@ -8,10 +7,9 @@ export async function extractTextContent(filePath) {
|
|
|
8
7
|
}
|
|
9
8
|
const content = await readFile(filePath, "utf-8");
|
|
10
9
|
return [
|
|
11
|
-
|
|
10
|
+
{
|
|
12
11
|
pageContent: content,
|
|
13
12
|
metadata: { source: filePath },
|
|
14
|
-
}
|
|
13
|
+
},
|
|
15
14
|
];
|
|
16
15
|
}
|
|
17
|
-
//# sourceMappingURL=text.js.map
|
package/dist/index.js
CHANGED
package/dist/lifecycle.js
CHANGED
package/dist/logger.js
CHANGED
package/dist/migrations.js
CHANGED
package/dist/text-splitter.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
|
|
2
|
+
export function splitDocuments(docs, options) {
|
|
3
|
+
if (options.chunkOverlap >= options.chunkSize) {
|
|
4
|
+
throw new Error("chunkOverlap must be less than chunkSize");
|
|
5
|
+
}
|
|
6
|
+
const result = [];
|
|
7
|
+
for (const doc of docs) {
|
|
8
|
+
const chunks = splitText(doc.pageContent, options.chunkSize, options.chunkOverlap);
|
|
9
|
+
for (const chunk of chunks) {
|
|
10
|
+
result.push({ pageContent: chunk, metadata: { ...doc.metadata } });
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
return result;
|
|
14
|
+
}
|
|
15
|
+
function splitText(text, chunkSize, chunkOverlap) {
|
|
16
|
+
return recursiveSplit(text, DEFAULT_SEPARATORS, chunkSize, chunkOverlap);
|
|
17
|
+
}
|
|
18
|
+
function recursiveSplit(text, separators, chunkSize, chunkOverlap) {
|
|
19
|
+
if (text.length <= chunkSize)
|
|
20
|
+
return [text];
|
|
21
|
+
const separator = separators[0] ?? "";
|
|
22
|
+
const remaining = separators.slice(1);
|
|
23
|
+
const parts = separator === "" ? [...text] : text.split(separator);
|
|
24
|
+
const chunks = [];
|
|
25
|
+
let current = "";
|
|
26
|
+
for (const part of parts) {
|
|
27
|
+
const piece = current.length === 0 ? part : current + separator + part;
|
|
28
|
+
if (piece.length > chunkSize && current.length > 0) {
|
|
29
|
+
chunks.push(current);
|
|
30
|
+
// Overlap: keep the tail of the current chunk
|
|
31
|
+
if (chunkOverlap > 0 && current.length > chunkOverlap) {
|
|
32
|
+
current = current.slice(-chunkOverlap) + separator + part;
|
|
33
|
+
}
|
|
34
|
+
else {
|
|
35
|
+
current = part;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
else {
|
|
39
|
+
current = piece;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
if (current.length > 0) {
|
|
43
|
+
chunks.push(current);
|
|
44
|
+
}
|
|
45
|
+
// Recursively split any chunks that are still too large
|
|
46
|
+
const finalChunks = [];
|
|
47
|
+
for (const chunk of chunks) {
|
|
48
|
+
if (chunk.length > chunkSize && remaining.length > 0) {
|
|
49
|
+
finalChunks.push(...recursiveSplit(chunk, remaining, chunkSize, chunkOverlap));
|
|
50
|
+
}
|
|
51
|
+
else {
|
|
52
|
+
finalChunks.push(chunk);
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
return finalChunks;
|
|
56
|
+
}
|
package/dist/types/index.js
CHANGED
package/dist/vector-store/index.js
CHANGED
|
@@ -1,17 +1,81 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import * as fs from "node:fs/promises";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
import { embed, embedMany } from "../ai/embed.js";
|
|
5
|
+
async function importFaiss() {
|
|
6
|
+
const { IndexFlatL2 } = (await import("faiss-node")).default;
|
|
7
|
+
return { IndexFlatL2 };
|
|
8
|
+
}
|
|
9
|
+
function parseDocstore(raw) {
|
|
10
|
+
const parsed = JSON.parse(raw);
|
|
11
|
+
if (!Array.isArray(parsed) ||
|
|
12
|
+
parsed.length !== 2 ||
|
|
13
|
+
!Array.isArray(parsed[0]) ||
|
|
14
|
+
typeof parsed[1] !== "object" ||
|
|
15
|
+
parsed[1] === null) {
|
|
16
|
+
throw new Error("Corrupt docstore.json: expected [entries, mapping] tuple");
|
|
17
|
+
}
|
|
18
|
+
// Safe after validation: parsed[0] is Array, parsed[1] is non-null object
|
|
19
|
+
const entries = parsed[0];
|
|
20
|
+
const mapping = parsed[1];
|
|
21
|
+
return [entries, mapping];
|
|
22
|
+
}
|
|
3
23
|
export async function createVectorStore(docs, config, directory) {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
24
|
+
if (docs.length === 0)
|
|
25
|
+
return;
|
|
26
|
+
const texts = docs.map((d) => d.pageContent);
|
|
27
|
+
const vectors = await embedMany(config, texts);
|
|
28
|
+
const dimension = vectors[0].length;
|
|
29
|
+
const { IndexFlatL2 } = await importFaiss();
|
|
30
|
+
const index = new IndexFlatL2(dimension);
|
|
31
|
+
const mapping = {};
|
|
32
|
+
const docEntries = [];
|
|
33
|
+
for (let i = 0; i < vectors.length; i++) {
|
|
34
|
+
const id = randomUUID();
|
|
35
|
+
index.add(vectors[i]);
|
|
36
|
+
mapping[i] = id;
|
|
37
|
+
docEntries.push([id, docs[i]]);
|
|
38
|
+
}
|
|
39
|
+
await fs.mkdir(directory, { recursive: true });
|
|
40
|
+
index.write(path.join(directory, "faiss.index"));
|
|
41
|
+
await fs.writeFile(path.join(directory, "docstore.json"), JSON.stringify([docEntries, mapping]));
|
|
8
42
|
}
|
|
9
|
-
export async function
|
|
10
|
-
|
|
11
|
-
|
|
43
|
+
export async function addDocuments(docs, config, directory) {
|
|
44
|
+
if (docs.length === 0)
|
|
45
|
+
return;
|
|
46
|
+
const texts = docs.map((d) => d.pageContent);
|
|
47
|
+
const vectors = await embedMany(config, texts);
|
|
48
|
+
const { IndexFlatL2 } = await importFaiss();
|
|
49
|
+
const index = IndexFlatL2.read(path.join(directory, "faiss.index"));
|
|
50
|
+
const raw = await fs.readFile(path.join(directory, "docstore.json"), "utf-8");
|
|
51
|
+
const [existingDocs, mapping] = parseDocstore(raw);
|
|
52
|
+
const baseId = index.ntotal();
|
|
53
|
+
for (let i = 0; i < vectors.length; i++) {
|
|
54
|
+
const id = randomUUID();
|
|
55
|
+
index.add(vectors[i]);
|
|
56
|
+
mapping[baseId + i] = id;
|
|
57
|
+
existingDocs.push([id, docs[i]]);
|
|
58
|
+
}
|
|
59
|
+
index.write(path.join(directory, "faiss.index"));
|
|
60
|
+
await fs.writeFile(path.join(directory, "docstore.json"), JSON.stringify([existingDocs, mapping]));
|
|
12
61
|
}
|
|
13
62
|
export async function queryVectorStore(config, directory, query, k = 5) {
|
|
14
|
-
const
|
|
15
|
-
|
|
63
|
+
const { IndexFlatL2 } = await importFaiss();
|
|
64
|
+
const index = IndexFlatL2.read(path.join(directory, "faiss.index"));
|
|
65
|
+
const raw = await fs.readFile(path.join(directory, "docstore.json"), "utf-8");
|
|
66
|
+
const [docEntries, mapping] = parseDocstore(raw);
|
|
67
|
+
const docMap = new Map(docEntries);
|
|
68
|
+
const queryVector = await embed(config, query);
|
|
69
|
+
const total = index.ntotal();
|
|
70
|
+
if (total === 0)
|
|
71
|
+
return [];
|
|
72
|
+
const effectiveK = Math.min(k, total);
|
|
73
|
+
const result = index.search(queryVector, effectiveK);
|
|
74
|
+
return result.labels
|
|
75
|
+
.filter((label) => label >= 0)
|
|
76
|
+
.map((label) => {
|
|
77
|
+
const docId = mapping[label];
|
|
78
|
+
return docId ? docMap.get(docId) : undefined;
|
|
79
|
+
})
|
|
80
|
+
.filter((doc) => doc != null);
|
|
16
81
|
}
|
|
17
|
-
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "paper-manager",
|
|
3
|
-
"version": "0.8.1",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "A paper management system.",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"homepage": "https://github.com/EurFelux/paper-manager",
|
|
@@ -21,9 +21,6 @@
|
|
|
21
21
|
"dependencies": {
|
|
22
22
|
"@ai-sdk/openai": "^3.0.37",
|
|
23
23
|
"@ai-sdk/provider": "^3.0.8",
|
|
24
|
-
"@langchain/community": "^1.1.20",
|
|
25
|
-
"@langchain/core": "^1.1.29",
|
|
26
|
-
"@langchain/textsplitters": "^1.0.1",
|
|
27
24
|
"ai": "^6.0.105",
|
|
28
25
|
"better-sqlite3": "^12.6.2",
|
|
29
26
|
"chalk": "^5.6.2",
|
|
@@ -31,9 +28,8 @@
|
|
|
31
28
|
"commander": "^14.0.3",
|
|
32
29
|
"drizzle-orm": "^0.45.1",
|
|
33
30
|
"faiss-node": "^0.5.1",
|
|
34
|
-
"langchain": "^1.2.28",
|
|
35
31
|
"mime-types": "^3.0.2",
|
|
36
|
-
"
|
|
32
|
+
"unpdf": "^1.4.0",
|
|
37
33
|
"zod": "^4.3.6"
|
|
38
34
|
},
|
|
39
35
|
"devDependencies": {
|
package/dist/ai/embed.d.ts
DELETED
package/dist/ai/embed.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"embed.js","sourceRoot":"","sources":["../../src/ai/embed.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,IAAI,OAAO,EAAE,SAAS,IAAI,WAAW,EAAE,MAAM,IAAI,CAAC;AAGhE,OAAO,EAAE,oBAAoB,EAAE,MAAM,eAAe,CAAC;AAErD,SAAS,oBAAoB,CAC3B,MAA4B;IAE5B,IAAI,MAAM,CAAC,UAAU,IAAI,IAAI;QAAE,OAAO,SAAS,CAAC;IAChD,OAAO,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC;AAClE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,MAA4B,EAAE,IAAY;IACpE,MAAM,KAAK,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC;QAC3B,KAAK;QACL,KAAK,EAAE,IAAI;QACX,eAAe,EAAE,oBAAoB,CAAC,MAAM,CAAC;KAC9C,CAAC,CAAC;IACH,OAAO,MAAM,CAAC,SAAS,CAAC;AAC1B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,MAA4B,EAC5B,KAAe;IAEf,MAAM,KAAK,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAC;IAC3C,MAAM,eAAe,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAC;IAErD,IAAI,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,KAAK,CAAC,MAAM,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACjE,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,eAAe,EAAE,CAAC,CAAC;QAC5E,OAAO,MAAM,CAAC,UAAU,CAAC;IAC3B,CAAC;IAED,MAAM,UAAU,GAAe,EAAE,CAAC;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACxD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC;QACnD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,eAAe,EAAE,CAAC,CAAC;QAC5E,UAAU,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC;IACxC,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC"}
|
package/dist/ai/embed.test.d.ts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|