paper-manager 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/embed.js +0 -1
- package/dist/ai/index.js +0 -1
- package/dist/ai/provider.js +0 -1
- package/dist/commands/config.js +0 -1
- package/dist/commands/knowledge-base.js +25 -3
- package/dist/commands/literature.js +84 -16
- package/dist/commands/util.js +0 -1
- package/dist/config/index.js +0 -1
- package/dist/config/init.js +0 -1
- package/dist/db/index.js +6 -2
- package/dist/db/operations/knowledge-bases.js +0 -1
- package/dist/db/operations/literatures.js +19 -2
- package/dist/db/project/knowledge-bases.js +0 -1
- package/dist/db/project/literatures.js +3 -1
- package/dist/db/schema.js +5 -3
- package/dist/db/test-utils.js +0 -1
- package/dist/db/user/knowledge-bases.js +0 -1
- package/dist/db/user/literatures.js +3 -1
- package/dist/extractor/index.js +0 -1
- package/dist/extractor/pdf.js +23 -29
- package/dist/extractor/text.js +2 -4
- package/dist/index.js +0 -1
- package/dist/lifecycle.js +0 -1
- package/dist/logger.js +0 -1
- package/dist/migrations.js +0 -1
- package/dist/text-splitter.js +56 -0
- package/dist/types/index.js +0 -1
- package/dist/vector-store/embeddings.js +0 -1
- package/dist/vector-store/index.js +76 -12
- package/package.json +2 -6
- package/dist/ai/embed.d.ts +0 -3
- package/dist/ai/embed.js.map +0 -1
- package/dist/ai/embed.test.d.ts +0 -1
- package/dist/ai/embed.test.js +0 -93
- package/dist/ai/embed.test.js.map +0 -1
- package/dist/ai/index.d.ts +0 -2
- package/dist/ai/index.js.map +0 -1
- package/dist/ai/provider.d.ts +0 -3
- package/dist/ai/provider.js.map +0 -1
- package/dist/commands/config.d.ts +0 -2
- package/dist/commands/config.js.map +0 -1
- package/dist/commands/knowledge-base.d.ts +0 -2
- package/dist/commands/knowledge-base.js.map +0 -1
- package/dist/commands/literature.d.ts +0 -2
- package/dist/commands/literature.js.map +0 -1
- package/dist/commands/util.d.ts +0 -2
- package/dist/commands/util.js.map +0 -1
- package/dist/config/index.d.ts +0 -39
- package/dist/config/index.js.map +0 -1
- package/dist/config/index.test.d.ts +0 -1
- package/dist/config/index.test.js +0 -143
- package/dist/config/index.test.js.map +0 -1
- package/dist/config/init.d.ts +0 -10
- package/dist/config/init.js.map +0 -1
- package/dist/config/init.test.d.ts +0 -1
- package/dist/config/init.test.js +0 -61
- package/dist/config/init.test.js.map +0 -1
- package/dist/db/index.d.ts +0 -7
- package/dist/db/index.js.map +0 -1
- package/dist/db/index.test.d.ts +0 -1
- package/dist/db/index.test.js +0 -32
- package/dist/db/index.test.js.map +0 -1
- package/dist/db/operations/knowledge-bases.d.ts +0 -11
- package/dist/db/operations/knowledge-bases.js.map +0 -1
- package/dist/db/operations/knowledge-bases.test.d.ts +0 -1
- package/dist/db/operations/knowledge-bases.test.js +0 -71
- package/dist/db/operations/knowledge-bases.test.js.map +0 -1
- package/dist/db/operations/literatures.d.ts +0 -9
- package/dist/db/operations/literatures.js.map +0 -1
- package/dist/db/operations/literatures.test.d.ts +0 -1
- package/dist/db/operations/literatures.test.js +0 -156
- package/dist/db/operations/literatures.test.js.map +0 -1
- package/dist/db/project/knowledge-bases.d.ts +0 -10
- package/dist/db/project/knowledge-bases.js.map +0 -1
- package/dist/db/project/literatures.d.ts +0 -8
- package/dist/db/project/literatures.js.map +0 -1
- package/dist/db/schema.d.ts +0 -371
- package/dist/db/schema.js.map +0 -1
- package/dist/db/test-utils.d.ts +0 -6
- package/dist/db/test-utils.js.map +0 -1
- package/dist/db/user/knowledge-bases.d.ts +0 -10
- package/dist/db/user/knowledge-bases.js.map +0 -1
- package/dist/db/user/literatures.d.ts +0 -8
- package/dist/db/user/literatures.js.map +0 -1
- package/dist/extractor/index.d.ts +0 -6
- package/dist/extractor/index.js.map +0 -1
- package/dist/extractor/pdf.d.ts +0 -13
- package/dist/extractor/pdf.js.map +0 -1
- package/dist/extractor/pdf.test.d.ts +0 -1
- package/dist/extractor/pdf.test.js +0 -106
- package/dist/extractor/pdf.test.js.map +0 -1
- package/dist/extractor/text.d.ts +0 -2
- package/dist/extractor/text.js.map +0 -1
- package/dist/index.d.ts +0 -2
- package/dist/index.js.map +0 -1
- package/dist/lifecycle.d.ts +0 -1
- package/dist/lifecycle.js.map +0 -1
- package/dist/logger.d.ts +0 -24
- package/dist/logger.js.map +0 -1
- package/dist/migrations.d.ts +0 -5
- package/dist/migrations.js.map +0 -1
- package/dist/pdf/extractor.d.ts +0 -2
- package/dist/pdf/extractor.js +0 -18
- package/dist/pdf/extractor.js.map +0 -1
- package/dist/types/index.d.ts +0 -61
- package/dist/types/index.js.map +0 -1
- package/dist/types/index.test.d.ts +0 -1
- package/dist/types/index.test.js +0 -100
- package/dist/types/index.test.js.map +0 -1
- package/dist/vector-store/embeddings.d.ts +0 -8
- package/dist/vector-store/embeddings.js.map +0 -1
- package/dist/vector-store/index.d.ts +0 -6
- package/dist/vector-store/index.js.map +0 -1
package/dist/ai/embed.js
CHANGED
package/dist/ai/index.js
CHANGED
package/dist/ai/provider.js
CHANGED
package/dist/commands/config.js
CHANGED
|
@@ -57,6 +57,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
57
57
|
.description("List knowledge bases")
|
|
58
58
|
.option("--user", "List user knowledge bases only")
|
|
59
59
|
.option("--all", "List all knowledge bases (default)")
|
|
60
|
+
.option("--json", "Output as JSON")
|
|
60
61
|
.action((options) => {
|
|
61
62
|
let results = [];
|
|
62
63
|
if (options.user) {
|
|
@@ -68,7 +69,16 @@ export function createKnowledgeBaseCommand() {
|
|
|
68
69
|
results = [...projectKbs, ...userKbs];
|
|
69
70
|
}
|
|
70
71
|
if (results.length === 0) {
|
|
71
|
-
|
|
72
|
+
if (options.json) {
|
|
73
|
+
log.plain("[]");
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
log.info("No knowledge bases found.");
|
|
77
|
+
}
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
if (options.json) {
|
|
81
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
72
82
|
return;
|
|
73
83
|
}
|
|
74
84
|
for (const kb of results) {
|
|
@@ -151,6 +161,7 @@ export function createKnowledgeBaseCommand() {
|
|
|
151
161
|
kb.command("query <id> <query-text>")
|
|
152
162
|
.description("Query a knowledge base")
|
|
153
163
|
.option("-k, --top-k <number>", "Number of results", "5")
|
|
164
|
+
.option("--json", "Output as JSON")
|
|
154
165
|
.action(async (id, queryText, options) => {
|
|
155
166
|
const resolved = resolveKnowledgeBase(id);
|
|
156
167
|
if (!resolved) {
|
|
@@ -168,7 +179,19 @@ export function createKnowledgeBaseCommand() {
|
|
|
168
179
|
const k = parseInt(options.topK, 10);
|
|
169
180
|
const results = await queryVectorStore(modelConfig, vectorDir, queryText, k);
|
|
170
181
|
if (results.length === 0) {
|
|
171
|
-
|
|
182
|
+
if (options.json) {
|
|
183
|
+
log.plain("[]");
|
|
184
|
+
}
|
|
185
|
+
else {
|
|
186
|
+
log.info("No results found.");
|
|
187
|
+
}
|
|
188
|
+
return;
|
|
189
|
+
}
|
|
190
|
+
if (options.json) {
|
|
191
|
+
const output = results
|
|
192
|
+
.filter((doc) => doc != null)
|
|
193
|
+
.map((doc) => ({ pageContent: doc.pageContent, metadata: doc.metadata }));
|
|
194
|
+
log.plain(JSON.stringify(output, null, 2));
|
|
172
195
|
return;
|
|
173
196
|
}
|
|
174
197
|
for (let i = 0; i < results.length; i++) {
|
|
@@ -185,4 +208,3 @@ export function createKnowledgeBaseCommand() {
|
|
|
185
208
|
});
|
|
186
209
|
return kb;
|
|
187
210
|
}
|
|
188
|
-
//# sourceMappingURL=knowledge-base.js.map
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import * as path from "node:path";
|
|
3
|
-
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
|
|
4
3
|
import chalk from "chalk";
|
|
5
4
|
import cliProgress from "cli-progress";
|
|
6
5
|
import { Command } from "commander";
|
|
@@ -11,7 +10,8 @@ import * as userKb from "../db/user/knowledge-bases.js";
|
|
|
11
10
|
import * as userLit from "../db/user/literatures.js";
|
|
12
11
|
import { extractContent, extractPdfMetadata } from "../extractor/index.js";
|
|
13
12
|
import { log } from "../logger.js";
|
|
14
|
-
import {
|
|
13
|
+
import { splitDocuments } from "../text-splitter.js";
|
|
14
|
+
import { addDocuments, createVectorStore } from "../vector-store/index.js";
|
|
15
15
|
function resolveKnowledgeBase(id) {
|
|
16
16
|
const pkb = projectKb.getKnowledgeBase(id);
|
|
17
17
|
if (pkb)
|
|
@@ -96,11 +96,7 @@ export function createLiteratureCommand() {
|
|
|
96
96
|
fs.copyFileSync(absolutePath, path.join(filesDir, `${literature.id}${ext}`));
|
|
97
97
|
// Split text and add to vector store
|
|
98
98
|
log.info("Splitting text...");
|
|
99
|
-
const
|
|
100
|
-
chunkSize: 1000,
|
|
101
|
-
chunkOverlap: 200,
|
|
102
|
-
});
|
|
103
|
-
const splitDocs = await splitter.splitDocuments(docs);
|
|
99
|
+
const splitDocs = splitDocuments(docs, { chunkSize: 1000, chunkOverlap: 200 });
|
|
104
100
|
log.step(`Created ${String(splitDocs.length)} chunks.`);
|
|
105
101
|
// Add literature ID metadata to each chunk
|
|
106
102
|
for (const doc of splitDocs) {
|
|
@@ -115,12 +111,9 @@ export function createLiteratureCommand() {
|
|
|
115
111
|
const hasIndex = fs.existsSync(path.join(vectorDir, "faiss.index")) &&
|
|
116
112
|
fs.existsSync(path.join(vectorDir, "docstore.json"));
|
|
117
113
|
if (hasIndex) {
|
|
118
|
-
|
|
119
|
-
await store.addDocuments(splitDocs);
|
|
120
|
-
await store.save(vectorDir);
|
|
114
|
+
await addDocuments(splitDocs, modelConfig, vectorDir);
|
|
121
115
|
}
|
|
122
116
|
else {
|
|
123
|
-
fs.mkdirSync(vectorDir, { recursive: true });
|
|
124
117
|
await createVectorStore(splitDocs, modelConfig, vectorDir);
|
|
125
118
|
}
|
|
126
119
|
bar.update(splitDocs.length);
|
|
@@ -211,7 +204,8 @@ export function createLiteratureCommand() {
|
|
|
211
204
|
lit
|
|
212
205
|
.command("list <knowledge-base-id>")
|
|
213
206
|
.description("List literatures in a knowledge base")
|
|
214
|
-
.
|
|
207
|
+
.option("--json", "Output as JSON")
|
|
208
|
+
.action((kbId, options) => {
|
|
215
209
|
const resolved = resolveKnowledgeBase(kbId);
|
|
216
210
|
if (!resolved) {
|
|
217
211
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -220,7 +214,16 @@ export function createLiteratureCommand() {
|
|
|
220
214
|
const litOps = getLitOps(resolved.scope);
|
|
221
215
|
const literatures = litOps.listLiteratures(kbId);
|
|
222
216
|
if (literatures.length === 0) {
|
|
223
|
-
|
|
217
|
+
if (options.json) {
|
|
218
|
+
log.plain("[]");
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
log.info("No literatures found.");
|
|
222
|
+
}
|
|
223
|
+
return;
|
|
224
|
+
}
|
|
225
|
+
if (options.json) {
|
|
226
|
+
log.plain(JSON.stringify(literatures, null, 2));
|
|
224
227
|
return;
|
|
225
228
|
}
|
|
226
229
|
for (const l of literatures) {
|
|
@@ -233,11 +236,68 @@ export function createLiteratureCommand() {
|
|
|
233
236
|
}
|
|
234
237
|
log.count(literatures.length, literatures.length === 1 ? "literature" : "literatures");
|
|
235
238
|
});
|
|
239
|
+
// ─── lit search ────────────────────────────────────────────
|
|
240
|
+
lit
|
|
241
|
+
.command("search <knowledge-base-id>")
|
|
242
|
+
.description("Search literatures in a knowledge base by metadata")
|
|
243
|
+
.option("-t, --title <title>", "Title substring")
|
|
244
|
+
.option("-a, --author <author>", "Author substring")
|
|
245
|
+
.option("-k, --keyword <keyword>", "Keyword substring")
|
|
246
|
+
.option("--doi <doi>", "DOI substring")
|
|
247
|
+
.option("--json", "Output as JSON")
|
|
248
|
+
.action((kbId, options) => {
|
|
249
|
+
const resolved = resolveKnowledgeBase(kbId);
|
|
250
|
+
if (!resolved) {
|
|
251
|
+
log.error(`Knowledge base not found: ${kbId}`);
|
|
252
|
+
process.exit(1);
|
|
253
|
+
}
|
|
254
|
+
if (options.title === undefined &&
|
|
255
|
+
options.author === undefined &&
|
|
256
|
+
options.keyword === undefined &&
|
|
257
|
+
options.doi === undefined) {
|
|
258
|
+
log.error("At least one filter (--title, --author, --keyword, --doi) is required.");
|
|
259
|
+
process.exit(1);
|
|
260
|
+
}
|
|
261
|
+
const litOps = getLitOps(resolved.scope);
|
|
262
|
+
const results = litOps.searchLiteratures(kbId, {
|
|
263
|
+
title: options.title,
|
|
264
|
+
author: options.author,
|
|
265
|
+
keyword: options.keyword,
|
|
266
|
+
doi: options.doi,
|
|
267
|
+
});
|
|
268
|
+
if (results.length === 0) {
|
|
269
|
+
if (options.json) {
|
|
270
|
+
log.plain("[]");
|
|
271
|
+
}
|
|
272
|
+
else {
|
|
273
|
+
log.info("No literatures found.");
|
|
274
|
+
}
|
|
275
|
+
return;
|
|
276
|
+
}
|
|
277
|
+
if (options.json) {
|
|
278
|
+
log.plain(JSON.stringify(results, null, 2));
|
|
279
|
+
return;
|
|
280
|
+
}
|
|
281
|
+
for (const l of results) {
|
|
282
|
+
log.header(l.id);
|
|
283
|
+
log.label("Title:", l.title);
|
|
284
|
+
if (l.author)
|
|
285
|
+
log.label("Author:", l.author);
|
|
286
|
+
if (l.doi)
|
|
287
|
+
log.label("DOI:", l.doi);
|
|
288
|
+
if (l.keywords.length > 0)
|
|
289
|
+
log.label("Keywords:", l.keywords.join(", "));
|
|
290
|
+
log.label("Created:", l.createdAt.toISOString());
|
|
291
|
+
log.newline();
|
|
292
|
+
}
|
|
293
|
+
log.count(results.length, results.length === 1 ? "literature" : "literatures");
|
|
294
|
+
});
|
|
236
295
|
// ─── lit show ──────────────────────────────────────────────
|
|
237
296
|
lit
|
|
238
297
|
.command("show <knowledge-base-id> <id>")
|
|
239
298
|
.description("Show literature details")
|
|
240
|
-
.
|
|
299
|
+
.option("--json", "Output as JSON")
|
|
300
|
+
.action((kbId, id, options) => {
|
|
241
301
|
const resolved = resolveKnowledgeBase(kbId);
|
|
242
302
|
if (!resolved) {
|
|
243
303
|
log.error(`Knowledge base not found: ${kbId}`);
|
|
@@ -249,6 +309,10 @@ export function createLiteratureCommand() {
|
|
|
249
309
|
log.error(`Literature not found: ${id}`);
|
|
250
310
|
process.exit(1);
|
|
251
311
|
}
|
|
312
|
+
if (options.json) {
|
|
313
|
+
log.plain(JSON.stringify(literature, null, 2));
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
252
316
|
printLiterature(literature);
|
|
253
317
|
});
|
|
254
318
|
// ─── lit note ──────────────────────────────────────────────
|
|
@@ -256,12 +320,17 @@ export function createLiteratureCommand() {
|
|
|
256
320
|
note
|
|
257
321
|
.command("list <literature-id>")
|
|
258
322
|
.description("List all notes for a literature")
|
|
259
|
-
.
|
|
323
|
+
.option("--json", "Output as JSON")
|
|
324
|
+
.action((litId, options) => {
|
|
260
325
|
const literature = findLiterature(litId);
|
|
261
326
|
if (!literature) {
|
|
262
327
|
log.error(`Literature not found: ${litId}`);
|
|
263
328
|
process.exit(1);
|
|
264
329
|
}
|
|
330
|
+
if (options.json) {
|
|
331
|
+
log.plain(JSON.stringify(literature.notes, null, 2));
|
|
332
|
+
return;
|
|
333
|
+
}
|
|
265
334
|
const entries = Object.entries(literature.notes);
|
|
266
335
|
if (entries.length === 0) {
|
|
267
336
|
log.info("No notes found.");
|
|
@@ -346,4 +415,3 @@ function printLiterature(lit) {
|
|
|
346
415
|
}
|
|
347
416
|
}
|
|
348
417
|
}
|
|
349
|
-
//# sourceMappingURL=literature.js.map
|
package/dist/commands/util.js
CHANGED
package/dist/config/index.js
CHANGED
package/dist/config/init.js
CHANGED
package/dist/db/index.js
CHANGED
|
@@ -3,7 +3,7 @@ import * as path from "node:path";
|
|
|
3
3
|
import Database from "better-sqlite3";
|
|
4
4
|
import { drizzle } from "drizzle-orm/better-sqlite3";
|
|
5
5
|
import { getProjectDataDir, getUserDataDir } from "../config/index.js";
|
|
6
|
-
import { CREATE_KNOWLEDGE_BASES_TABLE, CREATE_LITERATURES_TABLE } from "./schema.js";
|
|
6
|
+
import { CREATE_KNOWLEDGE_BASES_TABLE, CREATE_LITERATURES_KB_INDEX, CREATE_LITERATURES_TABLE, } from "./schema.js";
|
|
7
7
|
// ─── Database Connection ────────────────────────────────────
|
|
8
8
|
export function openDatabase(dbPath) {
|
|
9
9
|
const dir = path.dirname(dbPath);
|
|
@@ -16,6 +16,7 @@ export function openDatabase(dbPath) {
|
|
|
16
16
|
export function initializeDatabase(db) {
|
|
17
17
|
db.exec(CREATE_KNOWLEDGE_BASES_TABLE);
|
|
18
18
|
db.exec(CREATE_LITERATURES_TABLE);
|
|
19
|
+
db.exec(CREATE_LITERATURES_KB_INDEX);
|
|
19
20
|
migrateDatabase(db);
|
|
20
21
|
}
|
|
21
22
|
// ─── Migrations ─────────────────────────────────────────────
|
|
@@ -27,6 +28,10 @@ const MIGRATIONS = [
|
|
|
27
28
|
db.exec("ALTER TABLE literatures ADD COLUMN doi TEXT");
|
|
28
29
|
}
|
|
29
30
|
},
|
|
31
|
+
// v1 → v2: add index on literatures.knowledge_base_id for faster KB-scoped lookups
|
|
32
|
+
(db) => {
|
|
33
|
+
db.exec(CREATE_LITERATURES_KB_INDEX);
|
|
34
|
+
},
|
|
30
35
|
];
|
|
31
36
|
function migrateDatabase(db) {
|
|
32
37
|
const currentVersion = db.pragma("user_version", { simple: true }) ?? 0;
|
|
@@ -60,4 +65,3 @@ export function getProjectDb() {
|
|
|
60
65
|
}
|
|
61
66
|
return projectDb;
|
|
62
67
|
}
|
|
63
|
-
//# sourceMappingURL=index.js.map
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import * as crypto from "node:crypto";
|
|
2
|
-
import { desc, eq } from "drizzle-orm";
|
|
2
|
+
import { and, desc, eq, like, sql } from "drizzle-orm";
|
|
3
3
|
import { literatures } from "../schema.js";
|
|
4
4
|
export function createLiterature(db, input) {
|
|
5
5
|
const id = crypto.randomUUID();
|
|
@@ -39,6 +39,24 @@ export function listLiteratures(db, knowledgeBaseId) {
|
|
|
39
39
|
.orderBy(desc(literatures.createdAt))
|
|
40
40
|
.all();
|
|
41
41
|
}
|
|
42
|
+
/**
 * Search the literatures of one knowledge base by metadata substrings.
 *
 * Every supplied filter must match (conditions are AND-ed). Filters that are
 * undefined or empty strings are ignored. Results are ordered newest first.
 *
 * @param db - Drizzle database handle to query.
 * @param knowledgeBaseId - ID of the knowledge base whose literatures are searched.
 * @param filters - Optional `title`, `author`, `doi` and `keyword` substrings.
 * @returns Matching literature rows, ordered by `createdAt` descending.
 */
export function searchLiteratures(db, knowledgeBaseId, filters) {
    // `%` and `_` are LIKE wildcards. Escape them (and the escape character
    // itself) so that user input such as "10.1000/abc_def" or "100%" is
    // matched literally instead of acting as a pattern.
    const containsLiteral = (column, needle) => {
        const escaped = needle.replace(/[\\%_]/g, "\\$&");
        return sql`${column} LIKE ${`%${escaped}%`} ESCAPE '\\'`;
    };
    const conditions = [eq(literatures.knowledgeBaseId, knowledgeBaseId)];
    if (filters.title) {
        conditions.push(containsLiteral(literatures.title, filters.title));
    }
    if (filters.author) {
        conditions.push(containsLiteral(literatures.author, filters.author));
    }
    if (filters.doi) {
        conditions.push(containsLiteral(literatures.doi, filters.doi));
    }
    // keywords stored as JSON text; LIKE over raw text matches substrings
    if (filters.keyword) {
        conditions.push(containsLiteral(literatures.keywords, filters.keyword));
    }
    return db
        .select()
        .from(literatures)
        .where(and(...conditions))
        .orderBy(desc(literatures.createdAt))
        .all();
}
|
|
42
60
|
export function updateLiterature(db, id, input) {
|
|
43
61
|
const updates = {};
|
|
44
62
|
if (input.title !== undefined)
|
|
@@ -81,4 +99,3 @@ export function deleteLiteraturesByKnowledgeBaseId(db, knowledgeBaseId) {
|
|
|
81
99
|
export function getLiteraturesByKnowledgeBaseId(db, knowledgeBaseId) {
|
|
82
100
|
return listLiteratures(db, knowledgeBaseId);
|
|
83
101
|
}
|
|
84
|
-
//# sourceMappingURL=literatures.js.map
|
|
@@ -18,7 +18,9 @@ export function deleteLiterature(id) {
|
|
|
18
18
|
export function deleteLiteraturesByKnowledgeBaseId(knowledgeBaseId) {
|
|
19
19
|
return ops.deleteLiteraturesByKnowledgeBaseId(getProjectDb(), knowledgeBaseId);
|
|
20
20
|
}
|
|
21
|
+
/**
 * Project-scope variant of the literature metadata search: forwards the
 * knowledge base ID and filters to the shared operation, using the handle
 * returned by getProjectDb().
 */
export function searchLiteratures(knowledgeBaseId, filters) {
    const db = getProjectDb();
    return ops.searchLiteratures(db, knowledgeBaseId, filters);
}
|
|
21
24
|
export function getLiteraturesByKnowledgeBaseId(knowledgeBaseId) {
|
|
22
25
|
return ops.getLiteraturesByKnowledgeBaseId(getProjectDb(), knowledgeBaseId);
|
|
23
26
|
}
|
|
24
|
-
//# sourceMappingURL=literatures.js.map
|
package/dist/db/schema.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
|
1
|
+
import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
|
2
2
|
// ─── Drizzle Table Definitions ──────────────────────────────
|
|
3
3
|
export const knowledgeBases = sqliteTable("knowledge_bases", {
|
|
4
4
|
id: text("id").primaryKey(),
|
|
@@ -24,7 +24,7 @@ export const literatures = sqliteTable("literatures", {
|
|
|
24
24
|
}),
|
|
25
25
|
createdAt: integer("created_at", { mode: "timestamp_ms" }).notNull(),
|
|
26
26
|
updatedAt: integer("updated_at", { mode: "timestamp_ms" }).notNull(),
|
|
27
|
-
});
|
|
27
|
+
}, (t) => [index("idx_literatures_knowledge_base_id").on(t.knowledgeBaseId)]);
|
|
28
28
|
// ─── Bootstrap SQL ──────────────────────────────────────────
|
|
29
29
|
export const CREATE_KNOWLEDGE_BASES_TABLE = `
|
|
30
30
|
CREATE TABLE IF NOT EXISTS knowledge_bases (
|
|
@@ -52,4 +52,6 @@ CREATE TABLE IF NOT EXISTS literatures (
|
|
|
52
52
|
updated_at INTEGER NOT NULL,
|
|
53
53
|
FOREIGN KEY (knowledge_base_id) REFERENCES knowledge_bases(id) ON DELETE SET NULL
|
|
54
54
|
)`;
|
|
55
|
-
|
|
55
|
+
export const CREATE_LITERATURES_KB_INDEX = `
|
|
56
|
+
CREATE INDEX IF NOT EXISTS idx_literatures_knowledge_base_id
|
|
57
|
+
ON literatures(knowledge_base_id)`;
|
package/dist/db/test-utils.js
CHANGED
|
@@ -18,7 +18,9 @@ export function deleteLiterature(id) {
|
|
|
18
18
|
export function deleteLiteraturesByKnowledgeBaseId(knowledgeBaseId) {
|
|
19
19
|
return ops.deleteLiteraturesByKnowledgeBaseId(getUserDb(), knowledgeBaseId);
|
|
20
20
|
}
|
|
21
|
+
/**
 * User-scope variant of the literature metadata search: forwards the
 * knowledge base ID and filters to the shared operation, using the handle
 * returned by getUserDb().
 */
export function searchLiteratures(knowledgeBaseId, filters) {
    const db = getUserDb();
    return ops.searchLiteratures(db, knowledgeBaseId, filters);
}
|
|
21
24
|
export function getLiteraturesByKnowledgeBaseId(knowledgeBaseId) {
|
|
22
25
|
return ops.getLiteraturesByKnowledgeBaseId(getUserDb(), knowledgeBaseId);
|
|
23
26
|
}
|
|
24
|
-
//# sourceMappingURL=literatures.js.map
|
package/dist/extractor/index.js
CHANGED
package/dist/extractor/pdf.js
CHANGED
|
@@ -1,44 +1,43 @@
|
|
|
1
1
|
import { readFile } from "node:fs/promises";
|
|
2
|
-
import {
|
|
3
|
-
import { PDFParse } from "pdf-parse";
|
|
2
|
+
import { extractText, getMeta } from "unpdf";
|
|
4
3
|
export async function extractPdfContent(pdfPath) {
|
|
5
|
-
const data = await readFile(pdfPath);
|
|
6
|
-
const
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
return result.pages.map((page) => new Document({
|
|
10
|
-
pageContent: page.text,
|
|
4
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
5
|
+
const result = await extractText(data, { mergePages: false });
|
|
6
|
+
return result.text.map((pageText, i) => ({
|
|
7
|
+
pageContent: pageText,
|
|
11
8
|
metadata: {
|
|
12
9
|
source: pdfPath,
|
|
13
|
-
pdf: { totalPages: result.
|
|
14
|
-
loc: { pageNumber:
|
|
10
|
+
pdf: { totalPages: result.totalPages },
|
|
11
|
+
loc: { pageNumber: i + 1 },
|
|
15
12
|
},
|
|
16
13
|
}));
|
|
17
14
|
}
|
|
18
15
|
export async function extractPdfMetadata(pdfPath) {
|
|
19
|
-
const data = await readFile(pdfPath);
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
const
|
|
29
|
-
const creationDate = parsePdfDate(nonEmptyStringOrNull(info?.["CreationDate"]));
|
|
30
|
-
const modDate = parsePdfDate(nonEmptyStringOrNull(info?.["ModDate"]));
|
|
31
|
-
const rawKeywords = nonEmptyStringOrNull(info?.["Keywords"]);
|
|
16
|
+
const data = new Uint8Array(await readFile(pdfPath));
|
|
17
|
+
const { info } = await getMeta(data);
|
|
18
|
+
const custom = getRecord(info["Custom"]);
|
|
19
|
+
const title = nonEmptyStringOrNull(info["Title"]);
|
|
20
|
+
const author = nonEmptyStringOrNull(info["Author"]);
|
|
21
|
+
const subject = nonEmptyStringOrNull(info["Subject"]);
|
|
22
|
+
const creator = nonEmptyStringOrNull(info["Creator"]);
|
|
23
|
+
const creationDate = parsePdfDate(nonEmptyStringOrNull(info["CreationDate"]));
|
|
24
|
+
const modDate = parsePdfDate(nonEmptyStringOrNull(info["ModDate"]));
|
|
25
|
+
const rawKeywords = nonEmptyStringOrNull(info["Keywords"]);
|
|
32
26
|
const keywords = rawKeywords
|
|
33
27
|
? rawKeywords
|
|
34
28
|
.split(/[,;]/)
|
|
35
29
|
.map((k) => k.trim())
|
|
36
30
|
.filter(Boolean)
|
|
37
31
|
: [];
|
|
38
|
-
// DOI can appear in Custom fields (case-insensitive lookup)
|
|
39
32
|
const doi = findCustomField(custom, "doi");
|
|
40
33
|
return { title, author, subject, keywords, doi, creator, creationDate, modDate };
|
|
41
34
|
}
|
|
35
|
+
/**
 * Narrow an arbitrary value to an object record.
 *
 * @param value - Any value.
 * @returns The same value when it is a non-null, non-array object;
 *          otherwise `undefined`.
 */
function getRecord(value) {
    const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
    return isRecord ? value : undefined;
}
|
|
42
41
|
function nonEmptyStringOrNull(value) {
|
|
43
42
|
if (typeof value === "string" && value.trim().length > 0) {
|
|
44
43
|
return value.trim();
|
|
@@ -53,7 +52,6 @@ function parsePdfDate(value) {
|
|
|
53
52
|
if (!value)
|
|
54
53
|
return null;
|
|
55
54
|
const cleaned = value.replace(/^D:/, "");
|
|
56
|
-
// Extract components: YYYY[MM[DD[HH[mm[SS]]]]]
|
|
57
55
|
const match = /^(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/.exec(cleaned);
|
|
58
56
|
if (!match)
|
|
59
57
|
return null;
|
|
@@ -66,9 +64,6 @@ function parsePdfDate(value) {
|
|
|
66
64
|
const date = new Date(`${year}-${month}-${day}T${hour}:${min}:${sec}Z`);
|
|
67
65
|
return Number.isNaN(date.getTime()) ? null : date;
|
|
68
66
|
}
|
|
69
|
-
/**
|
|
70
|
-
* Case-insensitive lookup in the Custom fields dictionary.
|
|
71
|
-
*/
|
|
72
67
|
function findCustomField(custom, key) {
|
|
73
68
|
if (!custom)
|
|
74
69
|
return null;
|
|
@@ -80,4 +75,3 @@ function findCustomField(custom, key) {
|
|
|
80
75
|
}
|
|
81
76
|
return null;
|
|
82
77
|
}
|
|
83
|
-
//# sourceMappingURL=pdf.js.map
|
package/dist/extractor/text.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { readFile, stat } from "node:fs/promises";
|
|
2
|
-
import { Document } from "@langchain/core/documents";
|
|
3
2
|
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10 MB
|
|
4
3
|
export async function extractTextContent(filePath) {
|
|
5
4
|
const fileStats = await stat(filePath);
|
|
@@ -8,10 +7,9 @@ export async function extractTextContent(filePath) {
|
|
|
8
7
|
}
|
|
9
8
|
const content = await readFile(filePath, "utf-8");
|
|
10
9
|
return [
|
|
11
|
-
|
|
10
|
+
{
|
|
12
11
|
pageContent: content,
|
|
13
12
|
metadata: { source: filePath },
|
|
14
|
-
}
|
|
13
|
+
},
|
|
15
14
|
];
|
|
16
15
|
}
|
|
17
|
-
//# sourceMappingURL=text.js.map
|
package/dist/index.js
CHANGED
package/dist/lifecycle.js
CHANGED
package/dist/logger.js
CHANGED
package/dist/migrations.js
CHANGED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
// Separators tried in order, from coarsest (paragraph break) to finest
// (individual code points, represented by the empty string).
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];

/**
 * Split documents into chunks for embedding.
 *
 * Each chunk gets a shallow copy of its source document's metadata (nested
 * objects are still shared between chunks). Chunks that contain only
 * whitespace are dropped, so blank pages do not produce empty embeddings.
 *
 * @param docs - Iterable of `{ pageContent, metadata }` documents.
 * @param options - `chunkSize` (positive integer, measured in UTF-16 code
 *   units) and `chunkOverlap` (non-negative integer smaller than `chunkSize`;
 *   treated as 0 when omitted).
 * @returns A flat array of `{ pageContent, metadata }` chunks in document order.
 * @throws {Error} When `chunkOverlap >= chunkSize`, or when either value is
 *   not a valid integer in the ranges above.
 */
export function splitDocuments(docs, options) {
    const chunkSize = options.chunkSize;
    const chunkOverlap = options.chunkOverlap ?? 0;
    if (chunkOverlap >= chunkSize) {
        throw new Error("chunkOverlap must be less than chunkSize");
    }
    // NaN / undefined / non-positive sizes would otherwise silently disable
    // splitting (every comparison against NaN is false).
    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
        throw new Error("chunkSize must be a positive integer");
    }
    if (!Number.isInteger(chunkOverlap) || chunkOverlap < 0) {
        throw new Error("chunkOverlap must be a non-negative integer");
    }
    const result = [];
    for (const doc of docs) {
        for (const chunk of splitText(doc.pageContent, chunkSize, chunkOverlap)) {
            // Skip chunks with no visible content (e.g. an empty PDF page).
            if (chunk.trim().length === 0) {
                continue;
            }
            result.push({ pageContent: chunk, metadata: { ...doc.metadata } });
        }
    }
    return result;
}

/**
 * Split a single string using the default separator hierarchy.
 */
function splitText(text, chunkSize, chunkOverlap) {
    return recursiveSplit(text, DEFAULT_SEPARATORS, chunkSize, chunkOverlap);
}

/**
 * Return the trailing `chunkOverlap` code units of `chunk`, shortened by one
 * unit when the cut would otherwise land in the middle of a surrogate pair
 * (which would leave a lone low surrogate at the start of the next chunk).
 * Callers guarantee `0 < chunkOverlap < chunk.length`.
 */
function overlapTail(chunk, chunkOverlap) {
    let start = chunk.length - chunkOverlap;
    const first = chunk.charCodeAt(start);
    const previous = chunk.charCodeAt(start - 1);
    const startsOnLowSurrogate = first >= 0xdc00 && first <= 0xdfff;
    const followsHighSurrogate = previous >= 0xd800 && previous <= 0xdbff;
    if (startsOnLowSurrogate && followsHighSurrogate) {
        start += 1;
    }
    return chunk.slice(start);
}

/**
 * Greedily pack `text` into chunks by splitting on the first separator, then
 * re-split any chunk that is still too large using the remaining separators.
 *
 * @param text - Text to split.
 * @param separators - Separators to try, coarsest first; "" splits into code points.
 * @param chunkSize - Target maximum chunk length in UTF-16 code units.
 * @param chunkOverlap - Number of trailing code units carried into the next chunk.
 * @returns The chunks, in order.
 */
function recursiveSplit(text, separators, chunkSize, chunkOverlap) {
    if (text.length <= chunkSize) {
        return [text];
    }
    const separator = separators[0] ?? "";
    const remaining = separators.slice(1);
    // Spreading iterates by code point, so surrogate pairs stay together.
    const parts = separator === "" ? [...text] : text.split(separator);
    const chunks = [];
    let current = "";
    for (const part of parts) {
        const piece = current.length === 0 ? part : current + separator + part;
        if (piece.length > chunkSize && current.length > 0) {
            chunks.push(current);
            // Overlap: keep the tail of the chunk just emitted.
            const tail = chunkOverlap > 0 && current.length > chunkOverlap
                ? overlapTail(current, chunkOverlap)
                : "";
            current = tail.length > 0 ? tail + separator + part : part;
        }
        else {
            current = piece;
        }
    }
    if (current.length > 0) {
        chunks.push(current);
    }
    // Recursively split any chunks that are still too large
    const finalChunks = [];
    for (const chunk of chunks) {
        if (chunk.length > chunkSize && remaining.length > 0) {
            finalChunks.push(...recursiveSplit(chunk, remaining, chunkSize, chunkOverlap));
        }
        else {
            finalChunks.push(chunk);
        }
    }
    return finalChunks;
}
|
package/dist/types/index.js
CHANGED