papergraph 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +229 -0
- package/dist/index.js +2695 -0
- package/package.json +63 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2695 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
3
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
4
|
+
}) : x)(function(x) {
|
|
5
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
6
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
// src/cli/index.ts
|
|
10
|
+
import { Command } from "commander";
|
|
11
|
+
|
|
12
|
+
// src/utils/config.ts
|
|
13
|
+
import { cosmiconfig } from "cosmiconfig";
|
|
14
|
+
|
|
15
|
+
// src/types/config.ts
|
|
16
|
+
/**
 * Baseline configuration. resolveConfig() layers the config file, environment
 * and CLI flags on top of these values.
 */
var DEFAULT_CONFIG = {
  // Top-level scalar options.
  source: "openalex",
  spine: "citation",
  depth: 2,
  maxPapers: 150,
  maxRefsPerPaper: 40,
  maxCitesPerPaper: 40,
  noCache: false,
  resume: false,
  logLevel: "info",
  jsonLogs: false,

  // Nested sections; each is merged key-by-key in resolveConfig().
  similarity: { enabled: true, topK: 10, threshold: 0.25 },
  clustering: { enabled: true, method: "louvain" },
  ranking: { pagerankWeight: 0.5, relevanceWeight: 0.3, recencyWeight: 0.2 },
  llm: {
    enabled: false,
    provider: "openai",
    model: "gpt-4.1-mini",
    tasks: ["edges", "clusters"],
    maxAnnotatedPapers: 120,
    maxAnnotatedEdges: 400,
    concurrency: 3
  }
};
|
|
51
|
+
|
|
52
|
+
// src/utils/logger.ts
|
|
53
|
+
import pino from "pino";
|
|
54
|
+
// The logger most recently created by initLogger(); null until first use.
var loggerInstance = null;

/**
 * Create a pino logger, store it as the shared instance and return it.
 *
 * @param {{ level?: string, jsonLogs?: boolean }} [options]
 *   `level` defaults to "info". When `jsonLogs` is true the logger is created
 *   with only the level; otherwise it is routed through the "pino-pretty"
 *   transport (colorized, HH:MM:ss timestamps, pid/hostname hidden).
 * @returns The newly created logger.
 */
function initLogger(options = {}) {
  // `?? {}` also tolerates an explicit null, which a default parameter alone
  // would let through to the destructuring and crash.
  const { level = "info", jsonLogs = false } = options ?? {};
  if (jsonLogs) {
    loggerInstance = pino({ level });
  } else {
    loggerInstance = pino({
      level,
      transport: {
        target: "pino-pretty",
        options: {
          colorize: true,
          translateTime: "HH:MM:ss",
          ignore: "pid,hostname"
        }
      }
    });
  }
  return loggerInstance;
}
|
|
74
|
+
// Single delegating facade handed out by getLogger(); created on first call.
var loggerProxy = null;

/**
 * Return the shared logger.
 *
 * Several modules in this file capture the result at load time
 * (`var logger = getLogger()`), while initLogger() replaces `loggerInstance`
 * whenever it is called. Returning the instance itself would leave those
 * early captures bound to the default logger, so this returns a stable proxy
 * that looks up the current `loggerInstance` on every property access.
 * NOTE(review): assumes the CLI entry point calls initLogger() with the
 * resolved logLevel/jsonLogs after module load — confirm against the caller.
 *
 * The underlying logger is created lazily (level "info") the first time a
 * property is actually used and no logger has been initialised yet.
 *
 * @returns A logger-shaped proxy; methods are bound to the live instance.
 */
function getLogger() {
  if (!loggerProxy) {
    const current = () => {
      if (!loggerInstance) {
        loggerInstance = initLogger({ level: "info" });
      }
      return loggerInstance;
    };
    loggerProxy = new Proxy({}, {
      get(_target, prop) {
        const instance = current();
        const value = instance[prop];
        // Bind so methods keep the real logger as `this`, not the proxy.
        return typeof value === "function" ? value.bind(instance) : value;
      },
      set(_target, prop, value) {
        current()[prop] = value;
        return true;
      },
      has(_target, prop) {
        return prop in current();
      }
    });
  }
  return loggerProxy;
}
|
|
80
|
+
|
|
81
|
+
// src/utils/config.ts
|
|
82
|
+
/**
 * Search for `papergraph.config.json` via cosmiconfig and return its contents.
 *
 * @returns {Promise<object|null>} The parsed config, or null when no file is
 *   found, the file is empty, or loading fails (failure is logged, not thrown).
 */
async function loadConfigFile() {
  const explorer = cosmiconfig("papergraph", {
    searchPlaces: ["papergraph.config.json"]
  });
  try {
    const result = await explorer.search();
    if (result && !result.isEmpty) {
      getLogger().debug({ path: result.filepath }, "Loaded config file");
      return result.config;
    }
  } catch (error) {
    // Log under `err`: that is the key pino's standard error serializer is
    // attached to. Under any other key an Error is written as `{}` because
    // its message/stack are not enumerable.
    getLogger().warn({ err: error }, "Failed to load config file, using defaults");
  }
  return null;
}
|
|
97
|
+
/**
 * Inspect the environment for LLM credentials.
 *
 * Emits a debug line when OPENAI_API_KEY is set and LLM_DISABLE is not.
 * No configuration is derived from the environment here: the returned
 * object is always empty.
 *
 * @returns {object} An empty config fragment.
 */
function loadEnvVars() {
  const { OPENAI_API_KEY: apiKey, LLM_DISABLE: llmDisabled } = process.env;
  const keyIsUsable = Boolean(apiKey) && !llmDisabled;
  if (keyIsUsable) {
    getLogger().debug("OPENAI_API_KEY detected in environment");
  }
  return {};
}
|
|
104
|
+
/**
 * Build the effective configuration.
 *
 * Precedence, lowest to highest: DEFAULT_CONFIG, the built-in `out` path,
 * the config file, environment-derived values, CLI flags. The `similarity`,
 * `clustering`, `ranking` and `llm` sections are merged key-by-key from
 * defaults, file and CLI flags, so a partial section does not drop the
 * remaining defaults.
 *
 * @param {object} [cliFlags] Options parsed from the command line. Keys whose
 *   value is `undefined` are treated as "not provided" and never override a
 *   lower-precedence value.
 * @returns {Promise<object>} The merged configuration.
 */
async function resolveConfig(cliFlags = {}) {
  // Keep only keys that were actually set; a plain spread would let an
  // explicit `undefined` clobber a default or a file value.
  const definedOnly = (section) => {
    if (!section || typeof section !== "object") return {};
    return Object.fromEntries(
      Object.entries(section).filter(([, value]) => value !== void 0)
    );
  };

  const fileConfig = await loadConfigFile();
  const envConfig = loadEnvVars();
  const flags = definedOnly(cliFlags);

  const mergeSection = (key) => ({
    ...DEFAULT_CONFIG[key],
    ...fileConfig?.[key],
    ...definedOnly(flags[key])
  });

  return {
    ...DEFAULT_CONFIG,
    out: "./papergraph.db",
    ...fileConfig,
    ...envConfig,
    ...flags,
    // Nested sections are listed last so the shallow spreads above cannot
    // replace them wholesale.
    similarity: mergeSection("similarity"),
    clustering: mergeSection("clustering"),
    ranking: mergeSection("ranking"),
    llm: mergeSection("llm")
  };
}
|
|
137
|
+
|
|
138
|
+
// src/utils/http-client.ts
|
|
139
|
+
// Logger used by the HTTP client below.
var logger = getLogger();

// HTTP response statuses that HttpClient.request() retries.
var RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([
  429,
  500,
  502,
  503,
  504
]);

// Network-level error codes that HttpClient.request() retries.
var RETRYABLE_ERROR_CODES = /* @__PURE__ */ new Set([
  "ECONNRESET",
  "ECONNREFUSED",
  "ETIMEDOUT",
  "UND_ERR_SOCKET",
  "UND_ERR_CONNECT_TIMEOUT"
]);
|
|
142
|
+
/**
 * Token-bucket rate limiter.
 *
 * The bucket starts full (`maxTokens`) and refills continuously at
 * `tokensPerSecond`, capped at `maxTokens`.
 */
var TokenBucket = class {
  tokens;
  lastRefill;
  /**
   * @param {number} tokensPerSecond Refill rate.
   * @param {number} maxTokens Bucket capacity (also the initial balance).
   */
  constructor(tokensPerSecond, maxTokens) {
    this.tokensPerSecond = tokensPerSecond;
    this.maxTokens = maxTokens;
    this.tokens = maxTokens;
    this.lastRefill = Date.now();
  }
  /**
   * Take one token, waiting if none is available.
   *
   * The token is reserved synchronously, before any await, and the balance is
   * allowed to go negative. Each waiter therefore sleeps for its own deficit.
   * Computing the wait from an undecremented balance would give every
   * concurrent caller the same delay and release them all at once, which
   * exceeds the configured rate.
   *
   * @returns {Promise<void>} Resolves once the caller may proceed.
   */
  async acquire() {
    this.refill();
    this.tokens -= 1;
    if (this.tokens >= 0) {
      return;
    }
    const waitMs = -this.tokens / this.tokensPerSecond * 1e3;
    await sleep(waitMs);
  }
  /**
   * Credit tokens for the time elapsed since the previous refill.
   */
  refill() {
    const now = Date.now();
    const elapsed = (now - this.lastRefill) / 1e3;
    this.tokens = Math.min(this.maxTokens, this.tokens + elapsed * this.tokensPerSecond);
    this.lastRefill = now;
  }
};
|
|
169
|
+
/**
 * Per-source token-bucket settings used by HttpClient.getBucket().
 * Unknown sources fall back to `default`.
 */
var RATE_LIMITS = {
  // 10/s with polite pool
  openalex: {
    tokensPerSecond: 10,
    maxBurst: 10
  },
  // 1/s without API key, 10/s with
  s2: {
    tokensPerSecond: 1,
    maxBurst: 1
  },
  openai: {
    tokensPerSecond: 5,
    maxBurst: 5
  },
  // Local — effectively unlimited
  ollama: {
    tokensPerSecond: 100,
    maxBurst: 100
  },
  default: {
    tokensPerSecond: 5,
    maxBurst: 5
  }
};
|
|
179
|
+
/**
 * Error raised by HttpClient for failed requests.
 *
 * Carries the HTTP status (0 is used by the client for failures without a
 * response), whether the failure is of a retryable kind, and the response
 * payload when one was received.
 */
var HttpError = class extends Error {
  /**
   * @param {string} message Human-readable description.
   * @param {number} status HTTP status code.
   * @param {boolean} retryable Whether the failure is of a retryable kind.
   * @param {*} [response] Response payload, if any.
   */
  constructor(message, status, retryable, response) {
    super(message);
    Object.assign(this, { status, retryable, response, name: "HttpError" });
  }
};
|
|
188
|
+
/**
 * fetch-based HTTP client with per-source rate limiting, retry with backoff,
 * a per-attempt timeout and per-source request counting.
 */
var HttpClient = class {
  buckets = /* @__PURE__ */ new Map();
  requestCounts = /* @__PURE__ */ new Map();
  defaultTimeout;
  userAgent;
  /**
   * @param {{ timeout?: number, version?: string, email?: string }} [options]
   *   `timeout` is the default per-attempt timeout in ms (30000 if omitted);
   *   `version` and `email` are embedded in the User-Agent header.
   */
  constructor(options) {
    this.defaultTimeout = options?.timeout ?? 3e4;
    const version = options?.version ?? "1.0.0";
    const email = options?.email ?? "papergraph@example.com";
    this.userAgent = `PaperGraph/${version} (mailto:${email})`;
  }
  /**
   * Make an HTTP request with rate limiting and retry.
   *
   * One rate-limit token is taken and the request counter is bumped once per
   * call (not per attempt). Up to 3 retries are made for retryable HTTP
   * statuses and retryable network error codes. Timeouts are reported as
   * retryable HttpErrors but are not retried here.
   *
   * @param {string} url Target URL.
   * @param {object} [options]
   * @param {string} [options.method="GET"]
   * @param {object} [options.headers] Extra headers; they override User-Agent.
   * @param {*} [options.body] Objects are JSON-encoded; other values are sent as-is.
   * @param {number} [options.timeout] Per-attempt timeout in ms.
   * @param {string} [options.source="default"] Rate-limit bucket / counter key.
   * @returns {Promise<{status: number, headers: object, data: *, ok: true}>}
   * @throws {HttpError} On non-OK responses, network failures and timeouts.
   */
  async request(url, options = {}) {
    const {
      method = "GET",
      headers = {},
      body,
      timeout = this.defaultTimeout,
      source = "default"
    } = options;
    const bucket = this.getBucket(source);
    await bucket.acquire();
    this.requestCounts.set(source, (this.requestCounts.get(source) ?? 0) + 1);
    const requestHeaders = {
      "User-Agent": this.userAgent,
      ...headers
    };
    let requestBody;
    if (body) {
      if (typeof body === "object") {
        requestBody = JSON.stringify(body);
        requestHeaders["Content-Type"] = requestHeaders["Content-Type"] ?? "application/json";
      } else {
        requestBody = body;
      }
    }
    const maxRetries = 3;
    const initialBackoff = 1e3;
    const maxBackoff = 3e4;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), timeout);
      try {
        const response = await fetch(url, {
          method,
          headers: requestHeaders,
          body: requestBody,
          signal: controller.signal
        });
        const contentType = response.headers.get("content-type") ?? "";
        // The body is read while the timer is still armed so that a stalled
        // download is aborted too, not only a stalled connection.
        const data = contentType.includes("application/json") ? await response.json() : await response.text();
        clearTimeout(timeoutId);
        const responseHeaders = {};
        response.headers.forEach((value, key) => {
          responseHeaders[key] = value;
        });
        if (!response.ok) {
          const retryable = RETRYABLE_STATUS_CODES.has(response.status);
          if (retryable && attempt < maxRetries) {
            const retryAfter = this.parseRetryAfter(response.headers.get("retry-after"));
            const backoff = retryAfter ?? this.calculateBackoff(attempt, initialBackoff, maxBackoff);
            logger.warn(
              { status: response.status, attempt: attempt + 1, backoffMs: backoff, url },
              `Retryable HTTP error, backing off`
            );
            await sleep(backoff);
            continue;
          }
          throw new HttpError(
            `HTTP ${response.status}: ${response.statusText}`,
            response.status,
            retryable,
            data
          );
        }
        return { status: response.status, headers: responseHeaders, data, ok: true };
      } catch (error) {
        if (error instanceof HttpError) throw error;
        // Node's fetch rejects with `TypeError: fetch failed` and puts the
        // system/undici code (ECONNRESET, UND_ERR_SOCKET, ...) on
        // `error.cause`, so look there when the error itself carries no
        // string code (an AbortError's `code` is numeric).
        const errorCode = typeof error?.code === "string" ? error.code : error?.cause?.code;
        const retryable = errorCode ? RETRYABLE_ERROR_CODES.has(errorCode) : false;
        if (retryable && attempt < maxRetries) {
          const backoff = this.calculateBackoff(attempt, initialBackoff, maxBackoff);
          logger.warn(
            { errorCode, attempt: attempt + 1, backoffMs: backoff, url },
            `Retryable network error, backing off`
          );
          await sleep(backoff);
          continue;
        }
        if (error instanceof Error && error.name === "AbortError") {
          throw Object.assign(
            new HttpError(`Request timeout after ${timeout}ms: ${url}`, 0, true),
            { cause: error }
          );
        }
        throw Object.assign(
          new HttpError(
            `Network error: ${error instanceof Error ? error.message : String(error)}`,
            0,
            retryable
          ),
          { cause: error }
        );
      } finally {
        // Always disarm the timer; when fetch rejects, the success-path
        // clearTimeout above is never reached and the pending timer would
        // keep the process alive until it fires.
        clearTimeout(timeoutId);
      }
    }
    throw new HttpError(`Max retries exceeded for ${url}`, 0, false);
  }
  /**
   * Convenience method for GET requests.
   */
  async get(url, options) {
    return this.request(url, { ...options, method: "GET" });
  }
  /**
   * Convenience method for POST requests.
   */
  async post(url, body, options) {
    return this.request(url, { ...options, method: "POST", body });
  }
  /**
   * Get request count for a source.
   */
  getRequestCount(source) {
    return this.requestCounts.get(source) ?? 0;
  }
  /**
   * Get all request counts.
   */
  getAllRequestCounts() {
    return Object.fromEntries(this.requestCounts.entries());
  }
  /**
   * Reset request counts.
   */
  resetCounts() {
    this.requestCounts.clear();
  }
  /**
   * Return the rate-limit bucket for `source`, creating it on first use from
   * RATE_LIMITS (falling back to the `default` entry).
   */
  getBucket(source) {
    if (!this.buckets.has(source)) {
      const config = RATE_LIMITS[source] ?? RATE_LIMITS["default"];
      this.buckets.set(source, new TokenBucket(config.tokensPerSecond, config.maxBurst));
    }
    return this.buckets.get(source);
  }
  /**
   * Convert a Retry-After header into a delay in milliseconds.
   *
   * @param {string|null|undefined} header Integer seconds or a date string.
   * @returns {number|null} Non-negative delay, or null when absent/unparseable.
   */
  parseRetryAfter(header) {
    if (!header) return null;
    const seconds = Number.parseInt(header, 10);
    // Clamp: a negative value must not become a negative sleep.
    if (!Number.isNaN(seconds)) return Math.max(0, seconds * 1e3);
    const date = new Date(header);
    if (!Number.isNaN(date.getTime())) {
      return Math.max(0, date.getTime() - Date.now());
    }
    return null;
  }
  /**
   * Exponential backoff (`initial * 2^attempt`) plus up to 50% random jitter,
   * capped at `max`.
   */
  calculateBackoff(attempt, initial, max) {
    const exponential = initial * Math.pow(2, attempt);
    const jitter = Math.random() * exponential * 0.5;
    return Math.min(max, exponential + jitter);
  }
};
|
|
349
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves when the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
352
|
+
// Shared HttpClient, created on the first getHttpClient() call.
var clientInstance = null;

/**
 * Return the shared HttpClient.
 *
 * `options` are only used when the client is first created; later calls
 * return the existing client and ignore them.
 *
 * @param {object} [options] Forwarded to the HttpClient constructor.
 * @returns The shared client.
 */
function getHttpClient(options) {
  clientInstance ??= new HttpClient(options);
  return clientInstance;
}
|
|
359
|
+
|
|
360
|
+
// src/storage/database.ts
|
|
361
|
+
import Database from "better-sqlite3";
|
|
362
|
+
// Logger used by the database layer below.
var logger2 = getLogger();

/**
 * Schema for user_version 1. Executed by PaperGraphDatabase.migrate() when
 * the database reports user_version < 1. Every statement uses IF NOT EXISTS,
 * so re-running it against a partially created schema is harmless.
 *
 * The unique index on entities(type, name) is what lets
 * `INSERT OR IGNORE INTO entities` skip an already-known entity; without it
 * every insert succeeds and entities are duplicated.
 * NOTE(review): databases already at user_version 1 do not re-run this
 * script and therefore do not receive the index — a follow-up migration
 * would be needed for them.
 */
var MIGRATION_V1 = `
-- Runs: build session metadata
CREATE TABLE IF NOT EXISTS runs (
  run_id INTEGER PRIMARY KEY,
  created_at TEXT NOT NULL DEFAULT (datetime('now')),
  papergraph_version TEXT NOT NULL,
  config_json TEXT NOT NULL,
  source TEXT NOT NULL,
  spine TEXT NOT NULL,
  depth INTEGER NOT NULL,
  stats_json TEXT NOT NULL DEFAULT '{}'
);

-- Papers: core paper nodes
CREATE TABLE IF NOT EXISTS papers (
  paper_id INTEGER PRIMARY KEY,
  source TEXT NOT NULL,
  source_id TEXT NOT NULL,
  doi TEXT,
  arxiv_id TEXT,
  title TEXT NOT NULL,
  abstract TEXT,
  year INTEGER,
  venue TEXT,
  url TEXT,
  citation_count INTEGER NOT NULL DEFAULT 0,
  influence_score REAL,
  keywords_json TEXT,
  concepts_json TEXT,
  created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Edges: relationships between papers
CREATE TABLE IF NOT EXISTS edges (
  edge_id INTEGER PRIMARY KEY,
  src_paper_id INTEGER NOT NULL REFERENCES papers(paper_id),
  dst_paper_id INTEGER NOT NULL REFERENCES papers(paper_id),
  type TEXT NOT NULL,
  weight REAL NOT NULL DEFAULT 1.0,
  confidence REAL NOT NULL DEFAULT 1.0,
  rationale TEXT,
  evidence TEXT,
  created_by TEXT NOT NULL DEFAULT 'algo',
  provenance_json TEXT NOT NULL DEFAULT '{}',
  created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Authors
CREATE TABLE IF NOT EXISTS authors (
  author_id INTEGER PRIMARY KEY,
  name TEXT NOT NULL,
  source_id TEXT,
  affiliation TEXT
);

-- Paper-Author junction
CREATE TABLE IF NOT EXISTS paper_authors (
  paper_id INTEGER NOT NULL REFERENCES papers(paper_id),
  author_id INTEGER NOT NULL REFERENCES authors(author_id),
  position INTEGER NOT NULL DEFAULT 0,
  PRIMARY KEY (paper_id, author_id)
);

-- Clusters: community detection results
CREATE TABLE IF NOT EXISTS clusters (
  cluster_id INTEGER PRIMARY KEY,
  method TEXT NOT NULL,
  name TEXT,
  description TEXT,
  stats_json TEXT NOT NULL DEFAULT '{}'
);

-- Paper-Cluster junction
CREATE TABLE IF NOT EXISTS paper_clusters (
  paper_id INTEGER NOT NULL REFERENCES papers(paper_id),
  cluster_id INTEGER NOT NULL REFERENCES clusters(cluster_id),
  PRIMARY KEY (paper_id, cluster_id)
);

-- Entities: extracted datasets/methods/tasks/metrics
CREATE TABLE IF NOT EXISTS entities (
  entity_id INTEGER PRIMARY KEY,
  type TEXT NOT NULL,
  name TEXT NOT NULL,
  aliases_json TEXT NOT NULL DEFAULT '[]'
);

-- Paper-Entity junction
CREATE TABLE IF NOT EXISTS paper_entities (
  paper_id INTEGER NOT NULL REFERENCES papers(paper_id),
  entity_id INTEGER NOT NULL REFERENCES entities(entity_id),
  role TEXT NOT NULL DEFAULT 'uses',
  PRIMARY KEY (paper_id, entity_id)
);

-- Indexes for edges (fast lookup by src, dst, type)
CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_paper_id);
CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_paper_id);
CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);

-- Indexes for papers (fast lookup by doi, arxiv, source_id, year)
CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);
CREATE INDEX IF NOT EXISTS idx_papers_arxiv ON papers(arxiv_id);
CREATE INDEX IF NOT EXISTS idx_papers_source_id ON papers(source_id);
CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);

-- Unique constraint on source + source_id to prevent duplicates
CREATE UNIQUE INDEX IF NOT EXISTS idx_papers_source_unique ON papers(source, source_id);

-- Unique constraint on type + name so INSERT OR IGNORE de-duplicates entities
CREATE UNIQUE INDEX IF NOT EXISTS idx_entities_type_name ON entities(type, name);
`;
|
|
472
|
+
/**
 * Thin wrapper around a better-sqlite3 database holding papers, edges,
 * authors, clusters, entities and run metadata.
 */
var PaperGraphDatabase = class {
  db;
  /**
   * Open (or create) the database at `dbPath`, enable WAL journaling and
   * foreign-key enforcement, and bring the schema up to date.
   */
  constructor(dbPath) {
    this.db = new Database(dbPath);
    this.db.pragma("journal_mode = WAL");
    this.db.pragma("foreign_keys = ON");
    this.migrate();
    logger2.debug({ dbPath }, "Database initialized");
  }
  /**
   * Run schema migrations.
   */
  migrate() {
    const currentVersion = this.db.pragma("user_version", { simple: true });
    if (currentVersion < 1) {
      this.db.exec(MIGRATION_V1);
      this.db.pragma("user_version = 1");
      logger2.info("Database migrated to v1");
    }
  }
  // ─── Papers ───────────────────────────────────────────────
  /**
   * Insert multiple papers in a single transaction.
   * Returns the paper IDs of the inserted rows; for a paper that was ignored
   * the ID of the existing (source, source_id) row is returned instead.
   * NOTE(review): a paper that is ignored and has no existing row contributes
   * no ID, so the result can be shorter than the input — callers that index
   * the result by input position should confirm this cannot happen.
   */
  insertPapers(papers) {
    const stmt = this.db.prepare(`
      INSERT OR IGNORE INTO papers (source, source_id, doi, arxiv_id, title, abstract, year, venue, url, citation_count, influence_score, keywords_json, concepts_json)
      VALUES (@source, @source_id, @doi, @arxiv_id, @title, @abstract, @year, @venue, @url, @citation_count, @influence_score, @keywords_json, @concepts_json)
    `);
    const ids = [];
    const insertAll = this.db.transaction((papers2) => {
      for (const paper of papers2) {
        const result = stmt.run(paper);
        if (result.changes > 0) {
          ids.push(Number(result.lastInsertRowid));
        } else {
          const existing = this.getPaperBySourceId(paper.source, paper.source_id);
          if (existing) ids.push(existing.paper_id);
        }
      }
    });
    insertAll(papers);
    return ids;
  }
  /**
   * Upsert a single paper. If it already exists (by source+source_id), update it.
   *
   * @returns {number} The paper_id of the row, or -1 if it cannot be found.
   */
  upsertPaper(paper) {
    const stmt = this.db.prepare(`
      INSERT INTO papers (source, source_id, doi, arxiv_id, title, abstract, year, venue, url, citation_count, influence_score, keywords_json, concepts_json)
      VALUES (@source, @source_id, @doi, @arxiv_id, @title, @abstract, @year, @venue, @url, @citation_count, @influence_score, @keywords_json, @concepts_json)
      ON CONFLICT(source, source_id) DO UPDATE SET
        doi = COALESCE(excluded.doi, doi),
        arxiv_id = COALESCE(excluded.arxiv_id, arxiv_id),
        title = excluded.title,
        abstract = COALESCE(excluded.abstract, abstract),
        year = COALESCE(excluded.year, year),
        venue = COALESCE(excluded.venue, venue),
        url = COALESCE(excluded.url, url),
        citation_count = MAX(citation_count, excluded.citation_count),
        influence_score = COALESCE(excluded.influence_score, influence_score),
        keywords_json = COALESCE(excluded.keywords_json, keywords_json),
        concepts_json = COALESCE(excluded.concepts_json, concepts_json)
    `);
    stmt.run(paper);
    // Resolve the id by key rather than from lastInsertRowid: when the
    // statement takes the DO UPDATE path no row is inserted, so
    // lastInsertRowid still refers to whatever row this connection inserted
    // last (possibly another paper or another table).
    const row = this.getPaperBySourceId(paper.source, paper.source_id);
    return row?.paper_id ?? -1;
  }
  getPaperById(id) {
    return this.db.prepare("SELECT * FROM papers WHERE paper_id = ?").get(id);
  }
  getPaperByDoi(doi) {
    return this.db.prepare("SELECT * FROM papers WHERE doi = ?").get(doi);
  }
  getPaperBySourceId(source, sourceId) {
    return this.db.prepare("SELECT * FROM papers WHERE source = ? AND source_id = ?").get(source, sourceId);
  }
  paperExists(source, sourceId) {
    const row = this.db.prepare("SELECT 1 FROM papers WHERE source = ? AND source_id = ?").get(source, sourceId);
    return row !== void 0;
  }
  getAllPapers() {
    return this.db.prepare("SELECT * FROM papers ORDER BY paper_id").all();
  }
  getPaperCount() {
    const row = this.db.prepare("SELECT COUNT(*) as count FROM papers").get();
    return row.count;
  }
  // ─── Edges ────────────────────────────────────────────────
  /**
   * Insert multiple edges in a single transaction.
   */
  insertEdges(edges) {
    const stmt = this.db.prepare(`
      INSERT INTO edges (src_paper_id, dst_paper_id, type, weight, confidence, rationale, evidence, created_by, provenance_json)
      VALUES (@src_paper_id, @dst_paper_id, @type, @weight, @confidence, @rationale, @evidence, @created_by, @provenance_json)
    `);
    const insertAll = this.db.transaction((edges2) => {
      for (const edge of edges2) {
        stmt.run(edge);
      }
    });
    insertAll(edges);
  }
  getAllEdges() {
    return this.db.prepare("SELECT * FROM edges ORDER BY edge_id").all();
  }
  getEdgesByType(type) {
    return this.db.prepare("SELECT * FROM edges WHERE type = ?").all(type);
  }
  getEdgeCount() {
    const row = this.db.prepare("SELECT COUNT(*) as count FROM edges").get();
    return row.count;
  }
  // ─── Authors ──────────────────────────────────────────────
  /**
   * Insert authors and link them to papers in a single transaction.
   *
   * @param {Array<object>} authors Rows with name, source_id, affiliation.
   * @param {Array<{paperId: number, authorIndex: number, position: number}>} paperLinks
   *   `authorIndex` refers to a position in `authors`.
   */
  insertAuthors(authors, paperLinks) {
    const authorStmt = this.db.prepare(`
      INSERT OR IGNORE INTO authors (name, source_id, affiliation)
      VALUES (@name, @source_id, @affiliation)
    `);
    const linkStmt = this.db.prepare(`
      INSERT OR IGNORE INTO paper_authors (paper_id, author_id, position)
      VALUES (?, ?, ?)
    `);
    const insertAll = this.db.transaction(() => {
      const authorIds = [];
      for (const author of authors) {
        const result = authorStmt.run(author);
        // When the insert is ignored, lastInsertRowid belongs to an earlier
        // row; record `undefined` so the links below skip this author rather
        // than attach to the wrong one.
        authorIds.push(result.changes > 0 ? Number(result.lastInsertRowid) : void 0);
      }
      for (const link of paperLinks) {
        const authorId = authorIds[link.authorIndex];
        if (authorId !== void 0) {
          linkStmt.run(link.paperId, authorId, link.position);
        }
      }
    });
    insertAll();
  }
  // ─── Clusters ─────────────────────────────────────────────
  /**
   * Insert clusters and their paper memberships in a single transaction.
   *
   * @param {Array<object>} clusters Rows with method, name, description, stats_json.
   * @param {Map<number, number[]>} paperMappings Paper IDs keyed by the
   *   cluster's position in `clusters`.
   */
  insertClusters(clusters, paperMappings) {
    const clusterStmt = this.db.prepare(`
      INSERT INTO clusters (method, name, description, stats_json)
      VALUES (@method, @name, @description, @stats_json)
    `);
    const linkStmt = this.db.prepare(`
      INSERT OR IGNORE INTO paper_clusters (paper_id, cluster_id)
      VALUES (?, ?)
    `);
    const insertAll = this.db.transaction(() => {
      let clusterIndex = 0;
      for (const cluster of clusters) {
        const result = clusterStmt.run(cluster);
        const clusterId = Number(result.lastInsertRowid);
        const paperIds = paperMappings.get(clusterIndex) ?? [];
        for (const paperId of paperIds) {
          linkStmt.run(paperId, clusterId);
        }
        clusterIndex++;
      }
    });
    insertAll();
  }
  getAllClusters() {
    return this.db.prepare("SELECT * FROM clusters ORDER BY cluster_id").all();
  }
  getClusterCount() {
    const row = this.db.prepare("SELECT COUNT(*) as count FROM clusters").get();
    return row.count;
  }
  // ─── Entities ─────────────────────────────────────────────
  /**
   * Insert entities (re-using existing rows with the same type and name) and
   * link them to papers, in a single transaction.
   *
   * @param {Array<object>} entities Rows with type, name, aliases_json.
   * @param {Array<{paperId: number, entityIndex: number, role: string}>} paperLinks
   *   `entityIndex` refers to a position in `entities`.
   */
  insertEntities(entities, paperLinks) {
    const entityStmt = this.db.prepare(`
      INSERT OR IGNORE INTO entities (type, name, aliases_json)
      VALUES (@type, @name, @aliases_json)
    `);
    const getEntityStmt = this.db.prepare(`
      SELECT entity_id FROM entities WHERE type = ? AND name = ?
    `);
    const linkStmt = this.db.prepare(`
      INSERT OR IGNORE INTO paper_entities (paper_id, entity_id, role)
      VALUES (?, ?, ?)
    `);
    const insertAll = this.db.transaction(() => {
      const entityIds = [];
      for (const entity of entities) {
        // Look the entity up first so de-duplication does not depend on the
        // table having a unique constraint for OR IGNORE to trip on.
        const known = getEntityStmt.get(entity.type, entity.name);
        if (known) {
          entityIds.push(known.entity_id);
          continue;
        }
        const result = entityStmt.run(entity);
        if (result.changes > 0) {
          entityIds.push(Number(result.lastInsertRowid));
        } else {
          const existing = getEntityStmt.get(entity.type, entity.name);
          entityIds.push(existing?.entity_id ?? -1);
        }
      }
      for (const link of paperLinks) {
        const entityId = entityIds[link.entityIndex];
        if (entityId !== void 0 && entityId !== -1) {
          linkStmt.run(link.paperId, entityId, link.role);
        }
      }
    });
    insertAll();
  }
  getAllEntities() {
    return this.db.prepare("SELECT * FROM entities ORDER BY entity_id").all();
  }
  getEntityCount() {
    const row = this.db.prepare("SELECT COUNT(*) as count FROM entities").get();
    return row.count;
  }
  // ─── Runs ─────────────────────────────────────────────────
  /**
   * Record a build run and return its run_id.
   */
  insertRun(run) {
    const stmt = this.db.prepare(`
      INSERT INTO runs (created_at, papergraph_version, config_json, source, spine, depth, stats_json)
      VALUES (@created_at, @papergraph_version, @config_json, @source, @spine, @depth, @stats_json)
    `);
    const result = stmt.run(run);
    return Number(result.lastInsertRowid);
  }
  // ─── Stats ────────────────────────────────────────────────
  /**
   * Row counts per table plus the number of edges per edge type.
   */
  getStats() {
    const papers = this.getPaperCount();
    const edges = this.getEdgeCount();
    const clusters = this.getClusterCount();
    const entities = this.getEntityCount();
    const runs = this.db.prepare("SELECT COUNT(*) as count FROM runs").get().count;
    const edgeTypeRows = this.db.prepare("SELECT type, COUNT(*) as count FROM edges GROUP BY type").all();
    const edgesByType = {};
    for (const row of edgeTypeRows) {
      edgesByType[row.type] = row.count;
    }
    return { papers, edges, clusters, entities, runs, edgesByType };
  }
  // ─── Score Updates ────────────────────────────────────────
  /**
   * Update influence_score for a paper by its paper_id.
   */
  updatePaperScore(paperId, score) {
    this.db.prepare("UPDATE papers SET influence_score = ? WHERE paper_id = ?").run(score, paperId);
  }
  // ─── Utility ──────────────────────────────────────────────
  /**
   * Execute a function within a transaction.
   */
  transaction(fn) {
    return this.db.transaction(fn)();
  }
  /**
   * Close the database connection.
   */
  close() {
    this.db.close();
    logger2.debug("Database closed");
  }
  /**
   * Get the raw better-sqlite3 instance (for advanced queries).
   */
  getRawDb() {
    return this.db;
  }
};
|
|
737
|
+
|
|
738
|
+
// src/sources/utils.ts
|
|
739
|
+
/**
 * Rebuild plain text from an inverted index of the form
 * `{ word: [position, ...] }`.
 *
 * Words are emitted in ascending position order and joined with single
 * spaces. Entries whose value is not an array, and positions that are not
 * non-negative numbers, are skipped.
 *
 * @param {object|null|undefined} invertedIndex
 * @returns {string|null} The text, or null when the input is not an object,
 *   yields no usable positions, or processing throws.
 */
function invertedIndexToText(invertedIndex) {
  const isObjectLike = Boolean(invertedIndex) && typeof invertedIndex === "object";
  if (!isObjectLike) {
    return null;
  }
  try {
    const placed = Object.entries(invertedIndex).flatMap(([token, slots]) =>
      Array.isArray(slots)
        ? slots
            .filter((slot) => typeof slot === "number" && slot >= 0)
            .map((slot) => [slot, token])
        : []
    );
    if (placed.length === 0) {
      return null;
    }
    placed.sort((left, right) => left[0] - right[0]);
    return placed.map((pair) => pair[1]).join(" ");
  } catch {
    return null;
  }
}
|
|
760
|
+
/**
 * Reduce a DOI reference to the bare DOI.
 *
 * Strips a leading resolver URL (`http(s)://doi.org/` or
 * `http(s)://dx.doi.org/`, matched case-insensitively) or a leading `doi:`
 * label, plus surrounding whitespace. The DOI itself is returned unchanged
 * (no case folding).
 *
 * @param {string|null|undefined} doi
 * @returns {string|null} The bare DOI, or null when the input is empty or
 *   nothing remains after stripping.
 */
function stripDoiPrefix(doi) {
  if (!doi) return null;
  const bare = doi
    .trim()
    .replace(/^(?:https?:\/\/(?:dx\.)?doi\.org\/|doi:\s*)/i, "")
    .trim();
  return bare || null;
}
|
|
764
|
+
/**
 * Extract a new-style arXiv identifier (`YYMM.NNNN` or `YYMM.NNNNN`, with an
 * optional `vN` version suffix) from user input.
 *
 * Recognised forms, checked in this order:
 *   - an arxiv.org abstract or PDF URL (`.../abs/<id>`, `.../pdf/<id>`),
 *   - an `arXiv:<id>` label (case-insensitive),
 *   - a bare identifier, ignoring surrounding whitespace.
 *
 * @param {string|null|undefined} input
 * @returns {string|null} The identifier including any version suffix, or
 *   null when none is found.
 */
function extractArxivId(input) {
  if (!input) return null;
  const candidate = input.trim();
  const patterns = [
    /arxiv\.org\/(?:abs|pdf)\/(\d{4}\.\d{4,5}(?:v\d+)?)/i,
    /arxiv:(\d{4}\.\d{4,5}(?:v\d+)?)/i,
    /^(\d{4}\.\d{4,5}(?:v\d+)?)$/
  ];
  for (const pattern of patterns) {
    const match = candidate.match(pattern);
    if (match?.[1]) return match[1];
  }
  return null;
}
|
|
777
|
+
|
|
778
|
+
// src/sources/openalex.ts
|
|
779
|
+
// Logger used by the OpenAlex adapter below.
var logger3 = getLogger();

// Root URL that the adapter's request URLs are built on.
var OPENALEX_BASE = "https://api.openalex.org";
|
|
781
|
+
var OpenAlexAdapter = class {
|
|
782
|
+
name = "OpenAlex";
|
|
783
|
+
sourceId = "openalex";
|
|
784
|
+
httpClient;
|
|
785
|
+
apiKey;
|
|
786
|
+
email;
|
|
787
|
+
constructor(options) {
|
|
788
|
+
this.apiKey = options?.apiKey ?? process.env["OPENALEX_API_KEY"];
|
|
789
|
+
this.email = options?.email;
|
|
790
|
+
this.httpClient = getHttpClient();
|
|
791
|
+
}
|
|
792
|
+
/**
|
|
793
|
+
* For dependency injection in tests.
|
|
794
|
+
*/
|
|
795
|
+
setHttpClient(client) {
|
|
796
|
+
this.httpClient = client;
|
|
797
|
+
}
|
|
798
|
+
async searchByTopic(topic, limit = 25) {
|
|
799
|
+
const params = new URLSearchParams({
|
|
800
|
+
search: topic,
|
|
801
|
+
per_page: String(Math.min(limit, 200)),
|
|
802
|
+
sort: "cited_by_count:desc"
|
|
803
|
+
});
|
|
804
|
+
this.addAuthParams(params);
|
|
805
|
+
const url = `${OPENALEX_BASE}/works?${params.toString()}`;
|
|
806
|
+
logger3.debug({ url }, "OpenAlex topic search");
|
|
807
|
+
const response = await this.httpClient.get(url, { source: "openalex" });
|
|
808
|
+
return response.data.results.map((work) => this.normalizeWork(work));
|
|
809
|
+
}
|
|
810
|
+
async searchByTitle(title, limit = 10) {
|
|
811
|
+
const exactParams = new URLSearchParams({
|
|
812
|
+
"filter": `title.search:${title}`,
|
|
813
|
+
"per_page": String(Math.min(limit, 200)),
|
|
814
|
+
"sort": "cited_by_count:desc"
|
|
815
|
+
});
|
|
816
|
+
this.addAuthParams(exactParams);
|
|
817
|
+
const exactUrl = `${OPENALEX_BASE}/works?${exactParams.toString()}`;
|
|
818
|
+
logger3.debug({ url: exactUrl }, "OpenAlex title search (exact)");
|
|
819
|
+
const exactResponse = await this.httpClient.get(exactUrl, { source: "openalex" });
|
|
820
|
+
if (exactResponse.data.results.length > 0) {
|
|
821
|
+
return exactResponse.data.results.map((work) => this.normalizeWork(work));
|
|
822
|
+
}
|
|
823
|
+
const fallbackParams = new URLSearchParams({
|
|
824
|
+
search: title,
|
|
825
|
+
per_page: String(Math.min(limit, 200)),
|
|
826
|
+
sort: "cited_by_count:desc"
|
|
827
|
+
});
|
|
828
|
+
this.addAuthParams(fallbackParams);
|
|
829
|
+
const fallbackUrl = `${OPENALEX_BASE}/works?${fallbackParams.toString()}`;
|
|
830
|
+
logger3.debug({ url: fallbackUrl }, "OpenAlex title search (fallback)");
|
|
831
|
+
const fallbackResponse = await this.httpClient.get(fallbackUrl, { source: "openalex" });
|
|
832
|
+
return fallbackResponse.data.results.map((work) => this.normalizeWork(work));
|
|
833
|
+
}
|
|
834
|
+
async fetchPaper(id) {
|
|
835
|
+
const normalizedId = id.startsWith("https://openalex.org/") ? id : `https://openalex.org/${id}`;
|
|
836
|
+
const params = new URLSearchParams();
|
|
837
|
+
this.addAuthParams(params);
|
|
838
|
+
const url = `${OPENALEX_BASE}/works/${encodeURIComponent(normalizedId)}?${params.toString()}`;
|
|
839
|
+
logger3.debug({ url }, "OpenAlex fetch paper");
|
|
840
|
+
try {
|
|
841
|
+
const response = await this.httpClient.get(url, { source: "openalex" });
|
|
842
|
+
return this.normalizeWork(response.data);
|
|
843
|
+
} catch (error) {
|
|
844
|
+
logger3.warn({ id, error }, "Failed to fetch paper from OpenAlex");
|
|
845
|
+
return null;
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
async fetchReferences(paperId, limit = 40) {
|
|
849
|
+
const paper = await this.fetchFullWork(paperId);
|
|
850
|
+
if (!paper?.referenced_works?.length) {
|
|
851
|
+
logger3.debug({ paperId }, "No references found");
|
|
852
|
+
return [];
|
|
853
|
+
}
|
|
854
|
+
const refIds = paper.referenced_works.slice(0, limit);
|
|
855
|
+
return this.fetchWorksByIds(refIds);
|
|
856
|
+
}
|
|
857
|
+
async fetchCitations(paperId, limit = 40) {
|
|
858
|
+
const normalizedId = paperId.startsWith("https://openalex.org/") ? paperId : `https://openalex.org/${paperId}`;
|
|
859
|
+
const params = new URLSearchParams({
|
|
860
|
+
"filter": `cites:${normalizedId}`,
|
|
861
|
+
"per_page": String(Math.min(limit, 200)),
|
|
862
|
+
"sort": "cited_by_count:desc"
|
|
863
|
+
});
|
|
864
|
+
this.addAuthParams(params);
|
|
865
|
+
const url = `${OPENALEX_BASE}/works?${params.toString()}`;
|
|
866
|
+
logger3.debug({ url }, "OpenAlex fetch citations");
|
|
867
|
+
const response = await this.httpClient.get(url, { source: "openalex" });
|
|
868
|
+
return response.data.results.map((work) => this.normalizeWork(work));
|
|
869
|
+
}
|
|
870
|
+
normalize(raw) {
|
|
871
|
+
return this.normalizeWork(raw);
|
|
872
|
+
}
|
|
873
|
+
// ─── Private helpers ──────────────────────────────────────
|
|
874
|
+
async fetchFullWork(id) {
|
|
875
|
+
const normalizedId = id.startsWith("https://openalex.org/") ? id : `https://openalex.org/${id}`;
|
|
876
|
+
const params = new URLSearchParams();
|
|
877
|
+
this.addAuthParams(params);
|
|
878
|
+
const url = `${OPENALEX_BASE}/works/${encodeURIComponent(normalizedId)}?${params.toString()}`;
|
|
879
|
+
try {
|
|
880
|
+
const response = await this.httpClient.get(url, { source: "openalex" });
|
|
881
|
+
return response.data;
|
|
882
|
+
} catch {
|
|
883
|
+
return null;
|
|
884
|
+
}
|
|
885
|
+
}
|
|
886
|
+
async fetchWorksByIds(ids) {
|
|
887
|
+
if (ids.length === 0) return [];
|
|
888
|
+
const batchSize = 50;
|
|
889
|
+
const papers = [];
|
|
890
|
+
for (let i = 0; i < ids.length; i += batchSize) {
|
|
891
|
+
const batch = ids.slice(i, i + batchSize);
|
|
892
|
+
const filter = batch.map((id) => id.replace("https://openalex.org/", "")).join("|");
|
|
893
|
+
const params = new URLSearchParams({
|
|
894
|
+
"filter": `openalex:${filter}`,
|
|
895
|
+
"per_page": String(batchSize)
|
|
896
|
+
});
|
|
897
|
+
this.addAuthParams(params);
|
|
898
|
+
const url = `${OPENALEX_BASE}/works?${params.toString()}`;
|
|
899
|
+
try {
|
|
900
|
+
const response = await this.httpClient.get(url, { source: "openalex" });
|
|
901
|
+
papers.push(...response.data.results.map((work) => this.normalizeWork(work)));
|
|
902
|
+
} catch (error) {
|
|
903
|
+
logger3.warn({ error, batchIndex: i }, "Failed to fetch batch of works");
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
return papers;
|
|
907
|
+
}
|
|
908
|
+
normalizeWork(work) {
|
|
909
|
+
const openalexId = work.id?.replace("https://openalex.org/", "") ?? "";
|
|
910
|
+
const doi = stripDoiPrefix(work.doi);
|
|
911
|
+
const abstract = invertedIndexToText(work.abstract_inverted_index);
|
|
912
|
+
let arxivId = null;
|
|
913
|
+
if (doi?.includes("arxiv")) {
|
|
914
|
+
arxivId = extractArxivId(doi);
|
|
915
|
+
}
|
|
916
|
+
const keywords = work.keywords?.map((k) => k.keyword).filter((k) => !!k) ?? [];
|
|
917
|
+
const concepts = work.concepts?.filter((c) => (c.level ?? 0) <= 2).map((c) => ({ name: c.display_name ?? "", score: c.score })).filter((c) => c.name) ?? [];
|
|
918
|
+
return {
|
|
919
|
+
source: "openalex",
|
|
920
|
+
source_id: openalexId,
|
|
921
|
+
doi,
|
|
922
|
+
arxiv_id: arxivId,
|
|
923
|
+
title: work.display_name ?? work.title ?? "Untitled",
|
|
924
|
+
abstract,
|
|
925
|
+
year: work.publication_year ?? null,
|
|
926
|
+
venue: work.primary_location?.source?.display_name ?? null,
|
|
927
|
+
url: work.primary_location?.landing_page_url ?? (doi ? `https://doi.org/${doi}` : null),
|
|
928
|
+
citation_count: work.cited_by_count ?? 0,
|
|
929
|
+
influence_score: null,
|
|
930
|
+
keywords_json: keywords.length > 0 ? JSON.stringify(keywords) : null,
|
|
931
|
+
concepts_json: concepts.length > 0 ? JSON.stringify(concepts) : null
|
|
932
|
+
};
|
|
933
|
+
}
|
|
934
|
+
addAuthParams(params) {
|
|
935
|
+
if (this.apiKey) {
|
|
936
|
+
params.set("api_key", this.apiKey);
|
|
937
|
+
}
|
|
938
|
+
if (this.email) {
|
|
939
|
+
params.set("mailto", this.email);
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
};
|
|
943
|
+
|
|
944
|
+
// src/sources/semantic-scholar.ts
|
|
945
|
+
var logger4 = getLogger();
var S2_BASE = "https://api.semanticscholar.org/graph/v1";
// Fields requested for search, single-paper and batch lookups.
var PAPER_FIELDS = [
  "paperId",
  "externalIds",
  "title",
  "abstract",
  "year",
  "venue",
  "citationCount",
  "influentialCitationCount",
  "isOpenAccess",
  "fieldsOfStudy",
  "authors",
  "url"
].join(",");
// Fields requested for the papers nested in reference/citation listings.
var REFERENCE_FIELDS = [
  "paperId",
  "externalIds",
  "title",
  "abstract",
  "year",
  "venue",
  "citationCount",
  "influentialCitationCount",
  "url",
  "authors"
].join(",");
/**
 * Source adapter for the Semantic Scholar Graph API.
 *
 * Every public fetch/search method resolves to papers in the flat shape
 * produced by `normalizeS2Paper`. Requests go through an injectable HTTP
 * client exposing `get(url, options)` and `post(url, body, options)`.
 */
var SemanticScholarAdapter = class {
  name = "Semantic Scholar";
  sourceId = "s2";
  httpClient;
  apiKey;
  /**
   * @param {{ apiKey?: string }} [options] - Optional API key; falls back to
   *   the `S2_API_KEY` environment variable.
   */
  constructor(options) {
    this.apiKey = options?.apiKey ?? process.env["S2_API_KEY"];
    this.httpClient = getHttpClient();
  }
  /**
   * For dependency injection in tests.
   */
  setHttpClient(client) {
    this.httpClient = client;
  }
  /**
   * Keyword search for papers on a topic.
   *
   * @param {string} topic - Search phrase.
   * @param {number} [limit=25] - Result count, capped at 100.
   * @returns {Promise<object[]>} Normalized papers.
   */
  async searchByTopic(topic, limit = 25) {
    return this.searchPapers(topic, limit, "S2 topic search");
  }
  /**
   * Keyword search for papers by title.
   *
   * @param {string} title - Title to look for.
   * @param {number} [limit=10] - Result count, capped at 100.
   * @returns {Promise<object[]>} Normalized papers.
   */
  async searchByTitle(title, limit = 10) {
    return this.searchPapers(title, limit, "S2 title search");
  }
  /**
   * Fetch a single paper.
   *
   * @param {string} id - Paper identifier, URL-encoded into the path.
   * @returns {Promise<object | null>} The normalized paper, or null when the
   *   request fails (the failure is logged as a warning).
   */
  async fetchPaper(id) {
    const url = `${S2_BASE}/paper/${encodeURIComponent(id)}?fields=${PAPER_FIELDS}`;
    logger4.debug({ url }, "S2 fetch paper");
    try {
      const response = await this.httpClient.get(url, {
        source: "s2",
        headers: this.buildHeaders()
      });
      return this.normalizeS2Paper(response.data);
    } catch (error) {
      logger4.warn({ id, error }, "Failed to fetch paper from S2");
      return null;
    }
  }
  /**
   * Fetch the papers a given paper cites.
   *
   * @param {string} paperId - Paper identifier.
   * @param {number} [limit=40] - Result count, capped at 1000.
   * @returns {Promise<object[]>} Normalized papers; empty on request failure.
   */
  async fetchReferences(paperId, limit = 40) {
    return this.fetchLinkedPapers(paperId, limit, "references", "citedPaper");
  }
  /**
   * Fetch the papers citing a given paper.
   *
   * @param {string} paperId - Paper identifier.
   * @param {number} [limit=40] - Result count, capped at 1000.
   * @returns {Promise<object[]>} Normalized papers; empty on request failure.
   */
  async fetchCitations(paperId, limit = 40) {
    return this.fetchLinkedPapers(paperId, limit, "citations", "citingPaper");
  }
  /**
   * Batch fetch papers by IDs.
   * S2 supports up to 500 papers per batch request.
   * A failing batch is logged and skipped; entries without an id or title
   * are dropped.
   */
  async batchFetchPapers(ids) {
    if (ids.length === 0) return [];
    const batchSize = 500;
    const papers = [];
    for (let i = 0; i < ids.length; i += batchSize) {
      const batch = ids.slice(i, i + batchSize);
      const url = `${S2_BASE}/paper/batch?fields=${PAPER_FIELDS}`;
      logger4.debug({ batchSize: batch.length, batchIndex: i / batchSize }, "S2 batch fetch");
      try {
        const response = await this.httpClient.post(
          url,
          { ids: batch },
          { source: "s2", headers: this.buildHeaders() }
        );
        const batchPapers = (response.data ?? []).filter((p) => p && p.paperId && p.title).map((paper) => this.normalizeS2Paper(paper));
        papers.push(...batchPapers);
      } catch (error) {
        logger4.warn({ error, batchIndex: i / batchSize }, "Failed batch fetch from S2");
      }
    }
    return papers;
  }
  /**
   * Convert one raw Semantic Scholar paper record into the normalized shape.
   */
  normalize(raw) {
    return this.normalizeS2Paper(raw);
  }
  // ─── Private helpers ──────────────────────────────────────
  /**
   * Shared implementation of the two search entry points.
   */
  async searchPapers(query, limit, logMessage) {
    const params = new URLSearchParams({
      query: this.cleanSearchQuery(query),
      limit: String(Math.min(limit, 100)),
      fields: PAPER_FIELDS
    });
    const url = `${S2_BASE}/paper/search?${params.toString()}`;
    logger4.debug({ url }, logMessage);
    const response = await this.httpClient.get(url, {
      source: "s2",
      headers: this.buildHeaders()
    });
    return (response.data.data ?? []).map((paper) => this.normalizeS2Paper(paper));
  }
  /**
   * Shared implementation of `fetchReferences` / `fetchCitations`.
   *
   * @param {string} paperId - Paper identifier.
   * @param {number} limit - Requested result count (capped at 1000).
   * @param {"references" | "citations"} relation - Endpoint suffix.
   * @param {"citedPaper" | "citingPaper"} paperKey - Property of each result
   *   row that holds the linked paper.
   */
  async fetchLinkedPapers(paperId, limit, relation, paperKey) {
    const params = new URLSearchParams({
      fields: REFERENCE_FIELDS,
      limit: String(Math.min(limit, 1e3))
    });
    const url = `${S2_BASE}/paper/${encodeURIComponent(paperId)}/${relation}?${params.toString()}`;
    logger4.debug({ url }, `S2 fetch ${relation}`);
    try {
      const response = await this.httpClient.get(url, {
        source: "s2",
        headers: this.buildHeaders()
      });
      // `p?.` guards rows whose linked paper is missing entirely; without it
      // one such row would throw and discard the whole page via the catch.
      return (response.data.data ?? []).map((row) => row[paperKey]).filter((p) => p?.paperId && p.title).map((paper) => this.normalizeS2Paper(paper));
    } catch (error) {
      logger4.warn({ paperId, error }, `Failed to fetch ${relation} from S2`);
      return [];
    }
  }
  /**
   * Map a raw paper record onto the normalized paper shape.
   */
  normalizeS2Paper(paper) {
    const doi = stripDoiPrefix(paper.externalIds?.DOI ?? null);
    const arxivId = extractArxivId(paper.externalIds?.ArXiv ?? null);
    return {
      source: "s2",
      source_id: paper.paperId,
      doi,
      arxiv_id: arxivId,
      title: paper.title ?? "Untitled",
      abstract: paper.abstract ?? null,
      year: paper.year ?? null,
      venue: paper.venue || null,
      url: paper.url ?? (doi ? `https://doi.org/${doi}` : null),
      citation_count: paper.citationCount ?? 0,
      influence_score: paper.influentialCitationCount ?? null,
      // An empty list is stored as null, matching the OpenAlex adapter.
      keywords_json: paper.fieldsOfStudy?.length ? JSON.stringify(paper.fieldsOfStudy) : null,
      concepts_json: null
    };
  }
  /**
   * Clean search query — S2 treats hyphens and plus signs as operators.
   */
  cleanSearchQuery(query) {
    return query.replace(/[-+]/g, " ").replace(/\s+/g, " ").trim();
  }
  /**
   * Request headers; carries `x-api-key` only when a key is configured.
   */
  buildHeaders() {
    const headers = {};
    if (this.apiKey) {
      headers["x-api-key"] = this.apiKey;
    }
    return headers;
  }
};
|
|
1131
|
+
|
|
1132
|
+
// src/nlp/stopwords.ts
|
|
1133
|
+
/**
 * Words the tokenizer discards: common English function words (including
 * contraction fragments such as "ll", "ve", "t") followed by generic
 * academic filler terms. Entries are grouped by initial letter.
 */
var STOPWORDS = /* @__PURE__ */ new Set([
  // General English function words
  ...[
    "a about above after again against all am an and any are aren aren't as at",
    "be because been before being below between both but by",
    "can could couldn couldn't",
    "d did didn didn't do does doesn doesn't doing don don't down during",
    "each",
    "few for from further",
    "get got",
    "had hadn hadn't has hasn hasn't have haven haven't having he her here hers herself him himself his how",
    "i if in into is isn isn't it it's its itself",
    "just",
    "let ll",
    "m ma may me might mightn mightn't more most much must mustn mustn't my myself",
    "need needn needn't no nor not now",
    "o of off on once only or other our ours ourselves out over own",
    "quite",
    "re",
    "s said same shan shan't she she's should should've shouldn shouldn't so some such",
    "t than that that'll the their theirs them themselves then there these they this those through to too",
    "under until up upon",
    "ve very",
    "was wasn wasn't we were weren weren't what when where which while who whom why will with won won't would wouldn wouldn't",
    "y you you'd you'll you're you've your yours yourself yourselves"
  ].join(" ").split(" "),
  // Additional academic/generic stopwords
  ...[
    "also based et al using via vs use used",
    "show shows shown propose proposed paper approach method results",
    "model models work study new novel",
    "present presented demonstrate demonstrated provide provided",
    "however therefore thus hence although moreover furthermore",
    "additionally specifically respectively"
  ].join(" ").split(" ")
]);
|
|
1367
|
+
|
|
1368
|
+
// src/nlp/tokenizer.ts
|
|
1369
|
+
/**
 * Split text into lowercase index terms.
 *
 * Characters other than a-z, 0-9, whitespace and "-" become separators,
 * leading/trailing hyphens are trimmed from each piece, and pieces that are
 * at most one character long, listed in STOPWORDS, or purely numeric are
 * dropped.
 *
 * @param {string | null | undefined} text - Input text.
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
function tokenize(text) {
  if (!text) return [];
  const cleaned = text.toLowerCase().replace(/[^a-z0-9\s-]/g, " ");
  const tokens = [];
  for (const piece of cleaned.split(/\s+/)) {
    const token = piece.replace(/^-+|-+$/g, "");
    if (token.length <= 1) continue;
    if (STOPWORDS.has(token)) continue;
    // Remove pure numbers
    if (/^\d+$/.test(token)) continue;
    tokens.push(token);
  }
  return tokens;
}
|
|
1376
|
+
|
|
1377
|
+
// src/nlp/tfidf.ts
|
|
1378
|
+
var logger5 = getLogger();
/**
 * Build a TF-IDF corpus from a list of papers.
 *
 * Each paper's text is its title plus its abstract; when the abstract is
 * missing, keywords parsed from `keywords_json` are appended instead. Term
 * frequency is normalized by the most frequent term of the document and
 * multiplied by `ln(N / df)`, where N is the number of documents that
 * produced at least one token.
 *
 * @param {Array<object>} papers - Papers with `title`, `abstract`,
 *   `keywords_json` and `source_id` (or `paper_id` as a fallback key).
 * @returns {{ documents: Map<string, Map<string, number>>, df: Map<string, number>, size: number }}
 *   Per-document term weights, document frequencies and document count.
 */
function buildCorpus(papers) {
  const df = /* @__PURE__ */ new Map();
  const documents = /* @__PURE__ */ new Map();
  let nullAbstractCount = 0;
  for (const paper of papers) {
    const id = paper.source_id || String(paper.paper_id);
    // A missing title must not be concatenated as the text "null"/"undefined".
    let text = paper.title ?? "";
    if (paper.abstract) {
      text += " " + paper.abstract;
    } else {
      nullAbstractCount++;
      if (paper.keywords_json) {
        try {
          const keywords = JSON.parse(paper.keywords_json);
          text += " " + keywords.join(" ");
        } catch {
          // Keywords are only a best-effort supplement; unusable JSON leaves
          // the document with its title alone.
        }
      }
    }
    const tokens = tokenize(text);
    if (tokens.length === 0) continue;
    const counts = /* @__PURE__ */ new Map();
    let maxCount = 0;
    for (const token of tokens) {
      const next = (counts.get(token) ?? 0) + 1;
      counts.set(token, next);
      if (next > maxCount) maxCount = next;
    }
    const normalizedTf = /* @__PURE__ */ new Map();
    for (const [term, count] of counts) {
      normalizedTf.set(term, count / maxCount);
      // Each distinct term counts once per document.
      df.set(term, (df.get(term) ?? 0) + 1);
    }
    documents.set(id, normalizedTf);
  }
  if (papers.length > 0 && nullAbstractCount > 0) {
    const pct = (nullAbstractCount / papers.length * 100).toFixed(1);
    logger5.warn(
      { nullAbstracts: nullAbstractCount, total: papers.length, percentage: pct },
      `${pct}% of papers have null abstracts (using title-only fallback)`
    );
  }
  const N = documents.size;
  for (const [, docTf] of documents) {
    // Overwriting existing keys while iterating a Map does not revisit them.
    for (const [term, tf] of docTf) {
      const termDf = df.get(term) ?? 1;
      const idf = Math.log(N / termDf);
      docTf.set(term, tf * idf);
    }
  }
  return { documents, df, size: N };
}
|
|
1434
|
+
/**
 * Rank terms by their summed weight across a group of documents.
 *
 * @param {{ documents: Map<string, Map<string, number>> }} corpus - Corpus
 *   holding per-document term weights.
 * @param {Iterable<string>} docIds - Documents to aggregate; ids absent from
 *   the corpus are ignored.
 * @param {number} [topN=5] - Number of terms to return.
 * @returns {string[]} Up to `topN` terms, highest total weight first.
 */
function getTopTerms(corpus, docIds, topN = 5) {
  const totals = /* @__PURE__ */ new Map();
  for (const docId of docIds) {
    const weights = corpus.documents.get(docId);
    if (!weights) continue;
    weights.forEach((weight, term) => {
      totals.set(term, (totals.get(term) ?? 0) + weight);
    });
  }
  const ranked = [...totals].sort(([, left], [, right]) => right - left);
  return ranked.slice(0, topN).map(([term]) => term);
}
|
|
1445
|
+
/**
 * Score how well a document matches a tokenized query.
 *
 * The score is the mean of the document's weights for the query tokens
 * (tokens absent from the document contribute 0), capped at 1.
 *
 * @param {{ documents: Map<string, Map<string, number>> }} corpus - Corpus
 *   holding per-document term weights.
 * @param {string} docId - Document to score.
 * @param {string[]} queryTokens - Query terms.
 * @returns {number} Score of at most 1; 0 when the document is unknown or the
 *   query has no tokens.
 */
function computeRelevance(corpus, docId, queryTokens) {
  const weights = corpus.documents.get(docId);
  if (queryTokens.length === 0 || !weights) return 0;
  const total = queryTokens.reduce((sum, token) => sum + (weights.get(token) ?? 0), 0);
  const mean = total / queryTokens.length;
  return Math.min(1, mean);
}
|
|
1454
|
+
|
|
1455
|
+
// src/nlp/similarity.ts
|
|
1456
|
+
/**
 * Cosine similarity of two sparse vectors stored as term → weight maps.
 *
 * @param {Map<string, number>} vecA - First vector.
 * @param {Map<string, number>} vecB - Second vector.
 * @returns {number} `dot(A, B) / (|A| * |B|)`, or 0 when either vector is
 *   empty or has zero magnitude.
 */
function cosineSimilarity(vecA, vecB) {
  if (vecA.size === 0 || vecB.size === 0) return 0;
  // Only shared terms contribute to the dot product, so walk the smaller map
  // and probe the larger one.
  const [smaller, larger] = vecA.size <= vecB.size ? [vecA, vecB] : [vecB, vecA];
  let dotProduct = 0;
  for (const [term, weight] of smaller) {
    const otherWeight = larger.get(term);
    if (otherWeight !== void 0) {
      dotProduct += weight * otherWeight;
    }
  }
  let normA = 0;
  for (const weight of vecA.values()) normA += weight * weight;
  let normB = 0;
  for (const weight of vecB.values()) normB += weight * weight;
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  if (denominator === 0) return 0;
  return dotProduct / denominator;
}
|
|
1482
|
+
/**
 * Find the documents most similar to a given one.
 *
 * @param {string} docId - Reference document.
 * @param {{ documents: Map<string, Map<string, number>> }} corpus - Corpus
 *   holding per-document term weights.
 * @param {number} k - Maximum number of neighbors to return.
 * @param {number} threshold - Minimum similarity (inclusive) to qualify.
 * @returns {Array<{ id: string, similarity: number }>} Neighbors sorted by
 *   descending similarity; empty when `docId` is not in the corpus.
 */
function findTopKSimilar(docId, corpus, k, threshold) {
  const reference = corpus.documents.get(docId);
  if (!reference) return [];
  const ranked = [...corpus.documents]
    .filter(([candidateId]) => candidateId !== docId)
    .map(([candidateId, vector]) => ({
      id: candidateId,
      similarity: cosineSimilarity(reference, vector)
    }))
    .filter((candidate) => candidate.similarity >= threshold)
    .sort((left, right) => right.similarity - left.similarity);
  return ranked.slice(0, k);
}
|
|
1496
|
+
/**
 * Create SIMILAR_TEXT edges between textually similar papers.
 *
 * For every paper, its top-K neighbors (per `findTopKSimilar`) become edges.
 * Each unordered pair is emitted once, oriented the way it was first found.
 *
 * @param {Map<string, number>} paperIdMap - Corpus document id → numeric
 *   paper id. Neighbors without an entry are skipped.
 * @param {object} corpus - TF-IDF corpus passed through to `findTopKSimilar`.
 * @param {number} [topK=10] - Neighbors considered per paper.
 * @param {number} [threshold=0.25] - Minimum similarity for an edge.
 * @returns {object[]} Edge records whose weight and confidence both equal the
 *   similarity score.
 */
function buildSimilarityEdges(paperIdMap, corpus, topK = 10, threshold = 0.25) {
  // Identical for every edge of a run, so it is serialized once.
  const provenance = JSON.stringify({
    source: "tfidf",
    version: "1.0.0",
    topK,
    threshold
  });
  const emittedPairs = /* @__PURE__ */ new Set();
  const result = [];
  paperIdMap.forEach((paperId, sourceId) => {
    for (const { id, similarity } of findTopKSimilar(sourceId, corpus, topK, threshold)) {
      const otherPaperId = paperIdMap.get(id);
      if (otherPaperId === void 0) continue;
      const low = Math.min(paperId, otherPaperId);
      const high = Math.max(paperId, otherPaperId);
      const pairKey = `${low}-${high}`;
      if (emittedPairs.has(pairKey)) continue;
      emittedPairs.add(pairKey);
      result.push({
        src_paper_id: paperId,
        dst_paper_id: otherPaperId,
        type: "SIMILAR_TEXT" /* SIMILAR_TEXT */,
        weight: similarity,
        confidence: similarity,
        rationale: null,
        evidence: null,
        created_by: "algo",
        provenance_json: provenance
      });
    }
  });
  return result;
}
|
|
1527
|
+
|
|
1528
|
+
// src/graph/algorithms.ts
|
|
1529
|
+
import GraphDefault from "graphology";
|
|
1530
|
+
import pagerankModule from "graphology-metrics/centrality/pagerank.js";
|
|
1531
|
+
import louvainModule from "graphology-communities-louvain";
|
|
1532
|
+
import { toUndirected } from "graphology-operators";
|
|
1533
|
+
var Graph = GraphDefault.default ?? GraphDefault;
|
|
1534
|
+
var pagerank = pagerankModule.default ?? pagerankModule;
|
|
1535
|
+
var louvain = louvainModule.default ?? louvainModule;
|
|
1536
|
+
var logger6 = getLogger();
|
|
1537
|
+
/**
 * Compute PageRank scores over the citation graph.
 *
 * Nodes are the papers that have a `paper_id`; edges are the CITES edges
 * whose endpoints are both known papers, with duplicates of the same
 * (source, target) pair ignored.
 *
 * @param {Array<{ paper_id?: number }>} papers - Papers to rank.
 * @param {Array<object>} edges - Edge records; only `type === "CITES"` is used.
 * @returns {Map<number, number>} Paper id → PageRank score.
 */
function computePageRank(papers, edges) {
  const graph = new Graph({ type: "directed" });
  for (const { paper_id: paperId } of papers) {
    if (paperId === void 0) continue;
    graph.addNode(String(paperId));
  }
  for (const edge of edges) {
    if (edge.type !== "CITES" /* CITES */) continue;
    const from = String(edge.src_paper_id);
    const to = String(edge.dst_paper_id);
    const linkable = graph.hasNode(from) && graph.hasNode(to) && !graph.hasEdge(from, to);
    if (linkable) {
      graph.addEdge(from, to, { weight: edge.weight });
    }
  }
  const scores = pagerank(graph, { alpha: 0.85, maxIterations: 100, tolerance: 1e-6 });
  // Node keys are stringified paper ids; convert them back to numbers.
  const result = new Map(
    Object.entries(scores).map(([nodeId, score]) => [parseInt(nodeId, 10), score])
  );
  logger6.debug({ nodeCount: graph.order, edgeCount: graph.size }, "PageRank computed");
  return result;
}
|
|
1560
|
+
/**
 * Group papers into communities with the Louvain algorithm.
 *
 * A directed graph is built from all edge types, restricted to edges whose
 * endpoints are both known papers and keeping only the first edge per
 * (source, target) pair. It is converted to an undirected graph and clustered
 * with resolution 1.
 *
 * @param {Array<{ paper_id?: number }>} papers - Papers to cluster; entries
 *   without `paper_id` are ignored.
 * @param {Array<object>} edges - Edge records with `src_paper_id`,
 *   `dst_paper_id` and `weight`.
 * @returns {Map<number, number[]>} Community id → paper ids; empty when there
 *   are no nodes.
 */
function computeLouvainClusters(papers, edges) {
  const directedGraph = new Graph({ type: "directed", allowSelfLoops: false });
  for (const paper of papers) {
    if (paper.paper_id !== void 0) {
      directedGraph.addNode(String(paper.paper_id));
    }
  }
  for (const edge of edges) {
    const src = String(edge.src_paper_id);
    const dst = String(edge.dst_paper_id);
    // The graph forbids self-loops, so adding one would throw and abort the
    // whole clustering run; such edges carry no community signal anyway.
    if (src === dst) continue;
    if (directedGraph.hasNode(src) && directedGraph.hasNode(dst) && !directedGraph.hasEdge(src, dst)) {
      directedGraph.addEdge(src, dst, { weight: edge.weight });
    }
  }
  const undirectedGraph = toUndirected(directedGraph);
  if (undirectedGraph.order === 0) {
    return /* @__PURE__ */ new Map();
  }
  const communities = louvain(undirectedGraph, {
    resolution: 1
  });
  const clusterMap = /* @__PURE__ */ new Map();
  for (const [nodeId, community] of Object.entries(communities)) {
    const paperId = parseInt(nodeId, 10);
    const members = clusterMap.get(community);
    if (members) {
      members.push(paperId);
    } else {
      clusterMap.set(community, [paperId]);
    }
  }
  logger6.debug(
    { communities: clusterMap.size, papers: papers.length },
    "Louvain clustering computed"
  );
  return clusterMap;
}
|
|
1596
|
+
/**
 * Derive CO_CITED edges: two papers are co-cited whenever the same paper
 * cites both of them.
 *
 * @param {Array<object>} edges - Edge records; only `type === "CITES"` is
 *   used (`src_paper_id` cites `dst_paper_id`, both numeric).
 * @returns {object[]} One edge per co-cited pair, with the smaller id as
 *   source. `weight` is the pair's co-citation count divided by the largest
 *   count, so it lies in (0, 1]; the raw count is kept in `provenance_json`.
 */
function computeCoCitation(edges) {
  const refsByCitingPaper = /* @__PURE__ */ new Map();
  for (const edge of edges) {
    if (edge.type !== "CITES" /* CITES */) continue;
    let refs = refsByCitingPaper.get(edge.src_paper_id);
    if (!refs) {
      refs = /* @__PURE__ */ new Set();
      refsByCitingPaper.set(edge.src_paper_id, refs);
    }
    refs.add(edge.dst_paper_id);
  }
  // The ids are stored next to the count so they never have to be parsed
  // back out of the string key.
  const pairs = /* @__PURE__ */ new Map();
  // Running maximum: spreading every count into Math.max() passes one
  // argument per pair, which can exceed the engine's argument limit.
  let maxCount = 1;
  for (const refs of refsByCitingPaper.values()) {
    const refArray = Array.from(refs);
    for (let i = 0; i < refArray.length; i++) {
      for (let j = i + 1; j < refArray.length; j++) {
        const a = Math.min(refArray[i], refArray[j]);
        const b = Math.max(refArray[i], refArray[j]);
        const key = `${a}-${b}`;
        let pair = pairs.get(key);
        if (!pair) {
          pair = { a, b, count: 0 };
          pairs.set(key, pair);
        }
        pair.count++;
        if (pair.count > maxCount) maxCount = pair.count;
      }
    }
  }
  const coCitationEdges = [];
  for (const { a, b, count } of pairs.values()) {
    coCitationEdges.push({
      src_paper_id: a,
      dst_paper_id: b,
      type: "CO_CITED" /* CO_CITED */,
      // Normalize to [0, 1]
      weight: count / maxCount,
      confidence: 1,
      rationale: null,
      evidence: null,
      created_by: "algo",
      provenance_json: JSON.stringify({ source: "co-citation", count })
    });
  }
  logger6.debug({ pairs: coCitationEdges.length }, "Co-citation edges computed");
  return coCitationEdges;
}
|
|
1637
|
+
/**
 * Derive BIB_COUPLED edges: two papers are bibliographically coupled when
 * their reference lists overlap.
 *
 * @param {Array<object>} edges - Edge records; only `type === "CITES"` is
 *   used (`src_paper_id` cites `dst_paper_id`).
 * @returns {object[]} One edge per pair of citing papers sharing at least one
 *   reference. `weight` is the number of shared references divided by the
 *   size of the smaller reference list; the raw numbers are kept in
 *   `provenance_json`.
 */
function computeBibCoupling(edges) {
  const refsByPaper = /* @__PURE__ */ new Map();
  edges.filter((edge) => edge.type === "CITES" /* CITES */).forEach((edge) => {
    const known = refsByPaper.get(edge.src_paper_id);
    if (known) {
      known.add(edge.dst_paper_id);
    } else {
      refsByPaper.set(edge.src_paper_id, /* @__PURE__ */ new Set([edge.dst_paper_id]));
    }
  });
  // Every stored set holds at least one reference, so no emptiness check is
  // needed before comparing pairs.
  const entries = [...refsByPaper];
  const couplingEdges = [];
  entries.forEach(([paperA, refsA], index) => {
    for (const [paperB, refsB] of entries.slice(index + 1)) {
      let overlap = 0;
      refsA.forEach((ref) => {
        if (refsB.has(ref)) overlap += 1;
      });
      if (overlap === 0) continue;
      couplingEdges.push({
        src_paper_id: paperA,
        dst_paper_id: paperB,
        type: "BIB_COUPLED" /* BIB_COUPLED */,
        weight: overlap / Math.min(refsA.size, refsB.size),
        confidence: 1,
        rationale: null,
        evidence: null,
        created_by: "algo",
        provenance_json: JSON.stringify({
          source: "bib-coupling",
          overlap,
          refsA: refsA.size,
          refsB: refsB.size
        })
      });
    }
  });
  logger6.debug({ pairs: couplingEdges.length }, "Bibliographic coupling edges computed");
  return couplingEdges;
}
|
|
1681
|
+
/**
 * Convert a community -> members map into cluster records plus a
 * position-keyed membership map.
 *
 * @param {Map<*, Array>} clusterMap - Community id -> member paper ids.
 * @param {Map<*, string>} [clusterNames] - Optional community id -> display
 *   name; communities without an entry are named "Cluster <id>".
 * @returns {{clusters: Array<object>, paperMappings: Map<number, Array>}}
 *   `paperMappings` is keyed by the index of the cluster in `clusters`.
 */
function buildClusterObjects(clusterMap, clusterNames) {
  const communities = Array.from(clusterMap);
  const clusters = communities.map(([communityId, members]) => ({
    method: "louvain_citation",
    name: clusterNames?.get(communityId) ?? `Cluster ${communityId}`,
    description: null,
    stats_json: JSON.stringify({
      member_count: members.length,
      community_id: communityId
    })
  }));
  const paperMappings = /* @__PURE__ */ new Map(
    communities.map(([, members], position) => [position, members])
  );
  return { clusters, paperMappings };
}
|
|
1701
|
+
|
|
1702
|
+
// src/graph/scoring.ts
|
|
1703
|
+
// Default blend of the three ranking signals used by computeCompositeScores.
var DEFAULT_WEIGHTS = {
  pagerankWeight: 0.5,
  relevanceWeight: 0.3,
  recencyWeight: 0.2
};

/**
 * Blend PageRank, topic relevance and recency into one score per paper.
 *
 * - PageRank is divided by the largest PageRank value (floor 1e-3).
 * - Relevance comes from computeRelevance() and is 0 when `topic` is empty.
 * - Recency is the paper's position between the oldest plausible year
 *   (> 1900) in the set and the current year, clamped to [0, 1]. Papers
 *   without a year are treated as published this year.
 *
 * @param {Array<object>} papers - Papers; entries without `paper_id` are skipped.
 * @param {Map<*, number>} pagerankScores - paper_id -> raw PageRank.
 * @param {*} corpus - Passed through to computeRelevance().
 * @param {string} [topic] - Query text; tokenized only when non-empty.
 * @param {{pagerankWeight: number, relevanceWeight: number, recencyWeight: number}} [weights]
 * @returns {Map<*, number>} paper_id -> composite score, capped at 1.
 */
function computeCompositeScores(papers, pagerankScores, corpus, topic, weights = DEFAULT_WEIGHTS) {
  const scores = /* @__PURE__ */ new Map();
  const currentYear = (/* @__PURE__ */ new Date()).getFullYear();

  // Oldest plausible publication year. Falls back to the current year when
  // no paper has one, instead of letting Math.min() of nothing be Infinity.
  let oldestYear = Infinity;
  for (const paper of papers) {
    const year = paper.year ?? currentYear;
    if (year > 1900 && year < oldestYear) oldestYear = year;
  }
  if (!Number.isFinite(oldestYear)) oldestYear = currentYear;
  const yearRange = Math.max(1, currentYear - oldestYear);

  const queryTokens = topic ? tokenize(topic) : [];

  // Plain loop rather than Math.max(...values) so large graphs cannot hit
  // the argument-count limit.
  let maxPagerank = 1e-3;
  for (const value of pagerankScores.values()) {
    if (value > maxPagerank) maxPagerank = value;
  }

  for (const paper of papers) {
    if (paper.paper_id === void 0) continue;
    const pr = (pagerankScores.get(paper.paper_id) ?? 0) / maxPagerank;
    const sourceId = paper.source_id || String(paper.paper_id);
    const relevance = queryTokens.length > 0 ? computeRelevance(corpus, sourceId, queryTokens) : 0;
    const paperYear = paper.year ?? currentYear;
    // Clamp: implausibly old years (e.g. 0) would otherwise produce large
    // negative scores, and future-dated papers would exceed 1.
    const recency = Math.min(1, Math.max(0, (paperYear - oldestYear) / yearRange));
    const composite = pr * weights.pagerankWeight + relevance * weights.relevanceWeight + recency * weights.recencyWeight;
    scores.set(paper.paper_id, Math.min(1, composite));
  }
  return scores;
}
|
|
1729
|
+
|
|
1730
|
+
// src/builder/graph-builder.ts
|
|
1731
|
+
// Logger used by the graph-builder functions below (buildGraph, traverseCitations).
var logger7 = getLogger();
|
|
1732
|
+
/**
 * Pick the paper-source adapter named by `config.source`.
 *
 * @param {{source?: string}} config
 * @returns {object} A SemanticScholarAdapter for "s2"; an OpenAlexAdapter
 *   for "openalex" and for any other value.
 */
function getSourceAdapter(config) {
  if (config.source === "s2") {
    return new SemanticScholarAdapter();
  }
  // "openalex" is also the fallback for unrecognised sources.
  return new OpenAlexAdapter();
}
|
|
1741
|
+
/**
 * Build the paper graph described by `config` and store it in the database
 * at `config.out`.
 *
 * Steps: find seed papers, traverse citations, build the text corpus, add
 * the derived edges selected by `config.spine` (similarity / co-citation /
 * coupling / hybrid), compute PageRank and Louvain clusters, store scores
 * and a run record.
 *
 * @param {object} config - Resolved configuration (topic, paper, doi,
 *   source, spine, depth, maxPapers, similarity, ranking, out, ...).
 * @returns {Promise<string>} `config.out`, including when no seeds are found.
 * @throws Rethrows any error from the pipeline; the database is closed first.
 */
async function buildGraph(config) {
  const db = new PaperGraphDatabase(config.out);
  // try/finally closes the database exactly once on every exit path
  // (early return, success, or error).
  try {
    const adapter = getSourceAdapter(config);
    logger7.info(
      { topic: config.topic, paper: config.paper, source: config.source, spine: config.spine, depth: config.depth, maxPapers: config.maxPapers },
      "Starting graph build"
    );
    const startTime = Date.now();

    const seedPapers = await findSeeds(adapter, config);
    logger7.info({ seedCount: seedPapers.length }, "Seeds found");
    if (seedPapers.length === 0) {
      logger7.warn("No seed papers found \u2014 nothing to build");
      return config.out;
    }
    const seedIds = db.insertPapers(seedPapers);
    for (let i = 0; i < seedPapers.length; i++) {
      seedPapers[i].paper_id = seedIds[i];
    }

    const allEdges = [];
    await traverseCitations(adapter, db, seedPapers, config, allEdges);
    const allPapers = db.getAllPapers();
    logger7.info({ paperCount: allPapers.length }, "Papers in database");
    const corpus = buildCorpus(allPapers);
    logger7.info({ corpusSize: corpus.size }, "TF-IDF corpus built");

    // source_id -> paper_id for similarity edges, and the reverse lookup
    // used when naming clusters (replaces a linear find() per member).
    const paperIdMap = /* @__PURE__ */ new Map();
    const sourceIdByPaperId = /* @__PURE__ */ new Map();
    for (const paper of allPapers) {
      if (paper.paper_id === void 0) continue;
      paperIdMap.set(paper.source_id, paper.paper_id);
      if (!sourceIdByPaperId.has(paper.paper_id)) {
        sourceIdByPaperId.set(paper.paper_id, paper.source_id);
      }
    }

    const spine = config.spine;
    if (spine === "similarity" || spine === "hybrid") {
      const simEdges = buildSimilarityEdges(
        paperIdMap,
        corpus,
        config.similarity?.topK ?? 10,
        config.similarity?.threshold ?? 0.25
      );
      db.insertEdges(simEdges);
      logger7.info({ simEdges: simEdges.length }, "Similarity edges added");
    }

    // Snapshot taken before the derived edges below are inserted; both
    // helpers only look at CITES edges.
    const dbEdges = db.getAllEdges();
    if (spine === "co-citation" || spine === "hybrid") {
      const coCiteEdges = computeCoCitation(dbEdges);
      db.insertEdges(coCiteEdges);
      logger7.info({ coCiteEdges: coCiteEdges.length }, "Co-citation edges added");
    }
    if (spine === "coupling" || spine === "hybrid") {
      const couplingEdges = computeBibCoupling(dbEdges);
      db.insertEdges(couplingEdges);
      logger7.info({ couplingEdges: couplingEdges.length }, "Bib coupling edges added");
    }

    const finalEdges = db.getAllEdges();
    const pagerankScores = computePageRank(allPapers, finalEdges);
    logger7.info({ nodes: pagerankScores.size }, "PageRank computed");

    const clusterMap = computeLouvainClusters(allPapers, finalEdges);
    const clusterNames = /* @__PURE__ */ new Map();
    for (const [communityId, paperIds] of clusterMap) {
      const sourceIds = paperIds.map((id) => sourceIdByPaperId.get(id)).filter((id) => !!id);
      const top = getTopTerms(corpus, sourceIds, 3);
      clusterNames.set(communityId, top.join(", ") || `Cluster ${communityId}`);
    }
    const { clusters, paperMappings } = buildClusterObjects(clusterMap, clusterNames);
    db.insertClusters(clusters, paperMappings);
    logger7.info({ clusterCount: clusters.length }, "Clusters stored");

    const topicQuery = config.topic ?? "";
    // NOTE(review): the composite scores are computed but never stored;
    // only raw PageRank is written below. Confirm whether the ranking
    // weights in the config are meant to affect the stored score.
    const compositeScores = computeCompositeScores(
      allPapers,
      pagerankScores,
      corpus,
      topicQuery,
      {
        pagerankWeight: config.ranking?.pagerankWeight ?? 0.5,
        relevanceWeight: config.ranking?.relevanceWeight ?? 0.3,
        recencyWeight: config.ranking?.recencyWeight ?? 0.2
      }
    );
    for (const [paperId, pr] of pagerankScores) {
      db.updatePaperScore(paperId, pr);
    }

    const stats = db.getStats();
    db.insertRun({
      created_at: (/* @__PURE__ */ new Date()).toISOString(),
      papergraph_version: "1.0.0",
      config_json: JSON.stringify(config),
      source: config.source,
      spine: config.spine,
      depth: config.depth,
      stats_json: JSON.stringify(stats)
    });
    const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
    logger7.info(
      { papers: stats.papers, edges: stats.edges, clusters: stats.clusters, elapsed: `${elapsed}s` },
      "Graph build complete"
    );
    return config.out;
  } finally {
    db.close();
  }
}
|
|
1845
|
+
/**
 * Collect the seed papers for a build from the topic search, the explicit
 * paper titles and the explicit DOIs in `config`.
 *
 * Papers are de-duplicated on `source:source_id` and returned in discovery
 * order (topic results, then titles, then DOIs). At most `seedLimit` papers
 * are returned, where seedLimit is 40% of `config.maxPapers` clamped to
 * [10, 200]. When the limit is exceeded, explicitly requested papers are
 * kept and topic results are dropped first.
 *
 * @param {object} adapter - Source adapter exposing searchByTopic,
 *   searchByTitle and fetchPaper.
 * @param {{maxPapers: number, topic?: string, paper?: string[], doi?: string[]}} config
 * @returns {Promise<Array<object>>} Seed papers, length <= seedLimit.
 */
async function findSeeds(adapter, config) {
  // Each entry remembers whether the user asked for that paper explicitly.
  const entries = [];
  const byKey = /* @__PURE__ */ new Map();
  const addUnique = (paper, pinned) => {
    const key = `${paper.source}:${paper.source_id}`;
    const existing = byKey.get(key);
    if (existing !== void 0) {
      // Already found by an earlier search: keep its position, but an
      // explicit request still protects it from truncation.
      if (pinned) existing.pinned = true;
      return;
    }
    const entry = { paper, pinned };
    byKey.set(key, entry);
    entries.push(entry);
  };

  const seedLimit = Math.max(10, Math.min(Math.floor(config.maxPapers * 0.4), 200));

  if (config.topic) {
    const results = await adapter.searchByTopic(config.topic, seedLimit);
    results.forEach((paper) => addUnique(paper, false));
  }
  if (config.paper?.length) {
    for (const title of config.paper) {
      const results = await adapter.searchByTitle(title, 5);
      // Only the best title match is used as a seed.
      if (results.length > 0) {
        addUnique(results[0], true);
      }
    }
  }
  if (config.doi?.length) {
    for (const doi of config.doi) {
      const paper = await adapter.fetchPaper(doi);
      if (paper) addUnique(paper, true);
    }
  }

  // A plain slice(0, seedLimit) would drop the explicit papers whenever the
  // topic search alone fills the limit, so the topic results give way.
  const pinnedCount = entries.filter((entry) => entry.pinned).length;
  let topicBudget = Math.max(0, seedLimit - pinnedCount);
  const kept = [];
  for (const entry of entries) {
    if (entry.pinned) {
      kept.push(entry.paper);
    } else if (topicBudget > 0) {
      kept.push(entry.paper);
      topicBudget -= 1;
    }
  }
  return kept.slice(0, seedLimit);
}
|
|
1876
|
+
/**
 * Breadth-first expansion of the citation graph from the seed papers.
 *
 * For each depth level the references of every frontier paper are fetched.
 * A reference already in the database only gains a CITES edge. A new
 * reference is inserted (while the paper count is below
 * `config.maxPapers`), linked, and queued for the next level. A failure on
 * one paper is logged and does not stop the traversal.
 *
 * @param {object} adapter - Source adapter exposing fetchReferences(sourceId, limit).
 * @param {object} db - Database exposing getPaperCount, getPaperBySourceId,
 *   insertPapers and insertEdges.
 * @param {Array<object>} seedPapers - Seeds, already carrying `paper_id`.
 * @param {{depth: number, maxPapers: number, maxRefsPerPaper: number}} config
 * @param {Array<object>} allEdges - Output: every created edge is appended here.
 * @returns {Promise<void>}
 */
async function traverseCitations(adapter, db, seedPapers, config, allEdges) {
  const { depth, maxPapers, maxRefsPerPaper: refsPerPaper } = config;
  const visited = /* @__PURE__ */ new Set(
    seedPapers.map((seed) => `${seed.source}:${seed.source_id}`)
  );
  const edgeSeen = /* @__PURE__ */ new Set();

  // Store a CITES edge once per (citing, cited) pair.
  const recordCitation = (citing, citedId, level) => {
    const edgeKey = `${citing.paper_id}->${citedId}`;
    if (edgeSeen.has(edgeKey)) return;
    edgeSeen.add(edgeKey);
    const edge = {
      src_paper_id: citing.paper_id,
      dst_paper_id: citedId,
      type: "CITES" /* CITES */,
      weight: 1,
      confidence: 1,
      rationale: null,
      evidence: null,
      created_by: "algo",
      provenance_json: JSON.stringify({ source: citing.source, depth: level })
    };
    db.insertEdges([edge]);
    allEdges.push(edge);
  };

  let frontier = [...seedPapers];
  for (let level = 0; level < depth; level++) {
    const nextFrontier = [];
    // Evaluated once per level; the per-reference check below re-reads the count.
    const atCapacity = db.getPaperCount() >= maxPapers;
    logger7.info(
      { depth: level, frontier: frontier.length, atCapacity, totalPapers: db.getPaperCount() },
      "Starting depth traversal"
    );

    for (const paper of frontier) {
      if (paper.paper_id === void 0) continue;
      try {
        const refs = await adapter.fetchReferences(paper.source_id, refsPerPaper);
        for (const ref of refs) {
          const key = `${ref.source}:${ref.source_id}`;
          const existingPaper = db.getPaperBySourceId(ref.source, ref.source_id);
          if (existingPaper && existingPaper.paper_id !== void 0) {
            recordCitation(paper, existingPaper.paper_id, level);
            continue;
          }
          if (visited.has(key) || atCapacity || !(db.getPaperCount() < maxPapers)) {
            continue;
          }
          visited.add(key);
          const [refId] = db.insertPapers([ref]);
          if (refId === void 0) continue;
          recordCitation(paper, refId, level);
          nextFrontier.push({ ...ref, paper_id: refId });
        }
      } catch (error) {
        logger7.warn({ paperId: paper.source_id, error }, "Failed to traverse paper");
      }
    }

    logger7.info(
      { depth: level + 1, frontier: nextFrontier.length, totalPapers: db.getPaperCount(), edgesCreated: allEdges.length },
      "Citation depth traversed"
    );
    if (nextFrontier.length === 0) break;
    frontier = nextFrontier;
  }
}
|
|
1955
|
+
|
|
1956
|
+
// src/exporters/export.ts
|
|
1957
|
+
import { writeFileSync } from "fs";
|
|
1958
|
+
// Logger used by exportGraph below.
var logger8 = getLogger();
|
|
1959
|
+
/**
 * Read the whole graph from the database at `dbPath` and write it to
 * `outputPath` in the requested format.
 *
 * @param {string} dbPath - Path handed to PaperGraphDatabase.
 * @param {string} outputPath - File to write (UTF-8).
 * @param {string} format - One of "json", "graphml", "gexf", "csv", "mermaid".
 * @throws {Error} "Unsupported export format: <format>" for any other value.
 *   The database is closed whether or not the export succeeds.
 */
function exportGraph(dbPath, outputPath, format) {
  const db = new PaperGraphDatabase(dbPath);
  try {
    const papers = db.getAllPapers();
    const edges = db.getAllEdges();
    const clusters = db.getAllClusters();
    const entities = db.getAllEntities();
    const data = { papers, edges, clusters, entities };

    // Thunks, so only the selected serializer is looked up and run.
    const serializers = /* @__PURE__ */ new Map([
      ["json", () => exportJson(data)],
      ["graphml", () => exportGraphML(data)],
      ["gexf", () => exportGEXF(data)],
      ["csv", () => exportCSV(data)],
      ["mermaid", () => exportMermaid(data)]
    ]);
    const serialize = serializers.get(format);
    if (serialize === void 0) {
      throw new Error(`Unsupported export format: ${format}`);
    }
    const content = serialize();

    writeFileSync(outputPath, content, "utf-8");
    logger8.info({ format, outputPath, papers: data.papers.length, edges: data.edges.length }, "Graph exported");
  } finally {
    db.close();
  }
}
|
|
1994
|
+
/**
 * Serialize the graph as pretty-printed JSON (2-space indent).
 *
 * The document has a `papergraph` header (version and export timestamp)
 * followed by trimmed-down `papers`, `edges`, `clusters` and `entities`
 * arrays. Abstracts are cut to their first 500 characters.
 *
 * @param {{papers: Array<object>, edges: Array<object>, clusters: Array<object>, entities: Array<object>}} data
 * @returns {string} JSON text.
 */
function exportJson(data) {
  const toPaper = (paper) => ({
    id: paper.paper_id,
    source: paper.source,
    source_id: paper.source_id,
    doi: paper.doi,
    title: paper.title,
    abstract: paper.abstract?.slice(0, 500),
    year: paper.year,
    venue: paper.venue,
    url: paper.url,
    citation_count: paper.citation_count,
    influence_score: paper.influence_score
  });
  const toEdge = (edge) => ({
    source: edge.src_paper_id,
    target: edge.dst_paper_id,
    type: edge.type,
    weight: edge.weight,
    confidence: edge.confidence
  });
  const toCluster = (cluster) => ({
    id: cluster.cluster_id,
    name: cluster.name,
    method: cluster.method
  });
  const toEntity = (entity) => ({
    id: entity.entity_id,
    type: entity.type,
    name: entity.name
  });

  const header = {
    version: "1.0.0",
    exported_at: (/* @__PURE__ */ new Date()).toISOString()
  };
  const document = {
    papergraph: header,
    papers: data.papers.map(toPaper),
    edges: data.edges.map(toEdge),
    clusters: data.clusters.map(toCluster),
    entities: data.entities.map(toEntity)
  };
  return JSON.stringify(document, null, 2);
}
|
|
2032
|
+
/**
 * Serialize the graph as a GraphML document.
 *
 * Papers become `<node id="n<paper_id>">` elements carrying title, year,
 * venue, citation_count, doi and influence; edges carry type and weight.
 *
 * @param {{papers: Array<object>, edges: Array<object>}} data
 * @returns {string} GraphML XML text.
 */
function exportGraphML(data) {
  // Escape text for XML content and attribute values. Control characters
  // that XML 1.0 forbids are dropped so one bad title cannot make the whole
  // document unparseable.
  const esc = (value) => String(value ?? "")
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
  // A <data> element, or nothing when the value is missing: an empty
  // element is not a valid value for an int/double key.
  const dataTag = (key, value) => value === null || value === void 0 ? "" : `      <data key="${key}">${esc(value)}</data>\n`;

  // The namespace must be the official GraphML one; readers that resolve
  // elements by namespace see no graph otherwise.
  let xml = `<?xml version="1.0" encoding="UTF-8"?>
<graphml xmlns="http://graphml.graphdrawing.org/xmlns"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
  <key id="title" for="node" attr.name="title" attr.type="string"/>
  <key id="year" for="node" attr.name="year" attr.type="int"/>
  <key id="venue" for="node" attr.name="venue" attr.type="string"/>
  <key id="citation_count" for="node" attr.name="citation_count" attr.type="int"/>
  <key id="doi" for="node" attr.name="doi" attr.type="string"/>
  <key id="influence" for="node" attr.name="influence" attr.type="double"/>
  <key id="type" for="edge" attr.name="type" attr.type="string"/>
  <key id="weight" for="edge" attr.name="weight" attr.type="double"/>
  <graph id="papergraph" edgedefault="directed">
`;
  for (const paper of data.papers) {
    xml += `    <node id="n${esc(paper.paper_id)}">\n`;
    xml += dataTag("title", paper.title ?? "");
    xml += dataTag("year", paper.year);
    xml += dataTag("venue", paper.venue ?? "");
    xml += dataTag("citation_count", paper.citation_count ?? 0);
    xml += dataTag("doi", paper.doi ?? "");
    xml += dataTag("influence", paper.influence_score ?? 0);
    xml += "    </node>\n";
  }
  for (const edge of data.edges) {
    xml += `    <edge source="n${esc(edge.src_paper_id)}" target="n${esc(edge.dst_paper_id)}">\n`;
    xml += dataTag("type", edge.type);
    xml += dataTag("weight", edge.weight);
    xml += "    </edge>\n";
  }
  xml += "  </graph>\n</graphml>";
  return xml;
}
|
|
2069
|
+
/**
 * Serialize the graph as a GEXF 1.3 document.
 *
 * Each paper becomes a node labelled with the first 60 characters of its
 * title and carrying title, year, venue, citations and influence
 * attributes. Each edge carries its weight and a `type` attribute.
 *
 * @param {{papers: Array<object>, edges: Array<object>}} data
 * @returns {string} GEXF XML text.
 */
function exportGEXF(data) {
  // Escape text for XML attribute values. Control characters that XML 1.0
  // forbids are dropped so one bad title cannot break the document.
  const esc = (value) => String(value ?? "")
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");

  let xml = `<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns="http://www.gexf.net/1.3"
      version="1.3">
  <meta>
    <creator>PaperGraph</creator>
    <description>Research paper connectivity graph</description>
  </meta>
  <graph defaultedgetype="directed">
    <attributes class="node">
      <attribute id="0" title="title" type="string"/>
      <attribute id="1" title="year" type="integer"/>
      <attribute id="2" title="venue" type="string"/>
      <attribute id="3" title="citations" type="integer"/>
      <attribute id="4" title="influence" type="float"/>
    </attributes>
    <attributes class="edge">
      <attribute id="0" title="type" type="string"/>
    </attributes>
    <nodes>
`;
  for (const paper of data.papers) {
    // Cut the label before escaping so an entity is never split in half.
    const label = String(paper.title ?? "").slice(0, 60);
    xml += `      <node id="${esc(paper.paper_id)}" label="${esc(label)}">
        <attvalues>
          <attvalue for="0" value="${esc(paper.title)}"/>
          <attvalue for="1" value="${esc(paper.year ?? 0)}"/>
          <attvalue for="2" value="${esc(paper.venue)}"/>
          <attvalue for="3" value="${esc(paper.citation_count ?? 0)}"/>
          <attvalue for="4" value="${esc(paper.influence_score ?? 0)}"/>
        </attvalues>
      </node>
`;
  }
  xml += `    </nodes>
    <edges>
`;
  let edgeIdx = 0;
  for (const edge of data.edges) {
    xml += `      <edge id="${edgeIdx++}" source="${esc(edge.src_paper_id)}" target="${esc(edge.dst_paper_id)}" weight="${esc(edge.weight)}">
        <attvalues>
          <attvalue for="0" value="${esc(edge.type)}"/>
        </attvalues>
      </edge>
`;
  }
  xml += `    </edges>
  </graph>
</gexf>`;
  return xml;
}
|
|
2120
|
+
/**
 * Serialize the graph as CSV text: a papers table, a blank line, a
 * "# EDGES" marker line, then an edges table.
 *
 * The doi, title and venue columns are always double-quoted with embedded
 * quotes doubled; the other columns are written as-is.
 *
 * @param {{papers: Array<object>, edges: Array<object>}} data
 * @returns {string} CSV text; every row ends with a newline.
 */
function exportCSV(data) {
  const quoted = (text) => `"${(text ?? "").replace(/"/g, '""')}"`;

  const paperRows = data.papers.map((paper) => {
    const cells = [
      paper.paper_id,
      paper.source,
      paper.source_id,
      quoted(paper.doi),
      quoted(paper.title),
      paper.year ?? "",
      quoted(paper.venue),
      paper.citation_count,
      paper.influence_score ?? ""
    ];
    return cells.join(",") + "\n";
  });
  const edgeRows = data.edges.map(
    (edge) => `${edge.src_paper_id},${edge.dst_paper_id},${edge.type},${edge.weight},${edge.confidence}\n`
  );

  return "paper_id,source,source_id,doi,title,year,venue,citation_count,influence_score\n" + paperRows.join("") + "\n# EDGES\nsrc_paper_id,dst_paper_id,type,weight,confidence\n" + edgeRows.join("");
}
|
|
2142
|
+
/**
 * Serialize the graph as a Mermaid "graph TD" diagram.
 *
 * Every paper becomes a node `P<paper_id>` labelled with the first 40
 * characters of its title, with double quotes replaced by single quotes.
 * Only the first 100 edges are drawn: CITES edges solid, every other type
 * dotted. A trailing comment reports how many edges were left out.
 *
 * @param {{papers: Array<object>, edges: Array<object>}} data
 * @returns {string} Mermaid diagram source.
 */
function exportMermaid(data) {
  const EDGE_CAP = 100;

  const nodeLines = data.papers.map((paper) => {
    const caption = (paper.title ?? "Untitled").slice(0, 40).replace(/"/g, "'");
    return ` P${paper.paper_id}["${caption}"]\n`;
  });
  const edgeLines = data.edges.slice(0, EDGE_CAP).map((edge) => {
    const arrow = edge.type === "CITES" ? "-->" : "-.->";
    return ` P${edge.src_paper_id} ${arrow} P${edge.dst_paper_id}\n`;
  });

  let diagram = "graph TD\n" + nodeLines.join("") + "\n" + edgeLines.join("");
  const omitted = data.edges.length - EDGE_CAP;
  if (omitted > 0) {
    diagram += `\n%% Note: ${omitted} additional edges omitted\n`;
  }
  return diagram;
}
|
|
2164
|
+
|
|
2165
|
+
// src/viewer/html-viewer.ts
|
|
2166
|
+
import { writeFileSync as writeFileSync2 } from "fs";
|
|
2167
|
+
// Logger used by generateViewer below.
var logger9 = getLogger();
|
|
2168
|
+
/**
 * Render the graph stored at `dbPath` into a single HTML viewer file.
 *
 * Loads papers, edges, clusters and the paper -> cluster assignments, turns
 * them into Cytoscape elements, wraps them in the viewer page and writes
 * the result to `outputPath`. The database is closed even when this fails.
 *
 * @param {string} dbPath - Path handed to PaperGraphDatabase.
 * @param {string} outputPath - HTML file to write (UTF-8).
 */
function generateViewer(dbPath, outputPath) {
  const db = new PaperGraphDatabase(dbPath);
  try {
    const papers = db.getAllPapers();
    const edges = db.getAllEdges();
    const clusters = db.getAllClusters();

    // paper_id -> cluster_id; when a paper has several rows the last one wins.
    const membershipRows = db.getRawDb().prepare("SELECT paper_id, cluster_id FROM paper_clusters").all();
    const paperCluster = /* @__PURE__ */ new Map(
      membershipRows.map((row) => [row.paper_id, row.cluster_id])
    );

    const html = buildHtml(
      buildCytoscapeData(papers, edges, clusters, paperCluster),
      papers.length,
      edges.length
    );
    writeFileSync2(outputPath, html, "utf-8");
    logger9.info({ outputPath, papers: papers.length, edges: edges.length }, "HTML viewer generated");
  } finally {
    db.close();
  }
}
|
|
2188
|
+
/**
 * Build the Cytoscape element list (nodes followed by edges) for the HTML
 * viewer and return it as JSON text that is safe inside an inline
 * `<script>` element.
 *
 * Nodes are coloured by cluster (palette assigned in cluster order,
 * wrapping around) and sized from `influence_score`, clamped to 20..60.
 * Edges are coloured by type, with a neutral grey for unknown types.
 *
 * @param {Array<object>} papers
 * @param {Array<object>} edges
 * @param {Array<object>} clusters - Records carrying `cluster_id`.
 * @param {Map<*, *>} paperCluster - paper_id -> cluster_id.
 * @returns {string} JSON array text. `<`, `>`, `&`, U+2028 and U+2029 are
 *   written as \uXXXX escapes, which parse back to the same characters.
 */
function buildCytoscapeData(papers, edges, clusters, paperCluster) {
  const DEFAULT_NODE_COLOR = "#6366f1";
  const DEFAULT_EDGE_COLOR = "#64748b";
  const colors = [
    "#6366f1",
    "#f43f5e",
    "#10b981",
    "#f59e0b",
    "#3b82f6",
    "#8b5cf6",
    "#ec4899",
    "#14b8a6",
    "#ef4444",
    "#06b6d4",
    "#84cc16",
    "#a855f7",
    "#f97316",
    "#22d3ee",
    "#e879f9"
  ];

  // Palette entries go to clusters in order, wrapping when there are more
  // clusters than colours.
  const clusterColor = /* @__PURE__ */ new Map();
  let colorIdx = 0;
  for (const c of clusters) {
    if (c.cluster_id !== void 0) {
      clusterColor.set(c.cluster_id, colors[colorIdx % colors.length]);
      colorIdx++;
    }
  }

  const nodes = papers.map((p) => {
    const clusterId = paperCluster.get(p.paper_id);
    const color = clusterId !== void 0 ? clusterColor.get(clusterId) ?? DEFAULT_NODE_COLOR : DEFAULT_NODE_COLOR;
    // influence_score is scaled by 1000 before clamping to 20..60.
    const size = Math.max(20, Math.min(60, 20 + (p.influence_score ?? 0) * 1e3));
    return {
      data: {
        id: `p${p.paper_id}`,
        label: (p.title ?? "Untitled").slice(0, 50),
        title: p.title,
        year: p.year,
        venue: p.venue,
        doi: p.doi,
        url: p.url,
        citations: p.citation_count,
        influence: p.influence_score,
        cluster: clusterId,
        color,
        size
      }
    };
  });

  const edgeTypeColors = {
    CITES: "#64748b",
    SIMILAR_TEXT: "#6366f1",
    CO_CITED: "#10b981",
    BIB_COUPLED: "#f59e0b",
    EXTENDS: "#3b82f6",
    CONTRADICTS: "#ef4444",
    REVIEWS: "#8b5cf6",
    REPLICATES: "#ec4899",
    USES_DATA: "#14b8a6",
    SHARES_METHOD: "#f97316",
    SAME_VENUE: "#84cc16",
    SAME_AUTHOR: "#a855f7",
    LLM_SEMANTIC: "#22d3ee",
    LLM_METHODOLOGICAL: "#e879f9",
    LLM_BUILDS_ON: "#06b6d4",
    LLM_DISAGREES_WITH: "#f43f5e"
  };
  const cyEdges = edges.map((e, i) => ({
    data: {
      id: `e${i}`,
      source: `p${e.src_paper_id}`,
      target: `p${e.dst_paper_id}`,
      type: e.type,
      weight: e.weight,
      color: edgeTypeColors[e.type] ?? DEFAULT_EDGE_COLOR
    }
  }));

  // The viewer page interpolates this text directly into a <script> block.
  // Paper metadata comes from external APIs, so a title containing
  // "</script>" would otherwise end the script early. These characters only
  // occur inside JSON strings, so escaping them does not change the data.
  return JSON.stringify([...nodes, ...cyEdges]).replace(
    /[<>&\u2028\u2029]/g,
    (ch) => `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`
  );
}
|
|
2265
|
+
function buildHtml(graphData, paperCount, edgeCount) {
|
|
2266
|
+
return `<!DOCTYPE html>
|
|
2267
|
+
<html lang="en">
|
|
2268
|
+
<head>
|
|
2269
|
+
<meta charset="UTF-8">
|
|
2270
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
2271
|
+
<title>PaperGraph Viewer</title>
|
|
2272
|
+
<script src="https://unpkg.com/cytoscape@3.30.4/dist/cytoscape.min.js"></script>
|
|
2273
|
+
<script src="https://unpkg.com/elkjs@0.9.3/lib/elk.bundled.js"></script>
|
|
2274
|
+
<script src="https://unpkg.com/cytoscape-elk@2.2.0/dist/cytoscape-elk.js"></script>
|
|
2275
|
+
<style>
|
|
2276
|
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
2277
|
+
body {
|
|
2278
|
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
|
|
2279
|
+
background: #0f172a;
|
|
2280
|
+
color: #e2e8f0;
|
|
2281
|
+
height: 100vh;
|
|
2282
|
+
overflow: hidden;
|
|
2283
|
+
}
|
|
2284
|
+
#cy {
|
|
2285
|
+
width: 100%;
|
|
2286
|
+
height: 100vh;
|
|
2287
|
+
position: absolute;
|
|
2288
|
+
top: 0;
|
|
2289
|
+
left: 0;
|
|
2290
|
+
}
|
|
2291
|
+
.panel {
|
|
2292
|
+
position: absolute;
|
|
2293
|
+
background: rgba(15, 23, 42, 0.85);
|
|
2294
|
+
backdrop-filter: blur(16px);
|
|
2295
|
+
border: 1px solid rgba(100, 116, 139, 0.3);
|
|
2296
|
+
border-radius: 12px;
|
|
2297
|
+
padding: 16px;
|
|
2298
|
+
z-index: 10;
|
|
2299
|
+
}
|
|
2300
|
+
.header {
|
|
2301
|
+
top: 16px;
|
|
2302
|
+
left: 16px;
|
|
2303
|
+
display: flex;
|
|
2304
|
+
align-items: center;
|
|
2305
|
+
gap: 12px;
|
|
2306
|
+
}
|
|
2307
|
+
.header h1 {
|
|
2308
|
+
font-size: 18px;
|
|
2309
|
+
font-weight: 700;
|
|
2310
|
+
background: linear-gradient(135deg, #6366f1, #a855f7);
|
|
2311
|
+
-webkit-background-clip: text;
|
|
2312
|
+
-webkit-text-fill-color: transparent;
|
|
2313
|
+
}
|
|
2314
|
+
.stats {
|
|
2315
|
+
font-size: 12px;
|
|
2316
|
+
color: #94a3b8;
|
|
2317
|
+
}
|
|
2318
|
+
.search-panel {
|
|
2319
|
+
top: 16px;
|
|
2320
|
+
right: 16px;
|
|
2321
|
+
width: 300px;
|
|
2322
|
+
}
|
|
2323
|
+
.search-panel input {
|
|
2324
|
+
width: 100%;
|
|
2325
|
+
padding: 8px 12px;
|
|
2326
|
+
background: rgba(30, 41, 59, 0.8);
|
|
2327
|
+
border: 1px solid rgba(100, 116, 139, 0.3);
|
|
2328
|
+
border-radius: 8px;
|
|
2329
|
+
color: #e2e8f0;
|
|
2330
|
+
font-size: 14px;
|
|
2331
|
+
outline: none;
|
|
2332
|
+
}
|
|
2333
|
+
.search-panel input:focus {
|
|
2334
|
+
border-color: #6366f1;
|
|
2335
|
+
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.15);
|
|
2336
|
+
}
|
|
2337
|
+
.detail-panel {
|
|
2338
|
+
bottom: 16px;
|
|
2339
|
+
right: 16px;
|
|
2340
|
+
width: 360px;
|
|
2341
|
+
max-height: 50vh;
|
|
2342
|
+
overflow-y: auto;
|
|
2343
|
+
display: none;
|
|
2344
|
+
}
|
|
2345
|
+
.detail-panel.active { display: block; }
|
|
2346
|
+
.detail-panel h3 {
|
|
2347
|
+
font-size: 15px;
|
|
2348
|
+
font-weight: 600;
|
|
2349
|
+
margin-bottom: 8px;
|
|
2350
|
+
line-height: 1.3;
|
|
2351
|
+
}
|
|
2352
|
+
.detail-field {
|
|
2353
|
+
display: flex;
|
|
2354
|
+
justify-content: space-between;
|
|
2355
|
+
font-size: 12px;
|
|
2356
|
+
padding: 4px 0;
|
|
2357
|
+
border-bottom: 1px solid rgba(100, 116, 139, 0.15);
|
|
2358
|
+
}
|
|
2359
|
+
.detail-field .label { color: #94a3b8; }
|
|
2360
|
+
.detail-field .value { color: #e2e8f0; font-weight: 500; }
|
|
2361
|
+
.detail-field a { color: #6366f1; text-decoration: none; }
|
|
2362
|
+
.detail-field a:hover { text-decoration: underline; }
|
|
2363
|
+
.legend {
|
|
2364
|
+
bottom: 16px;
|
|
2365
|
+
left: 16px;
|
|
2366
|
+
font-size: 11px;
|
|
2367
|
+
}
|
|
2368
|
+
.legend-item {
|
|
2369
|
+
display: flex;
|
|
2370
|
+
align-items: center;
|
|
2371
|
+
gap: 6px;
|
|
2372
|
+
margin: 3px 0;
|
|
2373
|
+
}
|
|
2374
|
+
.legend-dot {
|
|
2375
|
+
width: 8px;
|
|
2376
|
+
height: 8px;
|
|
2377
|
+
border-radius: 50%;
|
|
2378
|
+
flex-shrink: 0;
|
|
2379
|
+
}
|
|
2380
|
+
.controls {
|
|
2381
|
+
top: 80px;
|
|
2382
|
+
right: 16px;
|
|
2383
|
+
display: flex;
|
|
2384
|
+
gap: 6px;
|
|
2385
|
+
}
|
|
2386
|
+
.btn {
|
|
2387
|
+
padding: 6px 12px;
|
|
2388
|
+
background: rgba(30, 41, 59, 0.8);
|
|
2389
|
+
border: 1px solid rgba(100, 116, 139, 0.3);
|
|
2390
|
+
border-radius: 6px;
|
|
2391
|
+
color: #e2e8f0;
|
|
2392
|
+
font-size: 12px;
|
|
2393
|
+
cursor: pointer;
|
|
2394
|
+
transition: all 0.15s;
|
|
2395
|
+
}
|
|
2396
|
+
.btn:hover {
|
|
2397
|
+
background: rgba(99, 102, 241, 0.2);
|
|
2398
|
+
border-color: #6366f1;
|
|
2399
|
+
}
|
|
2400
|
+
</style>
|
|
2401
|
+
</head>
|
|
2402
|
+
<body>
|
|
2403
|
+
<div id="cy"></div>
|
|
2404
|
+
|
|
2405
|
+
<div class="panel header">
|
|
2406
|
+
<h1>PaperGraph</h1>
|
|
2407
|
+
<span class="stats">${paperCount} papers \xB7 ${edgeCount} edges</span>
|
|
2408
|
+
</div>
|
|
2409
|
+
|
|
2410
|
+
<div class="panel search-panel">
|
|
2411
|
+
<input type="text" id="search" placeholder="Search papers..." autocomplete="off" />
|
|
2412
|
+
</div>
|
|
2413
|
+
|
|
2414
|
+
<div class="panel controls">
|
|
2415
|
+
<button class="btn" id="fitBtn">Fit</button>
|
|
2416
|
+
<button class="btn" id="layoutBtn">Re-layout</button>
|
|
2417
|
+
<button class="btn" id="resetBtn">Reset</button>
|
|
2418
|
+
</div>
|
|
2419
|
+
|
|
2420
|
+
<div class="panel detail-panel" id="detail">
|
|
2421
|
+
<h3 id="detail-title"></h3>
|
|
2422
|
+
<div id="detail-fields"></div>
|
|
2423
|
+
</div>
|
|
2424
|
+
|
|
2425
|
+
<div class="panel legend">
|
|
2426
|
+
<div class="legend-item"><div class="legend-dot" style="background:#64748b"></div> Cites</div>
|
|
2427
|
+
<div class="legend-item"><div class="legend-dot" style="background:#6366f1"></div> Similar</div>
|
|
2428
|
+
<div class="legend-item"><div class="legend-dot" style="background:#10b981"></div> Co-cited</div>
|
|
2429
|
+
<div class="legend-item"><div class="legend-dot" style="background:#f59e0b"></div> Coupled</div>
|
|
2430
|
+
</div>
|
|
2431
|
+
|
|
2432
|
+
<script>
|
|
2433
|
+
const graphData = ${graphData};
|
|
2434
|
+
|
|
2435
|
+
const cy = cytoscape({
|
|
2436
|
+
container: document.getElementById('cy'),
|
|
2437
|
+
elements: graphData,
|
|
2438
|
+
style: [
|
|
2439
|
+
{
|
|
2440
|
+
selector: 'node',
|
|
2441
|
+
style: {
|
|
2442
|
+
'label': 'data(label)',
|
|
2443
|
+
'background-color': 'data(color)',
|
|
2444
|
+
'width': 'data(size)',
|
|
2445
|
+
'height': 'data(size)',
|
|
2446
|
+
'font-size': '8px',
|
|
2447
|
+
'color': '#e2e8f0',
|
|
2448
|
+
'text-outline-color': '#0f172a',
|
|
2449
|
+
'text-outline-width': 2,
|
|
2450
|
+
'text-valign': 'bottom',
|
|
2451
|
+
'text-margin-y': 5,
|
|
2452
|
+
'text-max-width': '100px',
|
|
2453
|
+
'text-wrap': 'ellipsis',
|
|
2454
|
+
'border-width': 2,
|
|
2455
|
+
'border-color': 'data(color)',
|
|
2456
|
+
'border-opacity': 0.6,
|
|
2457
|
+
},
|
|
2458
|
+
},
|
|
2459
|
+
{
|
|
2460
|
+
selector: 'edge',
|
|
2461
|
+
style: {
|
|
2462
|
+
'width': function(e) { return Math.max(1, e.data('weight') * 3); },
|
|
2463
|
+
'line-color': 'data(color)',
|
|
2464
|
+
'target-arrow-color': 'data(color)',
|
|
2465
|
+
'target-arrow-shape': 'triangle',
|
|
2466
|
+
'curve-style': 'bezier',
|
|
2467
|
+
'opacity': 0.5,
|
|
2468
|
+
'arrow-scale': 0.8,
|
|
2469
|
+
},
|
|
2470
|
+
},
|
|
2471
|
+
{
|
|
2472
|
+
selector: 'node:selected',
|
|
2473
|
+
style: {
|
|
2474
|
+
'border-width': 4,
|
|
2475
|
+
'border-color': '#f59e0b',
|
|
2476
|
+
'border-opacity': 1,
|
|
2477
|
+
},
|
|
2478
|
+
},
|
|
2479
|
+
{
|
|
2480
|
+
selector: '.highlighted',
|
|
2481
|
+
style: {
|
|
2482
|
+
'opacity': 1,
|
|
2483
|
+
'border-width': 3,
|
|
2484
|
+
'border-color': '#f59e0b',
|
|
2485
|
+
},
|
|
2486
|
+
},
|
|
2487
|
+
{
|
|
2488
|
+
selector: '.faded',
|
|
2489
|
+
style: { 'opacity': 0.15 },
|
|
2490
|
+
},
|
|
2491
|
+
],
|
|
2492
|
+
layout: {
|
|
2493
|
+
name: 'cose',
|
|
2494
|
+
animate: false,
|
|
2495
|
+
nodeRepulsion: 8000,
|
|
2496
|
+
idealEdgeLength: 120,
|
|
2497
|
+
nodeOverlap: 20,
|
|
2498
|
+
},
|
|
2499
|
+
wheelSensitivity: 0.3,
|
|
2500
|
+
});
|
|
2501
|
+
|
|
2502
|
+
// Detail panel
|
|
2503
|
+
cy.on('tap', 'node', function(evt) {
|
|
2504
|
+
const d = evt.target.data();
|
|
2505
|
+
const panel = document.getElementById('detail');
|
|
2506
|
+
panel.classList.add('active');
|
|
2507
|
+
document.getElementById('detail-title').textContent = d.title;
|
|
2508
|
+
|
|
2509
|
+
const fields = [
|
|
2510
|
+
['Year', d.year],
|
|
2511
|
+
['Venue', d.venue],
|
|
2512
|
+
['Citations', d.citations],
|
|
2513
|
+
['Influence', d.influence?.toFixed(6)],
|
|
2514
|
+
['DOI', d.doi ? '<a href="https://doi.org/' + d.doi + '" target="_blank">' + d.doi + '</a>' : '\u2014'],
|
|
2515
|
+
['URL', d.url ? '<a href="' + d.url + '" target="_blank">Open</a>' : '\u2014'],
|
|
2516
|
+
];
|
|
2517
|
+
|
|
2518
|
+
document.getElementById('detail-fields').innerHTML = fields
|
|
2519
|
+
.map(([l, v]) => '<div class="detail-field"><span class="label">' + l + '</span><span class="value">' + (v ?? '\u2014') + '</span></div>')
|
|
2520
|
+
.join('');
|
|
2521
|
+
});
|
|
2522
|
+
|
|
2523
|
+
cy.on('tap', function(evt) {
|
|
2524
|
+
if (evt.target === cy) {
|
|
2525
|
+
document.getElementById('detail').classList.remove('active');
|
|
2526
|
+
cy.elements().removeClass('highlighted faded');
|
|
2527
|
+
}
|
|
2528
|
+
});
|
|
2529
|
+
|
|
2530
|
+
// Highlight neighbors on node click
|
|
2531
|
+
cy.on('select', 'node', function(evt) {
|
|
2532
|
+
const node = evt.target;
|
|
2533
|
+
const neighborhood = node.closedNeighborhood();
|
|
2534
|
+
cy.elements().addClass('faded');
|
|
2535
|
+
neighborhood.removeClass('faded').addClass('highlighted');
|
|
2536
|
+
});
|
|
2537
|
+
|
|
2538
|
+
cy.on('unselect', 'node', function() {
|
|
2539
|
+
cy.elements().removeClass('highlighted faded');
|
|
2540
|
+
});
|
|
2541
|
+
|
|
2542
|
+
// Search
|
|
2543
|
+
document.getElementById('search').addEventListener('input', function(e) {
|
|
2544
|
+
const q = e.target.value.toLowerCase().trim();
|
|
2545
|
+
if (!q) {
|
|
2546
|
+
cy.elements().removeClass('highlighted faded');
|
|
2547
|
+
return;
|
|
2548
|
+
}
|
|
2549
|
+
cy.elements().addClass('faded');
|
|
2550
|
+
cy.nodes().filter(n => {
|
|
2551
|
+
const d = n.data();
|
|
2552
|
+
return (d.title || '').toLowerCase().includes(q) ||
|
|
2553
|
+
(d.venue || '').toLowerCase().includes(q) ||
|
|
2554
|
+
(d.doi || '').toLowerCase().includes(q);
|
|
2555
|
+
}).removeClass('faded').addClass('highlighted');
|
|
2556
|
+
});
|
|
2557
|
+
|
|
2558
|
+
// Buttons
|
|
2559
|
+
document.getElementById('fitBtn').addEventListener('click', () => cy.fit(50));
|
|
2560
|
+
document.getElementById('layoutBtn').addEventListener('click', () => {
|
|
2561
|
+
cy.layout({ name: 'cose', animate: true, animationDuration: 500, nodeRepulsion: 8000 }).run();
|
|
2562
|
+
});
|
|
2563
|
+
document.getElementById('resetBtn').addEventListener('click', () => {
|
|
2564
|
+
cy.elements().removeClass('highlighted faded');
|
|
2565
|
+
document.getElementById('search').value = '';
|
|
2566
|
+
document.getElementById('detail').classList.remove('active');
|
|
2567
|
+
cy.fit(50);
|
|
2568
|
+
});
|
|
2569
|
+
</script>
|
|
2570
|
+
</body>
|
|
2571
|
+
</html>`;
|
|
2572
|
+
}
|
|
2573
|
+
|
|
2574
|
+
// src/cli/index.ts
|
|
2575
|
+
// Root CLI definition: the version string reported by `--version` and the
// top-level commander program that every subcommand below is attached to.
var VERSION = "1.0.0";
var program = new Command();
program.name("papergraph");
program.description("Build research-paper connectivity graphs from topics, keywords, or paper titles.");
program.version(VERSION);
|
|
2578
|
+
// `papergraph build`: gather papers for a topic and write the graph database.
// Commander delivers every option value as a string, so numeric flags are
// parsed and validated here before the config is resolved.
program
  .command("build")
  .description("Build a graph database from papers")
  .requiredOption("-t, --topic <topic>", "Search topic")
  .option("-p, --paper <titles...>", "Paper titles to seed")
  .option("--doi <dois...>", "DOIs to seed")
  .option("-s, --source <source>", "Data source: openalex | s2", "openalex")
  .option("--spine <spine>", "Graph spine: citation | similarity | co-citation | coupling | hybrid", "citation")
  .option("-d, --depth <n>", "Citation traversal depth", "2")
  .option("-m, --max-papers <n>", "Maximum papers to collect", "200")
  .option("-o, --out <path>", "Output database path", "./papergraph.db")
  .option("--max-refs <n>", "Max references per paper", "20")
  .option("--max-cites <n>", "Max citations per paper", "20")
  .option("--year-from <year>", "Filter papers from year")
  .option("--year-to <year>", "Filter papers to year")
  .option("--log-level <level>", "Log level: debug | info | warn | error", "info")
  .option("--json-logs", "Output JSON logs", false)
  .option("--no-cache", "Disable response caching")
  .action(async (opts) => {
    // Parse an integer flag; abort with a clear message instead of letting
    // NaN flow silently into the build configuration.
    const toInteger = (flag, raw) => {
      const parsed = Number.parseInt(raw, 10);
      if (Number.isNaN(parsed)) {
        console.error(`Invalid value for ${flag}: expected an integer, got "${raw}"`);
        process.exit(1);
      }
      return parsed;
    };

    const cliConfig = {
      topic: opts.topic,
      paper: opts.paper,
      doi: opts.doi,
      source: opts.source,
      spine: opts.spine,
      depth: toInteger("--depth", opts.depth),
      maxPapers: toInteger("--max-papers", opts.maxPapers),
      maxRefsPerPaper: toInteger("--max-refs", opts.maxRefs),
      maxCitesPerPaper: toInteger("--max-cites", opts.maxCites),
      out: opts.out,
      // Year filters are optional; leave them undefined when not supplied.
      yearFrom: opts.yearFrom ? toInteger("--year-from", opts.yearFrom) : void 0,
      yearTo: opts.yearTo ? toInteger("--year-to", opts.yearTo) : void 0,
      logLevel: opts.logLevel,
      jsonLogs: opts.jsonLogs,
      // `--no-cache` is a negatable flag: commander exposes it as `opts.cache`
      // (true unless the flag is passed), so invert it for `noCache`.
      noCache: !opts.cache
    };

    const config = await resolveConfig(cliConfig);
    initLogger({ level: config.logLevel, jsonLogs: config.jsonLogs });
    getHttpClient({ timeout: 3e4 });
    const logger10 = getLogger();
    logger10.info({ topic: config.topic, source: config.source, spine: config.spine }, "Starting build");
    try {
      const dbPath = await buildGraph(config);
      logger10.info({ dbPath }, "Build complete!");
    } catch (error) {
      logger10.error({ error }, "Build failed");
      process.exit(1);
    }
  });
|
|
2609
|
+
// `papergraph export`: write the graph stored in a database to one of the
// supported interchange formats.
program
  .command("export")
  .description("Export graph to JSON, GraphML, GEXF, CSV, or Mermaid")
  .requiredOption("-i, --input <dbPath>", "Input database path")
  .requiredOption("-f, --format <format>", "Export format: json | graphml | gexf | csv | mermaid")
  .option("-o, --out <path>", "Output file path")
  .action((opts) => {
    const format = opts.format.toLowerCase();
    const validFormats = ["json", "graphml", "gexf", "csv", "mermaid"];
    if (!validFormats.includes(format)) {
      console.error(`Invalid format: ${format}. Valid: ${validFormats.join(", ")}`);
      process.exit(1);
    }
    const extensions = {
      json: ".json",
      graphml: ".graphml",
      gexf: ".gexf",
      csv: ".csv",
      mermaid: ".md"
    };

    // Derive the default output path from the input path. Only a trailing
    // ".db" is swapped for the format extension; otherwise the extension is
    // appended. This guarantees the default never equals the input path, so
    // the export cannot overwrite the database, and a ".db" earlier in the
    // path (e.g. a directory name) is left untouched.
    const dbSuffix = ".db";
    const stem = opts.input.endsWith(dbSuffix) ? opts.input.slice(0, -dbSuffix.length) : opts.input;
    const outputPath = opts.out ?? `${stem}${extensions[format]}`;

    try {
      exportGraph(opts.input, outputPath, format);
      console.log(`Exported to ${outputPath}`);
    } catch (error) {
      console.error("Export failed:", error);
      process.exit(1);
    }
  });
|
|
2632
|
+
// `papergraph view`: render the graph stored in a database as a standalone
// HTML file.
program
  .command("view")
  .description("Generate a self-contained HTML viewer")
  .requiredOption("-i, --input <dbPath>", "Input database path")
  .option("-o, --out <path>", "Output HTML file path")
  .action((opts) => {
    // Default output path: swap a trailing ".db" for ".html", or append
    // ".html" when the input has a different suffix. The default therefore
    // never equals the input path, so the viewer cannot overwrite the
    // database, and a ".db" earlier in the path is left untouched.
    const dbSuffix = ".db";
    const stem = opts.input.endsWith(dbSuffix) ? opts.input.slice(0, -dbSuffix.length) : opts.input;
    const outputPath = opts.out ?? `${stem}.html`;

    try {
      generateViewer(opts.input, outputPath);
      console.log(`Viewer generated: ${outputPath}`);
    } catch (error) {
      console.error("View generation failed:", error);
      process.exit(1);
    }
  });
|
|
2642
|
+
// `papergraph inspect`: print row counts and the per-type edge breakdown
// reported by the database's getStats().
program
  .command("inspect")
  .description("Show database statistics")
  .requiredOption("-i, --input <dbPath>", "Input database path")
  .action((opts) => {
    try {
      const db = new PaperGraphDatabase(opts.input);
      let stats;
      try {
        stats = db.getStats();
      } finally {
        // Release the database handle even when the stats query throws.
        db.close();
      }

      console.log("\n\u{1F4CA} PaperGraph Database Statistics\n");
      console.log(` Papers: ${stats.papers}`);
      console.log(` Edges: ${stats.edges}`);
      console.log(` Clusters: ${stats.clusters}`);
      console.log(` Entities: ${stats.entities}`);
      console.log(` Runs: ${stats.runs}`);

      const edgeTypeCounts = Object.entries(stats.edgesByType);
      if (edgeTypeCounts.length > 0) {
        console.log("\n Edge Types:");
        for (const [type, count] of edgeTypeCounts) {
          console.log(` ${type}: ${count}`);
        }
      }
      console.log("");
    } catch (error) {
      console.error("Inspect failed:", error);
      process.exit(1);
    }
  });
|
|
2665
|
+
// `papergraph cache <clear|stats>`: manage the on-disk response cache kept in
// `.papergraph-cache` under the current working directory.
//
// This file is an ES module, where `require` is not defined, so the bundler's
// `__require` shim throws on every call. Node built-ins are loaded with
// dynamic `import()` instead; previously both actions always fell through to
// their "no cache" message without touching the filesystem.
program
  .command("cache")
  .description("Manage the response cache")
  .argument("<action>", "Action: clear | stats")
  .action(async (action) => {
    const cacheDir = ".papergraph-cache";
    switch (action) {
      case "clear": {
        try {
          const { existsSync, rmSync } = await import("node:fs");
          if (!existsSync(cacheDir)) {
            console.log("No cache to clear.");
            break;
          }
          rmSync(cacheDir, { recursive: true, force: true });
          console.log("Cache cleared.");
        } catch (error) {
          // A real failure (e.g. permissions) must not be reported as
          // "nothing to clear".
          console.error("Failed to clear cache:", error);
          process.exit(1);
        }
        break;
      }
      case "stats": {
        try {
          const { readdirSync, statSync } = await import("node:fs");
          const { join } = await import("node:path");
          const files = readdirSync(cacheDir);
          let totalSize = 0;
          for (const f of files) {
            totalSize += statSync(join(cacheDir, f)).size;
          }
          console.log(`Cache: ${files.length} entries, ${(totalSize / 1024).toFixed(1)} KB`);
        } catch (error) {
          // Only a missing cache directory means "no cache"; surface
          // anything else.
          if (error?.code === "ENOENT") {
            console.log("No cache found.");
          } else {
            console.error("Failed to read cache:", error);
            process.exit(1);
          }
        }
        break;
      }
      default:
        console.error(`Unknown action: ${action}. Valid: clear, stats`);
        process.exit(1);
    }
  });
|
|
2695
|
+
// Some subcommands register async action handlers. parseAsync() awaits the
// promise they return, so a rejection propagates to the module's top level
// instead of being dropped by the synchronous parse().
await program.parseAsync();
|