@djolex999/vir-cli 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli.js +73 -4
- package/dist/cli.js.map +1 -1
- package/dist/config.js +15 -0
- package/dist/config.js.map +1 -1
- package/dist/mcp/server.js +21 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/output/json.js +6 -0
- package/dist/output/json.js.map +1 -1
- package/dist/pipeline/embeddingSweep.js +31 -1
- package/dist/pipeline/embeddingSweep.js.map +1 -1
- package/dist/pipeline/pdfDistiller.js +141 -0
- package/dist/pipeline/pdfDistiller.js.map +1 -0
- package/dist/pipeline/pdfReader.js +76 -0
- package/dist/pipeline/pdfReader.js.map +1 -0
- package/dist/pipeline/run.js +151 -5
- package/dist/pipeline/run.js.map +1 -1
- package/dist/pipeline/writer.js +52 -1
- package/dist/pipeline/writer.js.map +1 -1
- package/dist/search/retriever.js +4 -3
- package/dist/search/retriever.js.map +1 -1
- package/dist/state/db.js +143 -0
- package/dist/state/db.js.map +1 -1
- package/package.json +2 -1
package/dist/output/json.js
CHANGED
|
@@ -15,6 +15,7 @@ const CATEGORY_DIRS = {
|
|
|
15
15
|
tools: "tool",
|
|
16
16
|
articles: "article",
|
|
17
17
|
topics: "topic",
|
|
18
|
+
pdfs: "pdf",
|
|
18
19
|
};
|
|
19
20
|
const WIRE_CATEGORIES = new Set([
|
|
20
21
|
"pattern",
|
|
@@ -23,6 +24,7 @@ const WIRE_CATEGORIES = new Set([
|
|
|
23
24
|
"tool",
|
|
24
25
|
"article",
|
|
25
26
|
"topic",
|
|
27
|
+
"pdf",
|
|
26
28
|
]);
|
|
27
29
|
// Minimal YAML-block parser, kebab-flat. Mirrors mcp/server.ts deliberately —
|
|
28
30
|
// the JSON contract is its own isolated surface and must not couple to the MCP
|
|
@@ -58,6 +60,10 @@ function categoryOf(fm, relPath, topicsDir) {
|
|
|
58
60
|
return "article";
|
|
59
61
|
if (fm.type === "topic")
|
|
60
62
|
return "topic";
|
|
63
|
+
// PDF sub-taxonomy (paper/reference/notes/other) collapses to the single
|
|
64
|
+
// "pdf" wire bucket, exactly like articles collapse to "article".
|
|
65
|
+
if (fm.type === "pdf")
|
|
66
|
+
return "pdf";
|
|
61
67
|
if (fm.category && WIRE_CATEGORIES.has(fm.category)) {
|
|
62
68
|
return fm.category;
|
|
63
69
|
}
|
package/dist/output/json.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AA4CrC,+EAA+E;AAC/E,4EAA4E;AAC5E,+EAA+E;AAC/E,MAAM,aAAa,GAAqC;IACtD,QAAQ,EAAE,SAAS;IACnB,OAAO,EAAE,QAAQ;IACjB,SAAS,EAAE,UAAU;IACrB,KAAK,EAAE,MAAM;IACb,QAAQ,EAAE,SAAS;IACnB,MAAM,EAAE,OAAO;IACf,IAAI,EAAE,KAAK;CACZ,CAAC;AAEF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAS;IACtC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAEH,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACrB,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,KAAK,CAAC,CAAC;YAAE,SAAS;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC/B,IAAI,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,IACE,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC1C,CAAC;YACD,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,OAAO,CAAC,OAAe;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;IAC3D,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU,CACjB,EAA0B,EAC1B,OAAe,EACf,SAAiB;IAEjB,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO;QAAE,OAAO,OAAO,CAAC;IACxC,yEAAyE;IACzE,kEAAkE;IAClE,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;QAAE,OAAO,KAAK,CAAC;IACpC,IAAI,EAAE,CAAC,QAAQ,IAAI,eAAe,CAAC,GAAG,CAAC,EA
AE,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,OAAO,EAAE,CAAC,QAA4B,CAAC;IACzC,CAAC;IACD,2EAA2E;IAC3E,8EAA8E;IAC9E,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,MAAM,GAAG,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;IAC/C,OAAO,aAAa,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;AACzC,CAAC;AAED,0EAA0E;AAC1E,8EAA8E;AAC9E,6EAA6E;AAC7E,iFAAiF;AACjF,gFAAgF;AAChF,2EAA2E;AAC3E,6EAA6E;AAC7E,MAAM,UAAU,iBAAiB,CAC/B,IAAiB,EACjB,SAAiB,EACjB,SAAS,GAAG,QAAQ;IAEpB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,EAAE,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC;QACnC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,QAAQ,EAAE,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC;YAC5C,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5C,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAC3B,OAAO,EAAE,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YAChE,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,IAAkB,EAClB,OAAe;IAEf,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAClC,CAAC;AAED,8EAA8E;AAC9E,+EAA+E;AAC/E,mEAAmE;AACnE,MAAM,UAAU,oBAAoB,CAClC,SAAkB,EAClB,UAAyB,EACzB,YAAoB,EACpB,MAAY,IAAI,IAAI,EAAE;IAEtB,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU;QAAE,OAAO,MAAM,CAAC;IAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACtC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,QAAQ,GAAG,CAAC,GAAG,YAAY,GAAG,SAAS,CAAC;IAC9C,OAAO,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC;AAC7D,CAAC;AAgBD,MAAM,UAAU,iBAAiB,CAAC,CAAe;IAC/C,OAAO;QACL,MAAM,EAAE,oBAAoB,CAC1B,CAAC,CAAC,eAAe,EACjB,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,YAAY,EACd,CAAC,CAAC,GAAG,CACN;QACD,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,QAAQ,EAAE,CAAC,
CAAC,QAAQ;QACpB,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,MAAM,EAAE,EAAE,SAAS,EAAE,CAAC,CAAC,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC,WAAW,EAAE;QAC9D,OAAO,EAAE,CAAC,CAAC,OAAO;KACnB,CAAC;AACJ,CAAC"}
|
|
@@ -35,6 +35,17 @@ export function selectArticleEmbeddingTargets(rows) {
|
|
|
35
35
|
r.embedding === null &&
|
|
36
36
|
r.notePath !== null);
|
|
37
37
|
}
|
|
38
|
+
// PDF counterpart of the article selector. Per the original note this mirrors
// db.listPdfEmbeddingTargets's SQL filter (the exact complement of
// getPdfEmbeddings()): a row qualifies when it was neither skipped nor errored,
// has non-empty content, has a note_path, and its embedding is still NULL.
// PDFs have no archived column, so there is no archived gate here.
export function selectPdfEmbeddingTargets(rows) {
    const needsEmbedding = (row) => {
        if (row.skipped !== 0 || row.error !== null) {
            return false;
        }
        if (row.content === null || row.content === "") {
            return false;
        }
        return row.embedding === null && row.notePath !== null;
    };
    return rows.filter(needsEmbedding);
}
|
|
38
49
|
// Best-effort self-heal sweep. A write-time embedding miss (Ollama down when a
|
|
39
50
|
// note was distilled) leaves `embedding = NULL`, which makes the note invisible
|
|
40
51
|
// to the embedding-search path: it never enters getEmbeddings()'s candidate set,
|
|
@@ -52,12 +63,16 @@ export async function sweepEmbeddings(db) {
|
|
|
52
63
|
const targets = db.listEmbeddingTargets();
|
|
53
64
|
const topicTargets = db.listTopicEmbeddingTargets();
|
|
54
65
|
const articleTargets = db.listArticleEmbeddingTargets();
|
|
66
|
+
const pdfTargets = db.listPdfEmbeddingTargets();
|
|
55
67
|
if (!(await isOllamaAvailableCached())) {
|
|
56
68
|
return {
|
|
57
69
|
ran: false,
|
|
58
70
|
embedded: 0,
|
|
59
71
|
errors: 0,
|
|
60
|
-
pending: targets.length +
|
|
72
|
+
pending: targets.length +
|
|
73
|
+
topicTargets.length +
|
|
74
|
+
articleTargets.length +
|
|
75
|
+
pdfTargets.length,
|
|
61
76
|
};
|
|
62
77
|
}
|
|
63
78
|
let embedded = 0;
|
|
@@ -104,6 +119,21 @@ export async function sweepEmbeddings(db) {
|
|
|
104
119
|
db.storeArticleEmbedding(a.path, vec);
|
|
105
120
|
embedded += 1;
|
|
106
121
|
}
|
|
122
|
+
// PDFs heal the same way (their own table). A paper distilled while Ollama was
|
|
123
|
+
// down left `embedding` NULL; back-fill keyed by the source path (the PK) via
|
|
124
|
+
// storePdfEmbedding — shipped WITH the getPdfEmbeddings read path so the new
|
|
125
|
+
// pdfs table can't reopen the NULL-embedding blind spot (the 0.8.2/0.8.3 trap).
|
|
126
|
+
for (const p of pdfTargets) {
|
|
127
|
+
if (!p.content || p.content.trim().length === 0)
|
|
128
|
+
continue;
|
|
129
|
+
const vec = await embeddingForNote(p.content);
|
|
130
|
+
if (!vec) {
|
|
131
|
+
errors += 1;
|
|
132
|
+
continue;
|
|
133
|
+
}
|
|
134
|
+
db.storePdfEmbedding(p.path, vec);
|
|
135
|
+
embedded += 1;
|
|
136
|
+
}
|
|
107
137
|
return { ran: true, embedded, errors, pending: errors };
|
|
108
138
|
}
|
|
109
139
|
//# sourceMappingURL=embeddingSweep.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,
|
|
1
|
+
{"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,GAMhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,uBAAuB,CAAC;AAE/B,+EAA+E;AAC/E,qEAAqE;AACrE,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,WAAW;AACX,MAAM,UAAU,sBAAsB,CACpC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,QAAQ,KAAK,IAAI;QACnB,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,CAC1B,CAAC;AACJ,CAAC;AAED,yDAAyD;AACzD,2EAA2E;AAC3E,8EAA8E;AAC9E,oCAAoC;AACpC,MAAM,UAAU,2BAA2B,CACzC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,EAAE,CACtE,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,8EAA8E;AAC9E,6EAA6E;AAC7E,oEAAoE;AACpE,MAAM,UAAU,6BAA6B,CAE3C,IAAS;IACT,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAED,iFAAiF;AACjF,2EAA2E;AAC3E,iFAAiF;AACjF,MAAM,UAAU,yBAAyB,CACvC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAaD,+EAA+E;AAC/E,gFAAgF;AAChF,iFAAiF;AACjF,uEAAuE;AACvE,2EAA2E;AAC3E,gFAAgF;AAChF,2EAA2E;AAC3E,wCAAwC;AACxC,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6EAA6E;AAC7E,oCAAoC;AACpC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,EAAW;IAC/C,MAAM,OAAO,GAAG,EAAE,CAAC,oBAAoB,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,EAAE,CAAC,yBAAyB,EAAE,CAAC;IACpD,MAAM,cAAc,GAAG,EAAE,CAAC,2BAA2B,EAAE,CAAC;IACxD,MAAM,UAAU,GAAG,EAAE,CAAC,uBAAuB,EAAE,CAAC;IAChD,I
AAI,CAAC,CAAC,MAAM,uBAAuB,EAAE,CAAC,EAAE,CAAC;QACvC,OAAO;YACL,GAAG,EAAE,KAAK;YACV,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,CAAC;YACT,OAAO,EACL,OAAO,CAAC,MAAM;gBACd,YAAY,CAAC,MAAM;gBACnB,cAAc,CAAC,MAAM;gBACrB,UAAU,CAAC,MAAM;SACpB,CAAC;IACJ,CAAC;IACD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,cAAc,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAChD,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,0EAA0E;IAC1E,6EAA6E;IAC7E,2EAA2E;IAC3E,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,8EAA8E;IAC9E,6EAA6E;IAC7E,+EAA+E;IAC/E,8EAA8E;IAC9E,qCAAqC;IACrC,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACtC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,8EAA8E;IAC9E,6EAA6E;IAC7E,gFAAgF;IAChF,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,EAAE,GAAG,EAAE,IAA
I,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AAC1D,CAAC"}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { callLLM, maybeAnthropicClient, normalizeModelName, withRateLimitRetry, } from "./distiller.js";
|
|
4
|
+
import { scrub } from "./scrubber.js";
|
|
5
|
+
// Closed set of category labels accepted from the classifier; the classify
// prompt lists these same four values.
export const PDF_CATEGORIES = [
    "paper",
    "reference",
    "notes",
    "other",
];
// Neutral fallback when the model returns an unrecognized category: "other"
// claims the least about the document's intent.
const DEFAULT_PDF_CATEGORY = "other";
// Sub-directory that PDF note paths are built under (see pdfRelPath).
export const PDFS_SUBDIR = "pdfs";
// Only this many leading characters of the scrubbed body are embedded in the
// classify prompt.
const CLASSIFY_EXCERPT_CHARS = 3000;
// Papers are long → bound the distill input so a single paper can't blow up
// token cost (hybrid routing would otherwise push it to Sonnet on size).
// Mirrors articleDistiller's 24k-char bound.
const MAX_BODY_CHARS = 24_000;
|
|
20
|
+
/**
 * Classify and distill one parsed PDF through the configured LLM provider.
 *
 * The extracted text is scrubbed and truncated to MAX_BODY_CHARS before any
 * prompt is built. Two calls are made in sequence, each wrapped in
 * withRateLimitRetry: a classification call, then a distillation call.
 *
 * @param parsed Parsed PDF; `text` is read here, and the prompt builders read
 *   `title` and `filePath`.
 * @param cfg Run configuration; `models.classify`, `models.distill` and
 *   `provider` are read here.
 * @returns `{ classification, markdown }`, or `null` when the scrubbed body is
 *   blank or the model returns empty markdown.
 */
export async function distillPdf(parsed, cfg) {
    const client = maybeAnthropicClient(cfg);
    const { models, provider } = cfg;
    const classifyModel = normalizeModelName(models.classify, provider);
    const distillModel = normalizeModelName(models.distill, provider);
    // Extracted text still flows to a provider — scrub keys/paths/emails.
    const body = scrub(parsed.text).slice(0, MAX_BODY_CHARS);
    if (body.trim().length === 0) {
        return null;
    }
    // The request is rebuilt on every attempt, exactly as an inline closure would.
    const ask = (buildRequest) => withRateLimitRetry(() => callLLM(cfg, client, buildRequest()));
    const classifierReply = await ask(() => ({
        prompt: classifyPrompt(parsed, body),
        model: classifyModel,
        maxTokens: 200,
        cost: { stage: "pdf-classify" },
    }));
    const classification = parsePdfClassification(classifierReply);
    const distillReply = await ask(() => ({
        prompt: distillPrompt(parsed, body),
        model: distillModel,
        maxTokens: 1500,
        cost: { stage: "pdf-distill" },
    }));
    const markdown = distillReply.trim();
    return markdown.length === 0 ? null : { classification, markdown };
}
|
|
45
|
+
// Builds the classification prompt: asks for a JSON object with one of the
// four category labels plus a 0..1 confidence, then appends the document title
// and the first CLASSIFY_EXCERPT_CHARS characters of `body`.
// NOTE(review): whitespace inside the template is part of the prompt sent to
// the model — confirm against the original source before reformatting it.
function classifyPrompt(parsed, body) {
    return `Classify this PDF document into exactly one category. Output JSON only:
{ "category": "paper" | "reference" | "notes" | "other",
"confidence": number (0..1) }

paper = a research/academic paper (abstract, methods, results, contributions)
reference = documentation, a spec, manual, datasheet, or material to look up later
notes = lecture notes, slides, a course handout, or personal study notes
other = anything that doesn't fit the above

Title: ${parsed.title}

${body.slice(0, CLASSIFY_EXCERPT_CHARS)}`;
}
|
|
59
|
+
// Builds the distillation prompt: requests a markdown note with fixed section
// headings, states the paraphrase-only copyright rule, then appends the title,
// the source path and the full (already scrubbed and truncated) `body`.
// NOTE(review): whitespace inside the template is part of the prompt sent to
// the model — confirm against the original source before reformatting it.
function distillPrompt(parsed, body) {
    return `Distill this PDF document into a durable knowledge note. Output markdown
only — no preamble, start with '## Summary'. Use these sections:

- ## Summary (2-3 sentences, in your own words)
- ## Key Points (bullet list: the main claims, findings, methods, or arguments)
- ## Methods & Findings (for a paper: the approach and what it concluded; omit
this section entirely if the document isn't a study)
- ## Related (plain-English topics this connects to, one per bullet — these get
turned into wikilinks automatically, so write them as short noun phrases)

COPYRIGHT — strict: this is someone else's IP. Never reproduce more than 15
consecutive words verbatim from the source. Paraphrase everything in your own
words. Do not reproduce figures, tables, equations, or full passages. Summarize
and cite by context, never quote at length.

Title: ${parsed.title}
Source: ${parsed.filePath}
Document:
${body}`;
}
|
|
80
|
+
/**
 * Parse the classifier's reply into `{ category, confidence }`.
 *
 * Never throws: any reply that cannot be understood yields the neutral
 * `{ category: DEFAULT_PDF_CATEGORY, confidence: 0 }`.
 *
 * @param text Raw model reply; expected to contain a JSON object, possibly
 *   surrounded by extra prose. Non-string input yields the fallback.
 * @returns `category` is one of PDF_CATEGORIES (matched case-insensitively,
 *   surrounding whitespace ignored); `confidence` is clamped to [0, 1].
 */
export function parsePdfClassification(text) {
    const obj = typeof text === "string" ? parseFirstJsonObject(text) : null;
    if (obj === null) {
        return { category: DEFAULT_PDF_CATEGORY, confidence: 0 };
    }
    // Models often capitalize or pad the label ("Paper", " notes "); normalize
    // before checking membership so a valid answer isn't downgraded to "other".
    const rawCat = typeof obj.category === "string"
        ? obj.category.trim().toLowerCase()
        : "";
    const category = PDF_CATEGORIES.includes(rawCat)
        ? rawCat
        : DEFAULT_PDF_CATEGORY;
    // Accept numeric strings such as "0.8"; anything non-finite becomes 0.
    const confRaw = typeof obj.confidence === "number"
        ? obj.confidence
        : Number(obj.confidence ?? 0);
    const confidence = Number.isFinite(confRaw)
        ? Math.max(0, Math.min(1, confRaw))
        : 0;
    return { category, confidence };
}
// Returns the first JSON object that parses out of `text`, or null. The widest
// `{ … }` span is tried first; if trailing prose with stray braces makes that
// span invalid, each flat (non-nested) `{ … }` group is tried in order.
function parseFirstJsonObject(text) {
    const candidates = [];
    const widest = text.match(/\{[\s\S]*\}/);
    if (widest) {
        candidates.push(widest[0]);
    }
    candidates.push(...(text.match(/\{[^{}]*\}/g) ?? []));
    for (const candidate of candidates) {
        try {
            const value = JSON.parse(candidate);
            if (value !== null && typeof value === "object") {
                return value;
            }
        }
        catch {
            // Not valid JSON — fall through to the next candidate.
        }
    }
    return null;
}
|
|
103
|
+
// Slug = kebab-cased title (capped at 60 chars) + the first 8 hex chars of the
// SHA-256 of the source path. Because the suffix is keyed off the path rather
// than the content hash, a re-extracted PDF (new hash/text) keeps the same
// slug while its path is unchanged, so a re-distill overwrites the same note
// instead of orphaning the old one (mirrors articleSlug). A title with no
// ASCII letters or digits falls back to "pdf-<suffix>".
export function pdfSlug(parsed) {
    const stem = kebab(parsed.title).slice(0, 60);
    const pathDigest = createHash("sha256").update(parsed.filePath).digest("hex");
    const suffix = pathDigest.slice(0, 8);
    if (stem.length === 0) {
        return `pdf-${suffix}`;
    }
    return `${stem}-${suffix}`;
}
|
|
115
|
+
// Relative note path for a parsed PDF: "<PDFS_SUBDIR>/<slug>.md".
export function pdfRelPath(parsed) {
    const fileName = `${pdfSlug(parsed)}.md`;
    return join(PDFS_SUBDIR, fileName);
}
|
|
118
|
+
/**
 * Render the YAML frontmatter block for a distilled PDF note.
 *
 * @param parsed Parsed PDF; reads `filePath`, `title`, `pageCount`, `hash`.
 * @param distilled Distillation result; reads `classification.category` and
 *   `classification.confidence`.
 * @returns The frontmatter text, from the opening `---` through the closing
 *   `---` and a trailing newline. `distilled_at` is the current time in ISO
 *   format. Only `source_title` is quoted and escaped.
 */
export function buildPdfFrontmatter(parsed, distilled) {
    const { classification } = distilled;
    const frontmatter = [
        "---",
        "type: pdf",
        `category: ${classification.category}`,
        `source_path: ${parsed.filePath}`,
        `source_title: "${escapeYaml(parsed.title)}"`,
        `pages: ${parsed.pageCount}`,
        `distilled_at: ${new Date().toISOString()}`,
        `confidence: ${classification.confidence}`,
        `hash: ${parsed.hash}`,
        "---",
        "",
    ];
    return frontmatter.join("\n");
}
|
|
130
|
+
// Escapes a string for use inside a YAML double-quoted scalar on one line.
// Backslashes are escaped first (so an existing `\` can't combine with the
// quote escape or be read as an escape sequence), then double quotes; line
// breaks collapse to a single space so the value can't spill onto a second
// frontmatter line.
// NOTE(review): any reader that only un-escapes `\"` will see doubled
// backslashes for titles containing `\` — confirm against the frontmatter
// parsers in this package.
function escapeYaml(s) {
    return s
        .replace(/\\/g, "\\\\")
        .replace(/"/g, '\\"')
        .replace(/[\r\n]+/g, " ");
}
|
|
133
|
+
// Local copy of writer.kebab() — writer.ts depends on this module, so importing
// from it would create a cycle (mirrors articleDistiller.kebab).
// Lower-cases the input, turns every run of characters outside [a-z0-9] into a
// single "-", then strips leading and trailing dashes.
function kebab(s) {
    const lowered = s.toLowerCase();
    const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
    return dashed.replace(/^-+|-+$/g, "");
}
|
|
141
|
+
//# sourceMappingURL=pdfDistiller.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdfDistiller.js","sourceRoot":"","sources":["../../src/pipeline/pdfDistiller.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EACL,OAAO,EACP,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAMtC,MAAM,CAAC,MAAM,cAAc,GAAkB;IAC3C,OAAO;IACP,WAAW;IACX,OAAO;IACP,OAAO;CACR,CAAC;AAEF,4EAA4E;AAC5E,gDAAgD;AAChD,MAAM,oBAAoB,GAAgB,OAAO,CAAC;AAElD,MAAM,CAAC,MAAM,WAAW,GAAG,MAAM,CAAC;AAElC,4EAA4E;AAC5E,yEAAyE;AACzE,6CAA6C;AAC7C,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,cAAc,GAAG,MAAM,CAAC;AAY9B,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAiB,EACjB,GAAW;IAEX,MAAM,MAAM,GAAG,oBAAoB,CAAC,GAAG,CAAC,CAAC;IACzC,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE1E,sEAAsE;IACtE,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,cAAc,CAAC,CAAC;IACzD,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5C,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,cAAc,CAAC,MAAM,EAAE,IAAI,CAAC;QACpC,KAAK,EAAE,aAAa;QACpB,SAAS,EAAE,GAAG;QACd,IAAI,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE;KAChC,CAAC,CACH,CAAC;IACF,MAAM,cAAc,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;IAEvD,MAAM,QAAQ,GAAG,CACf,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5B,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,aAAa,CAAC,MAAM,EAAE,IAAI,CAAC;QACnC,KAAK,EAAE,YAAY;QACnB,SAAS,EAAE,IAAI;QACf,IAAI,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE;KAC/B,CAAC,CACH,CACF,CAAC,IAAI,EAAE,CAAC;IACT,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC;AACtC,CAAC;AAED,SAAS,cAAc,CAAC,MAAiB,EAAE,IAAY;IACrD,OAAO;;;;;;;;;SASA,MAAM,CAAC,KAAK;;EAEnB,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,sBAAsB,CAAC,EAAE,CAAC;AAC1C,CAAC;AAED,SAAS,aAAa,CAAC,MAAiB,EAAE,IAAY;IACpD,OAAO;;;;;;;;;;;;;;;SAeA,MAAM,CAAC,KAAK;UACX,MAAM,CAAC,QAAQ;;EAEvB,IAAI,EAAE,CAAC;AACT,CAAC;AAED,MAAM
,UAAU,sBAAsB,CAAC,IAAY;IACjD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IACxC,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IACrE,IAAI,GAA4B,CAAC;IACjC,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAA4B,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IACD,MAAM,MAAM,GAAG,OAAO,GAAG,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;IACpE,MAAM,QAAQ,GAAI,cAA2B,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5D,CAAC,CAAE,MAAsB;QACzB,CAAC,CAAC,oBAAoB,CAAC;IACzB,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,UAAU,KAAK,QAAQ;QAChC,CAAC,CAAC,GAAG,CAAC,UAAU;QAChB,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAClC,MAAM,UAAU,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC;QACzC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QACnC,CAAC,CAAC,CAAC,CAAC;IACN,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC;AAClC,CAAC;AAED,iFAAiF;AACjF,+EAA+E;AAC/E,iFAAiF;AACjF,6CAA6C;AAC7C,MAAM,UAAU,OAAO,CAAC,MAAiB;IACvC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC;SAChC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;SACvB,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACf,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC,OAAO,MAAM,EAAE,CAAC;AACjE,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,MAAiB;IAC1C,OAAO,IAAI,CAAC,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACpD,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,MAAiB,EACjB,SAAuB;IAEvB,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC7D,KAAK,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC9C,KAAK,CAAC,IAAI,CAAC,kBAAkB,UAAU,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,iBAAiB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,eAAe,SAAS,CAAC,cAAc,CAAC,UAAU,EAAE,CAAC,C
AAC;IACjE,KAAK,CAAC,IAAI,CAAC,SAAS,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACtB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;AAChC,CAAC;AAED,gFAAgF;AAChF,iEAAiE;AACjE,SAAS,KAAK,CAAC,CAAS;IACtB,OAAO,CAAC;SACL,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;AAC7B,CAAC"}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
import { readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
import { basename, join } from "node:path";
import { extractText, getDocumentProxy } from "unpdf";
|
|
5
|
+
/**
 * Read a PDF from disk and extract its text, page count, title and hash.
 *
 * @param filePath Path of the PDF to read.
 * @returns `{ filePath, hash, title, text, pageCount }`, where `hash` is the
 *   SHA-256 hex digest of the file bytes and `title` comes from pdfTitle().
 * @throws Whatever readFileSync, getDocumentProxy or extractText throw; the
 *   original note says the run layer catches these per file.
 */
export async function parsePdf(filePath) {
    const bytes = readFileSync(filePath);
    const hash = createHash("sha256").update(bytes).digest("hex");
    // unpdf bundles pdf.js; getDocumentProxy throws on corrupt/non-PDF bytes,
    // which the run layer catches per-file and records (never crashes the run).
    const proxy = await getDocumentProxy(new Uint8Array(bytes));
    try {
        const { totalPages, text } = await extractText(proxy, { mergePages: true });
        const metaTitle = await readMetadataTitle(proxy);
        return {
            filePath,
            hash,
            title: pdfTitle(metaTitle, filePath),
            // Defensive: join per-page text if an array comes back.
            text: Array.isArray(text) ? text.join("\n") : text,
            pageCount: totalPages,
        };
    }
    finally {
        // Release the pdf.js document whether extraction succeeded or not, so
        // a run over many PDFs doesn't keep every parsed document alive.
        try {
            await proxy.destroy();
        }
        catch {
            // Best-effort cleanup: a failed destroy must not mask the real
            // result or the original extraction error.
        }
    }
}
|
|
21
|
+
/**
 * Find every PDF under `dir` and hash its bytes.
 *
 * @param dir Root directory handed to walk().
 * @returns One `{ filePath, hash }` per readable PDF, in walk() order, where
 *   `hash` is the SHA-256 hex digest of the file bytes.
 */
export function scanPdfs(dir) {
    const pdfPaths = [];
    walk(dir, pdfPaths);
    return pdfPaths.flatMap((filePath) => {
        try {
            const hash = createHash("sha256").update(readFileSync(filePath)).digest("hex");
            return [{ filePath, hash }];
        }
        catch {
            // unreadable / mid-write file — skip, never fail the whole scan
            return [];
        }
    });
}
|
|
36
|
+
// Pure: a non-blank metadata Title (trimmed) wins; otherwise the file's base
// name with a trailing ".pdf" (any case) removed.
export function pdfTitle(metaTitle, filePath) {
    const fromMetadata = (metaTitle ?? "").trim();
    return fromMetadata.length > 0
        ? fromMetadata
        : basename(filePath).replace(/\.pdf$/i, "");
}
|
|
43
|
+
// Reads `info.Title` from the document metadata. Resolves to the title only
// when it is a string; a missing title, a non-string title, or any failure
// while fetching the metadata all resolve to undefined.
async function readMetadataTitle(proxy) {
    try {
        const { info } = (await proxy.getMetadata()) ?? {};
        const candidate = info?.Title;
        if (typeof candidate === "string") {
            return candidate;
        }
    }
    catch {
        // Metadata is optional — the caller falls back to the filename.
    }
    return undefined;
}
|
|
53
|
+
/**
 * Recursively collect the paths of `.pdf` files (extension matched
 * case-insensitively) under `dir` into `acc`.
 *
 * Unreadable directories and entries that cannot be stat'ed are skipped
 * silently. statSync follows symlinks, so linked directories are still
 * traversed, but each real directory is visited only once: without this, a
 * symlink cycle (e.g. `sub/loop -> ..`) re-walks the same tree until the OS
 * refuses the path, and reports the same PDFs under many aliased paths.
 *
 * @param dir Directory to scan.
 * @param acc Array that matching file paths are pushed onto.
 * @param seen Resolved paths of directories already visited; callers normally
 *   omit it.
 */
function walk(dir, acc, seen = new Set()) {
    let realDir;
    try {
        realDir = realpathSync(dir);
    }
    catch {
        return;
    }
    if (seen.has(realDir)) {
        return;
    }
    seen.add(realDir);
    let entries;
    try {
        entries = readdirSync(dir);
    }
    catch {
        return;
    }
    for (const name of entries) {
        const full = join(dir, name);
        let st;
        try {
            st = statSync(full);
        }
        catch {
            // Broken symlink or entry removed mid-scan — ignore it.
            continue;
        }
        if (st.isDirectory()) {
            walk(full, acc, seen);
        }
        else if (st.isFile() && name.toLowerCase().endsWith(".pdf")) {
            acc.push(full);
        }
    }
}
|
|
76
|
+
//# sourceMappingURL=pdfReader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdfReader.js","sourceRoot":"","sources":["../../src/pipeline/pdfReader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC3C,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,MAAM,OAAO,CAAC;AAwBtD,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,QAAgB;IAC7C,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAE9D,0EAA0E;IAC1E,4EAA4E;IAC5E,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5D,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5E,MAAM,SAAS,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,CAAC;IAEjD,OAAO;QACL,QAAQ;QACR,IAAI;QACJ,KAAK,EAAE,QAAQ,CAAC,SAAS,EAAE,QAAQ,CAAC;QACpC,IAAI,EAAE,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI;QAClD,SAAS,EAAE,UAAU;KACtB,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACjB,MAAM,GAAG,GAAgB,EAAE,CAAC;IAC5B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACxE,GAAG,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;QAAC,MAAM,CAAC;YACP,gEAAgE;QAClE,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,gFAAgF;AAChF,MAAM,UAAU,QAAQ,CAAC,SAA6B,EAAE,QAAgB;IACtE,MAAM,CAAC,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACnC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC3B,OAAO,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACnD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,KAAmD;IAEnD,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,WAAW,EAAE,CAEtC,CAAC;QACF,MAAM,KAAK,GAAG,IAAI,EAAE,IAAI,EAAE,KAAK,CAAC;QAChC,OAAO,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB
,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,GAAW,EAAE,GAAa;IACtC,IAAI,OAAiB,CAAC;IACtB,IAAI,CAAC;QACH,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;IACT,CAAC;IACD,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QAC7B,IAAI,EAAE,CAAC;QACP,IAAI,CAAC;YACH,EAAE,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QACD,IAAI,EAAE,CAAC,WAAW,EAAE;YAAE,IAAI,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;aACjC,IAAI,EAAE,CAAC,MAAM,EAAE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC"}
|
package/dist/pipeline/run.js
CHANGED
|
@@ -10,6 +10,8 @@ import { parseSession } from "./parser.js";
|
|
|
10
10
|
import { scanSessions } from "./scanner.js";
|
|
11
11
|
import { scanArticles } from "./articleReader.js";
|
|
12
12
|
import { distillArticle } from "./articleDistiller.js";
|
|
13
|
+
import { parsePdf, scanPdfs } from "./pdfReader.js";
|
|
14
|
+
import { distillPdf } from "./pdfDistiller.js";
|
|
13
15
|
import { scrub } from "./scrubber.js";
|
|
14
16
|
import { summarizeProject } from "./summarizer.js";
|
|
15
17
|
import { filterToolCalls } from "./toolCallFilter.js";
|
|
@@ -32,6 +34,10 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
32
34
|
articlesDistilled: 0,
|
|
33
35
|
articlesSkipped: 0,
|
|
34
36
|
articlesErrored: 0,
|
|
37
|
+
pdfsScanned: 0,
|
|
38
|
+
pdfsDistilled: 0,
|
|
39
|
+
pdfsSkipped: 0,
|
|
40
|
+
pdfsErrored: 0,
|
|
35
41
|
};
|
|
36
42
|
const interactive = !opts.quiet;
|
|
37
43
|
// File-only logging — used for the daemon run.log regardless of UI mode.
|
|
@@ -52,9 +58,11 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
52
58
|
? "run --rewrite-only"
|
|
53
59
|
: opts.articlesOnly
|
|
54
60
|
? "run --articles-only"
|
|
55
|
-
: opts.
|
|
56
|
-
? "run --
|
|
57
|
-
:
|
|
61
|
+
: opts.pdfsOnly
|
|
62
|
+
? "run --pdfs-only"
|
|
63
|
+
: opts.full
|
|
64
|
+
? "run --full"
|
|
65
|
+
: "run");
|
|
58
66
|
ui.blank();
|
|
59
67
|
}
|
|
60
68
|
fileLog(`vir run start (full=${opts.full ? "true" : "false"} rewriteOnly=${opts.rewriteOnly ? "true" : "false"})`);
|
|
@@ -147,6 +155,34 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
147
155
|
db.close();
|
|
148
156
|
return summary;
|
|
149
157
|
}
|
|
158
|
+
// --pdfs-only: skip the session AND article pipelines.
|
|
159
|
+
if (opts.pdfsOnly) {
|
|
160
|
+
if (!cfg.pdfsDir) {
|
|
161
|
+
if (interactive) {
|
|
162
|
+
ui.row(ui.warn(ui.WARN_GLYPH), ui.text("pdfsDir is not set — nothing to distill"));
|
|
163
|
+
}
|
|
164
|
+
fileLog("pdfs-only run but pdfsDir is unset");
|
|
165
|
+
db.close();
|
|
166
|
+
return summary;
|
|
167
|
+
}
|
|
168
|
+
await runPdfPhase(cfg, db, writer, summary, fileLog, interactive);
|
|
169
|
+
if (interactive) {
|
|
170
|
+
ui.blank();
|
|
171
|
+
ui.divider();
|
|
172
|
+
ui.summary({
|
|
173
|
+
pdfs: { value: summary.pdfsScanned, color: ui.info },
|
|
174
|
+
distilled: { value: summary.pdfsDistilled, color: ui.success },
|
|
175
|
+
skipped: { value: summary.pdfsSkipped, color: ui.warn },
|
|
176
|
+
errored: {
|
|
177
|
+
value: summary.pdfsErrored,
|
|
178
|
+
color: summary.pdfsErrored > 0 ? ui.errorColor : ui.dim,
|
|
179
|
+
},
|
|
180
|
+
});
|
|
181
|
+
ui.divider();
|
|
182
|
+
}
|
|
183
|
+
db.close();
|
|
184
|
+
return summary;
|
|
185
|
+
}
|
|
150
186
|
const distiller = new Distiller(cfg, {
|
|
151
187
|
forceDistillModel: opts.forceDistillModel,
|
|
152
188
|
});
|
|
@@ -193,7 +229,8 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
193
229
|
// the end-of-run sweep back-fills. The sweep heals them when Ollama is up.
|
|
194
230
|
const pendingEmbedding = db.listEmbeddingTargets().length +
|
|
195
231
|
db.listTopicEmbeddingTargets().length +
|
|
196
|
-
db.listArticleEmbeddingTargets().length;
|
|
232
|
+
db.listArticleEmbeddingTargets().length +
|
|
233
|
+
db.listPdfEmbeddingTargets().length;
|
|
197
234
|
if (interactive) {
|
|
198
235
|
ui.line(ui.dim(` ${discovered.length} files found · ${cached} cached · ${preflightNew} new` +
|
|
199
236
|
(pendingEmbedding > 0
|
|
@@ -260,6 +297,20 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
260
297
|
ui.divider();
|
|
261
298
|
ui.line(ui.dim(" estimates assume typical output sizes; actuals may vary ±30%"));
|
|
262
299
|
}
|
|
300
|
+
// PDFs are estimated separately: papers exceed the 24k-char distill cap, so
|
|
301
|
+
// the per-PDF input is the cap (an accurate figure, not just an upper bound).
|
|
302
|
+
// No text extraction here — only count new PDFs by their cheap byte hash.
|
|
303
|
+
if (cfg.pdfsDir && cfg.distillPdfs) {
|
|
304
|
+
const newPdfs = scanPdfs(cfg.pdfsDir).filter((f) => opts.full || !db.isPdfProcessed(f.filePath, f.hash)).length;
|
|
305
|
+
if (newPdfs > 0) {
|
|
306
|
+
const perPdf = computeCost(cfg.provider, classifyModel, Math.ceil(3000 / CHARS_PER_TOKEN), 200, cfg.pricing, cfg.kieTopUpTier) +
|
|
307
|
+
computeCost(cfg.provider, distillModel, Math.ceil(24_000 / CHARS_PER_TOKEN), 1500, cfg.pricing, cfg.kieTopUpTier);
|
|
308
|
+
if (interactive) {
|
|
309
|
+
ui.line(ui.dim(` ${newPdfs} new PDF(s): ~${ui.formatUsd(perPdf)} each (input capped at 24k chars) → ~${ui.formatUsd(perPdf * newPdfs)}`));
|
|
310
|
+
}
|
|
311
|
+
fileLog(`dry-run: newPdfs=${newPdfs} perPdf=${ui.formatUsd(perPdf)} estTotal=${ui.formatUsd(perPdf * newPdfs)}`);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
263
314
|
fileLog(`dry-run: sessions=${estimated} filtered=${filteredOut} estTotal=${ui.formatUsd(totalCost)}`);
|
|
264
315
|
db.close();
|
|
265
316
|
return summary;
|
|
@@ -384,6 +435,11 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
384
435
|
if (cfg.articlesDir && cfg.distillArticles) {
|
|
385
436
|
await runArticlePhase(cfg, db, writer, summary, fileLog, interactive);
|
|
386
437
|
}
|
|
438
|
+
// Third input source: PDFs / papers. Gated identically; an install without
|
|
439
|
+
// pdfsDir skips this entirely (the article pattern, cloned).
|
|
440
|
+
if (cfg.pdfsDir && cfg.distillPdfs) {
|
|
441
|
+
await runPdfPhase(cfg, db, writer, summary, fileLog, interactive);
|
|
442
|
+
}
|
|
387
443
|
// Self-heal: back-fill notes whose write-time embedding silently no-op'd
|
|
388
444
|
// (Ollama down during distill). Without this a transient outage is permanent
|
|
389
445
|
// — the note never enters the embedding-search candidate set. Best-effort and
|
|
@@ -411,7 +467,7 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
411
467
|
catch (err) {
|
|
412
468
|
fileLog(`embedding sweep failed: ${err.message}`);
|
|
413
469
|
}
|
|
414
|
-
fileLog(`vir run done — scanned=${summary.scanned} new=${summary.scanned - summary.alreadyProcessed} distilled=${summary.distilled} skipped=${summary.skippedByFilter} lowConf=${summary.lowConfidence} errored=${summary.errored} articles=${summary.articlesDistilled}`);
|
|
470
|
+
fileLog(`vir run done — scanned=${summary.scanned} new=${summary.scanned - summary.alreadyProcessed} distilled=${summary.distilled} skipped=${summary.skippedByFilter} lowConf=${summary.lowConfidence} errored=${summary.errored} articles=${summary.articlesDistilled} pdfs=${summary.pdfsDistilled}`);
|
|
415
471
|
if (interactive) {
|
|
416
472
|
ui.blank();
|
|
417
473
|
ui.divider();
|
|
@@ -431,6 +487,9 @@ export async function runPipeline(cfg, opts = {}) {
|
|
|
431
487
|
if (cfg.articlesDir && cfg.distillArticles) {
|
|
432
488
|
stats.articles = { value: summary.articlesDistilled, color: ui.success };
|
|
433
489
|
}
|
|
490
|
+
if (cfg.pdfsDir && cfg.distillPdfs) {
|
|
491
|
+
stats.pdfs = { value: summary.pdfsDistilled, color: ui.success };
|
|
492
|
+
}
|
|
434
493
|
ui.summary(stats);
|
|
435
494
|
ui.divider();
|
|
436
495
|
}
|
|
@@ -522,6 +581,93 @@ async function runArticlePhase(cfg, db, writer, summary, fileLog, interactive) {
|
|
|
522
581
|
}
|
|
523
582
|
}
|
|
524
583
|
}
|
|
584
|
+
// Third input source: PDFs / papers. Mirrors runArticlePhase, but scanPdfs
// returns cheap {filePath, hash} entries (PDF text extraction is expensive via
// pdf.js) and only files that aren't already processed get parsed — instead of
// extracting the whole directory up front. Each PDF is hashed for idempotency
// and wrapped in its own try/catch so one bad file never aborts the run.
//
// Parameters:
//   cfg         - run configuration; only cfg.pdfsDir is read here, the whole
//                 object is handed to distillPdf.
//   db          - state store; uses isPdfProcessed(filePath, hash) and
//                 recordPdf(row).
//   writer      - note writer; uses writePdf(parsed, distilled), whose awaited
//                 result is stored as the note path.
//   summary     - mutated in place: pdfsScanned, pdfsDistilled, pdfsSkipped,
//                 pdfsErrored and notesWritten.
//   fileLog     - function(message) used for file-only logging.
//   interactive - when truthy, progress is also rendered through `ui`.
//
// Resolves to undefined. Scan failures and per-PDF failures are logged and
// counted rather than rethrown.
async function runPdfPhase(cfg, db, writer, summary, fileLog, interactive) {
    if (!cfg.pdfsDir)
        return;
    const scanSpinner = interactive ? ui.spinner("scanning pdfs").start() : null;
    let sources;
    try {
        sources = scanPdfs(cfg.pdfsDir);
    }
    catch (err) {
        if (scanSpinner)
            scanSpinner.fail(ui.errorColor("pdf scan failed"));
        // `err` may be any thrown value (including null/undefined), so never
        // dereference it unguarded.
        fileLog(`pdf scan failed: ${err?.message ?? String(err)}`);
        return;
    }
    summary.pdfsScanned = sources.length;
    if (scanSpinner) {
        scanSpinner.succeed(ui.text(`scanned ${ui.info(String(sources.length))} ${ui.dim("pdfs")}`));
    }
    fileLog(`scanned ${sources.length} pdfs`);
    for (const src of sources) {
        try {
            // Same path + same hash already recorded: nothing to do.
            if (db.isPdfProcessed(src.filePath, src.hash))
                continue;
            // Extraction is heavy and only happens for new files (gated above).
            const parsed = await parsePdf(src.filePath);
            const distilled = await distillPdf(parsed, cfg);
            if (!distilled) {
                // A falsy distill result is counted as "skipped" and still
                // recorded, so the file is not picked up again next run.
                summary.pdfsSkipped += 1;
                db.recordPdf({
                    path: parsed.filePath,
                    hash: parsed.hash,
                    skipped: true,
                });
                continue;
            }
            const notePath = await writer.writePdf(parsed, distilled);
            summary.pdfsDistilled += 1;
            summary.notesWritten.push(notePath);
            db.recordPdf({
                path: parsed.filePath,
                hash: parsed.hash,
                skipped: false,
                notePath,
                content: distilled.markdown,
                category: distilled.classification.category,
                title: parsed.title,
                pages: parsed.pageCount,
                confidence: distilled.classification.confidence,
                distilledAt: new Date().toISOString(),
            });
            if (interactive) {
                ui.categoryRow(distilled.classification.category, parsed.title);
            }
            fileLog(`distilled pdf → ${distilled.classification.category}/${parsed.title}`);
            // Only high-confidence classifications trigger a notification.
            if (distilled.classification.confidence >= 0.8) {
                notify(`Vir — new ${distilled.classification.category}`, parsed.title);
            }
            // Fixed 2s pause after each distilled PDF.
            // NOTE(review): presumably paces provider calls — confirm.
            await new Promise((r) => setTimeout(r, 2000));
        }
        catch (err) {
            summary.pdfsErrored += 1;
            // Guarded access: a nullish thrown value must not raise a second
            // error here, which would escape this handler and abort the phase.
            const msg = err?.message ?? String(err);
            if (interactive) {
                ui.row(ui.errorColor(ui.CROSS), ui.text(`pdf error: ${msg}`));
            }
            fileLog(`error on pdf ${src.filePath}: ${msg}`);
            try {
                // Record with the source hash so a corrupt PDF isn't retried every run
                // (same idempotency contract as articles).
                db.recordPdf({
                    path: src.filePath,
                    hash: src.hash,
                    skipped: false,
                    error: msg,
                });
            }
            catch (recordErr) {
                // Best-effort bookkeeping: keep going, but leave a trace so a
                // failing state store is diagnosable from the run log.
                fileLog(`failed to record pdf error for ${src.filePath}: ${recordErr?.message ?? String(recordErr)}`);
            }
        }
    }
}
|
|
525
671
|
async function rewriteOne(writer, row) {
|
|
526
672
|
const parsed = {
|
|
527
673
|
path: row.path,
|