@djolex999/vir-cli 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ const CATEGORY_DIRS = {
15
15
  tools: "tool",
16
16
  articles: "article",
17
17
  topics: "topic",
18
+ pdfs: "pdf",
18
19
  };
19
20
  const WIRE_CATEGORIES = new Set([
20
21
  "pattern",
@@ -23,6 +24,7 @@ const WIRE_CATEGORIES = new Set([
23
24
  "tool",
24
25
  "article",
25
26
  "topic",
27
+ "pdf",
26
28
  ]);
27
29
  // Minimal YAML-block parser, kebab-flat. Mirrors mcp/server.ts deliberately —
28
30
  // the JSON contract is its own isolated surface and must not couple to the MCP
@@ -58,6 +60,10 @@ function categoryOf(fm, relPath, topicsDir) {
58
60
  return "article";
59
61
  if (fm.type === "topic")
60
62
  return "topic";
63
+ // PDF sub-taxonomy (paper/reference/notes/other) collapses to the single
64
+ // "pdf" wire bucket, exactly like articles collapse to "article".
65
+ if (fm.type === "pdf")
66
+ return "pdf";
61
67
  if (fm.category && WIRE_CATEGORIES.has(fm.category)) {
62
68
  return fm.category;
63
69
  }
@@ -1 +1 @@
1
- {"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AA2CrC,+EAA+E;AAC/E,4EAA4E;AAC5E,+EAA+E;AAC/E,MAAM,aAAa,GAAqC;IACtD,QAAQ,EAAE,SAAS;IACnB,OAAO,EAAE,QAAQ;IACjB,SAAS,EAAE,UAAU;IACrB,KAAK,EAAE,MAAM;IACb,QAAQ,EAAE,SAAS;IACnB,MAAM,EAAE,OAAO;CAChB,CAAC;AAEF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAS;IACtC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,OAAO;CACR,CAAC,CAAC;AAEH,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACrB,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,KAAK,CAAC,CAAC;YAAE,SAAS;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC/B,IAAI,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,IACE,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC1C,CAAC;YACD,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,OAAO,CAAC,OAAe;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;IAC3D,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU,CACjB,EAA0B,EAC1B,OAAe,EACf,SAAiB;IAEjB,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO;QAAE,OAAO,OAAO,CAAC;IACxC,IAAI,EAAE,CAAC,QAAQ,IAAI,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,OAAO,EAAE,CAAC,QAA4B,CAAC;IACzC,CAAC;IACD,2EAA2E;IAC3E,8EAA8E;IAC9E,MAAM,GA
AG,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,MAAM,GAAG,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;IAC/C,OAAO,aAAa,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;AACzC,CAAC;AAED,0EAA0E;AAC1E,8EAA8E;AAC9E,6EAA6E;AAC7E,iFAAiF;AACjF,gFAAgF;AAChF,2EAA2E;AAC3E,6EAA6E;AAC7E,MAAM,UAAU,iBAAiB,CAC/B,IAAiB,EACjB,SAAiB,EACjB,SAAS,GAAG,QAAQ;IAEpB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,EAAE,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC;QACnC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,QAAQ,EAAE,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC;YAC5C,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5C,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAC3B,OAAO,EAAE,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YAChE,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,IAAkB,EAClB,OAAe;IAEf,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAClC,CAAC;AAED,8EAA8E;AAC9E,+EAA+E;AAC/E,mEAAmE;AACnE,MAAM,UAAU,oBAAoB,CAClC,SAAkB,EAClB,UAAyB,EACzB,YAAoB,EACpB,MAAY,IAAI,IAAI,EAAE;IAEtB,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU;QAAE,OAAO,MAAM,CAAC;IAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACtC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,QAAQ,GAAG,CAAC,GAAG,YAAY,GAAG,SAAS,CAAC;IAC9C,OAAO,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC;AAC7D,CAAC;AAgBD,MAAM,UAAU,iBAAiB,CAAC,CAAe;IAC/C,OAAO;QACL,MAAM,EAAE,oBAAoB,CAC1B,CAAC,CAAC,eAAe,EACjB,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,YAAY,EACd,CAAC,CAAC,GAAG,CACN;QACD,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,QAAQ,EAAE,CAAC,CAAC,QAAQ;QACpB,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,MAAM,EAAE,EAAE,SAAS,EAAE,CAAC,C
AAC,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC,WAAW,EAAE;QAC9D,OAAO,EAAE,CAAC,CAAC,OAAO;KACnB,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AA4CrC,+EAA+E;AAC/E,4EAA4E;AAC5E,+EAA+E;AAC/E,MAAM,aAAa,GAAqC;IACtD,QAAQ,EAAE,SAAS;IACnB,OAAO,EAAE,QAAQ;IACjB,SAAS,EAAE,UAAU;IACrB,KAAK,EAAE,MAAM;IACb,QAAQ,EAAE,SAAS;IACnB,MAAM,EAAE,OAAO;IACf,IAAI,EAAE,KAAK;CACZ,CAAC;AAEF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAS;IACtC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAEH,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACrB,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,KAAK,CAAC,CAAC;YAAE,SAAS;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC/B,IAAI,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,IACE,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC1C,CAAC;YACD,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,OAAO,CAAC,OAAe;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;IAC3D,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU,CACjB,EAA0B,EAC1B,OAAe,EACf,SAAiB;IAEjB,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO;QAAE,OAAO,OAAO,CAAC;IACxC,yEAAyE;IACzE,kEAAkE;IAClE,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;QAAE,OAAO,KAAK,CAAC;IACpC,IAAI,EAAE,CAAC,QAAQ,IAAI,eAAe,CAAC,GAAG,CAAC,
EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,OAAO,EAAE,CAAC,QAA4B,CAAC;IACzC,CAAC;IACD,2EAA2E;IAC3E,8EAA8E;IAC9E,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,MAAM,GAAG,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;IAC/C,OAAO,aAAa,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;AACzC,CAAC;AAED,0EAA0E;AAC1E,8EAA8E;AAC9E,6EAA6E;AAC7E,iFAAiF;AACjF,gFAAgF;AAChF,2EAA2E;AAC3E,6EAA6E;AAC7E,MAAM,UAAU,iBAAiB,CAC/B,IAAiB,EACjB,SAAiB,EACjB,SAAS,GAAG,QAAQ;IAEpB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,EAAE,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC;QACnC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,QAAQ,EAAE,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC;YAC5C,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5C,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAC3B,OAAO,EAAE,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YAChE,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,IAAkB,EAClB,OAAe;IAEf,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAClC,CAAC;AAED,8EAA8E;AAC9E,+EAA+E;AAC/E,mEAAmE;AACnE,MAAM,UAAU,oBAAoB,CAClC,SAAkB,EAClB,UAAyB,EACzB,YAAoB,EACpB,MAAY,IAAI,IAAI,EAAE;IAEtB,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU;QAAE,OAAO,MAAM,CAAC;IAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACtC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,QAAQ,GAAG,CAAC,GAAG,YAAY,GAAG,SAAS,CAAC;IAC9C,OAAO,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC;AAC7D,CAAC;AAgBD,MAAM,UAAU,iBAAiB,CAAC,CAAe;IAC/C,OAAO;QACL,MAAM,EAAE,oBAAoB,CAC1B,CAAC,CAAC,eAAe,EACjB,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,YAAY,EACd,CAAC,CAAC,GAAG,CACN;QACD,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,QAAQ,EAAE,CAA
C,CAAC,QAAQ;QACpB,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,MAAM,EAAE,EAAE,SAAS,EAAE,CAAC,CAAC,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC,WAAW,EAAE;QAC9D,OAAO,EAAE,CAAC,CAAC,OAAO;KACnB,CAAC;AACJ,CAAC"}
@@ -35,6 +35,17 @@ export function selectArticleEmbeddingTargets(rows) {
35
35
  r.embedding === null &&
36
36
  r.notePath !== null);
37
37
  }
38
// In-memory twin of db.listPdfEmbeddingTargets's SQL filter, i.e. the exact
// complement of getPdfEmbeddings(). A row is a back-fill target only when it was
// not skipped, has no error, carries non-empty content, has a note_path, and its
// embedding is still NULL. Same gates as articles; PDFs have no archived column.
export function selectPdfEmbeddingTargets(rows) {
  const needsEmbedding = (row) => {
    if (row.skipped !== 0 || row.error !== null) return false;
    if (row.content === null || row.content === "") return false;
    return row.embedding === null && row.notePath !== null;
  };
  return rows.filter(needsEmbedding);
}
38
49
  // Best-effort self-heal sweep. A write-time embedding miss (Ollama down when a
39
50
  // note was distilled) leaves `embedding = NULL`, which makes the note invisible
40
51
  // to the embedding-search path: it never enters getEmbeddings()'s candidate set,
@@ -52,12 +63,16 @@ export async function sweepEmbeddings(db) {
52
63
  const targets = db.listEmbeddingTargets();
53
64
  const topicTargets = db.listTopicEmbeddingTargets();
54
65
  const articleTargets = db.listArticleEmbeddingTargets();
66
+ const pdfTargets = db.listPdfEmbeddingTargets();
55
67
  if (!(await isOllamaAvailableCached())) {
56
68
  return {
57
69
  ran: false,
58
70
  embedded: 0,
59
71
  errors: 0,
60
- pending: targets.length + topicTargets.length + articleTargets.length,
72
+ pending: targets.length +
73
+ topicTargets.length +
74
+ articleTargets.length +
75
+ pdfTargets.length,
61
76
  };
62
77
  }
63
78
  let embedded = 0;
@@ -104,6 +119,21 @@ export async function sweepEmbeddings(db) {
104
119
  db.storeArticleEmbedding(a.path, vec);
105
120
  embedded += 1;
106
121
  }
122
+ // PDFs heal the same way (their own table). A paper distilled while Ollama was
123
+ // down left `embedding` NULL; back-fill keyed by the source path (the PK) via
124
+ // storePdfEmbedding — shipped WITH the getPdfEmbeddings read path so the new
125
+ // pdfs table can't reopen the NULL-embedding blind spot (the 0.8.2/0.8.3 trap).
126
+ for (const p of pdfTargets) {
127
+ if (!p.content || p.content.trim().length === 0)
128
+ continue;
129
+ const vec = await embeddingForNote(p.content);
130
+ if (!vec) {
131
+ errors += 1;
132
+ continue;
133
+ }
134
+ db.storePdfEmbedding(p.path, vec);
135
+ embedded += 1;
136
+ }
107
137
  return { ran: true, embedded, errors, pending: errors };
108
138
  }
109
139
  //# sourceMappingURL=embeddingSweep.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,GAKhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,uBAAuB,CAAC;AAE/B,+EAA+E;AAC/E,qEAAqE;AACrE,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,WAAW;AACX,MAAM,UAAU,sBAAsB,CACpC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,QAAQ,KAAK,IAAI;QACnB,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,CAC1B,CAAC;AACJ,CAAC;AAED,yDAAyD;AACzD,2EAA2E;AAC3E,8EAA8E;AAC9E,oCAAoC;AACpC,MAAM,UAAU,2BAA2B,CACzC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,EAAE,CACtE,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,8EAA8E;AAC9E,6EAA6E;AAC7E,oEAAoE;AACpE,MAAM,UAAU,6BAA6B,CAE3C,IAAS;IACT,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAaD,+EAA+E;AAC/E,gFAAgF;AAChF,iFAAiF;AACjF,uEAAuE;AACvE,2EAA2E;AAC3E,gFAAgF;AAChF,2EAA2E;AAC3E,wCAAwC;AACxC,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6EAA6E;AAC7E,oCAAoC;AACpC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,EAAW;IAC/C,MAAM,OAAO,GAAG,EAAE,CAAC,oBAAoB,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,EAAE,CAAC,yBAAyB,EAAE,CAAC;IACpD,MAAM,cAAc,GAAG,EAAE,CAAC,2BAA2B,EAAE,CAAC;IACxD,IAAI,CAAC,CAAC,MAAM,uBAAuB,EAAE,CAAC,EAAE,CAAC;QACvC,OAAO;YACL,GAAG,EAAE,KAAK;YACV,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,CAAC;YACT,OAAO,EAAE,OAAO,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,GAAG,cAAc,CAAC,MAAM;SACtE,CAAC;IACJ,CAAC;IACD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE
,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,cAAc,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAChD,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,0EAA0E;IAC1E,6EAA6E;IAC7E,2EAA2E;IAC3E,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,8EAA8E;IAC9E,6EAA6E;IAC7E,+EAA+E;IAC/E,8EAA8E;IAC9E,qCAAqC;IACrC,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACtC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AAC1D,CAAC"}
1
+ {"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,GAMhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,uBAAuB,CAAC;AAE/B,+EAA+E;AAC/E,qEAAqE;AACrE,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,WAAW;AACX,MAAM,UAAU,sBAAsB,CACpC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,QAAQ,KAAK,IAAI;QACnB,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,CAC1B,CAAC;AACJ,CAAC;AAED,yDAAyD;AACzD,2EAA2E;AAC3E,8EAA8E;AAC9E,oCAAoC;AACpC,MAAM,UAAU,2BAA2B,CACzC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,EAAE,CACtE,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,8EAA8E;AAC9E,6EAA6E;AAC7E,oEAAoE;AACpE,MAAM,UAAU,6BAA6B,CAE3C,IAAS;IACT,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAED,iFAAiF;AACjF,2EAA2E;AAC3E,iFAAiF;AACjF,MAAM,UAAU,yBAAyB,CACvC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAaD,+EAA+E;AAC/E,gFAAgF;AAChF,iFAAiF;AACjF,uEAAuE;AACvE,2EAA2E;AAC3E,gFAAgF;AAChF,2EAA2E;AAC3E,wCAAwC;AACxC,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6EAA6E;AAC7E,oCAAoC;AACpC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,EAAW;IAC/C,MAAM,OAAO,GAAG,EAAE,CAAC,oBAAoB,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,EAAE,CAAC,yBAAyB,EAAE,CAAC;IACpD,MAAM,cAAc,GAAG,EAAE,CAAC,2BAA2B,EAAE,CAAC;IACxD,MAAM,UAAU,GAAG,EAAE,CAAC,uBAAuB,EAAE,CAAC;IAChD
,IAAI,CAAC,CAAC,MAAM,uBAAuB,EAAE,CAAC,EAAE,CAAC;QACvC,OAAO;YACL,GAAG,EAAE,KAAK;YACV,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,CAAC;YACT,OAAO,EACL,OAAO,CAAC,MAAM;gBACd,YAAY,CAAC,MAAM;gBACnB,cAAc,CAAC,MAAM;gBACrB,UAAU,CAAC,MAAM;SACpB,CAAC;IACJ,CAAC;IACD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,cAAc,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAChD,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,0EAA0E;IAC1E,6EAA6E;IAC7E,2EAA2E;IAC3E,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,8EAA8E;IAC9E,6EAA6E;IAC7E,+EAA+E;IAC/E,8EAA8E;IAC9E,qCAAqC;IACrC,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACtC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,8EAA8E;IAC9E,6EAA6E;IAC7E,gFAAgF;IAChF,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,EAAE,GAAG,EAAE,I
AAI,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AAC1D,CAAC"}
@@ -0,0 +1,141 @@
1
+ import { createHash } from "node:crypto";
2
+ import { join } from "node:path";
3
+ import { callLLM, maybeAnthropicClient, normalizeModelName, withRateLimitRetry, } from "./distiller.js";
4
+ import { scrub } from "./scrubber.js";
5
// Closed set of PDF sub-categories the classifier is asked to choose from.
// parsePdfClassification() maps anything outside this list to
// DEFAULT_PDF_CATEGORY.
export const PDF_CATEGORIES = [
  "paper",
  "reference",
  "notes",
  "other",
];
// Neutral fallback when the model returns an unrecognized category: "other"
// claims the least about the document's intent.
const DEFAULT_PDF_CATEGORY = "other";
// Directory segment pdfRelPath() places every distilled PDF note under.
export const PDFS_SUBDIR = "pdfs";
// The classify prompt only embeds the first CLASSIFY_EXCERPT_CHARS characters of
// the (already bounded) body.
const CLASSIFY_EXCERPT_CHARS = 3000;
// Papers are long → bound the distill input so a single paper can't blow up
// token cost (hybrid routing would otherwise push it to Sonnet on size).
// Mirrors articleDistiller's 24k-char bound.
const MAX_BODY_CHARS = 24_000;
20
// Two-stage LLM pass over one parsed PDF: classify it, then distill it to
// markdown. Both calls go through withRateLimitRetry.
//
// parsed: object with at least `text` (plus whatever the prompt builders read).
// cfg:    run config; `cfg.models.classify`, `cfg.models.distill` and
//         `cfg.provider` pick the models.
//
// Resolves to `{ classification, markdown }`, or null when the scrubbed body is
// blank or the model returns empty markdown.
export async function distillPdf(parsed, cfg) {
  const client = maybeAnthropicClient(cfg);
  const classifyModel = normalizeModelName(cfg.models.classify, cfg.provider);
  const distillModel = normalizeModelName(cfg.models.distill, cfg.provider);

  // Extracted text still flows to a provider — scrub keys/paths/emails, then
  // cap the length so one long paper cannot dominate token cost.
  const scrubbed = scrub(parsed.text);
  const body = scrubbed.slice(0, MAX_BODY_CHARS);
  if (body.trim() === "") {
    return null;
  }

  // The request objects are built inside the thunks so every retry attempt
  // issues a freshly constructed call.
  const runClassify = () =>
    callLLM(cfg, client, {
      prompt: classifyPrompt(parsed, body),
      model: classifyModel,
      maxTokens: 200,
      cost: { stage: "pdf-classify" },
    });
  const classifyReply = await withRateLimitRetry(runClassify);
  const classification = parsePdfClassification(classifyReply);

  const runDistill = () =>
    callLLM(cfg, client, {
      prompt: distillPrompt(parsed, body),
      model: distillModel,
      maxTokens: 1500,
      cost: { stage: "pdf-distill" },
    });
  const distillReply = await withRateLimitRetry(runDistill);
  const markdown = distillReply.trim();
  if (markdown.length === 0) {
    return null;
  }

  return { classification, markdown };
}
45
// Builds the classification prompt: instructions asking for a JSON object with
// `category` and `confidence`, followed by the document title and the first
// CLASSIFY_EXCERPT_CHARS characters of `body`.
//
// parsed: parsed PDF; only `parsed.title` is read here.
// body:   document text to excerpt (string).
// Returns the prompt string.
function classifyPrompt(parsed, body) {
  return `Classify this PDF document into exactly one category. Output JSON only:
{ "category": "paper" | "reference" | "notes" | "other",
"confidence": number (0..1) }

paper = a research/academic paper (abstract, methods, results, contributions)
reference = documentation, a spec, manual, datasheet, or material to look up later
notes = lecture notes, slides, a course handout, or personal study notes
other = anything that doesn't fit the above

Title: ${parsed.title}

${body.slice(0, CLASSIFY_EXCERPT_CHARS)}`;
}
59
// Builds the distillation prompt: section layout and copyright rules, followed
// by the document title, its source path, and the full `body`.
//
// parsed: parsed PDF; `parsed.title` and `parsed.filePath` are read here.
// body:   document text to distill (string), embedded verbatim.
// Returns the prompt string.
//
// NOTE(review): `parsed.title` and `parsed.filePath` are interpolated as-is.
// In distillPdf only `parsed.text` goes through scrub(), so the local file path
// reaches the provider unscrubbed — confirm that is intended.
function distillPrompt(parsed, body) {
  return `Distill this PDF document into a durable knowledge note. Output markdown
only — no preamble, start with '## Summary'. Use these sections:

- ## Summary (2-3 sentences, in your own words)
- ## Key Points (bullet list: the main claims, findings, methods, or arguments)
- ## Methods & Findings (for a paper: the approach and what it concluded; omit
this section entirely if the document isn't a study)
- ## Related (plain-English topics this connects to, one per bullet — these get
turned into wikilinks automatically, so write them as short noun phrases)

COPYRIGHT — strict: this is someone else's IP. Never reproduce more than 15
consecutive words verbatim from the source. Paraphrase everything in your own
words. Do not reproduce figures, tables, equations, or full passages. Summarize
and cite by context, never quote at length.

Title: ${parsed.title}
Source: ${parsed.filePath}
Document:
${body}`;
}
80
// Parses the classifier's raw reply into `{ category, confidence }`.
//
// text: raw model output; expected to contain one JSON object, possibly wrapped
//       in prose or code fences.
//
// Returns:
//   category   - one of PDF_CATEGORIES, matched after trimming and lowercasing
//                so "Paper" or " notes " are accepted; anything else becomes
//                DEFAULT_PDF_CATEGORY.
//   confidence - number clamped to [0, 1]; non-numeric or non-finite values
//                become 0.
// Never throws: a missing, malformed or non-string reply yields the default
// category with confidence 0.
export function parsePdfClassification(text) {
  const obj = typeof text === "string" ? extractClassificationJson(text) : null;
  if (obj === null) {
    return { category: DEFAULT_PDF_CATEGORY, confidence: 0 };
  }
  const rawCat =
    typeof obj.category === "string" ? obj.category.trim().toLowerCase() : "";
  const category = PDF_CATEGORIES.includes(rawCat)
    ? rawCat
    : DEFAULT_PDF_CATEGORY;
  const confRaw =
    typeof obj.confidence === "number"
      ? obj.confidence
      : Number(obj.confidence ?? 0);
  const confidence = Number.isFinite(confRaw)
    ? Math.max(0, Math.min(1, confRaw))
    : 0;
  return { category, confidence };
}

// Returns the first JSON object that can be parsed out of `text`, or null.
// The widest `{ … }` span is tried first (this handles nested objects). If that
// span is not valid JSON — e.g. the model appended prose containing another
// brace — the shortest span starting at the first `{` is tried instead, which
// covers the flat object the classify prompt asks for.
function extractClassificationJson(text) {
  for (const pattern of [/\{[\s\S]*\}/, /\{[\s\S]*?\}/]) {
    const match = text.match(pattern);
    if (!match) {
      return null;
    }
    try {
      const parsed = JSON.parse(match[0]);
      if (parsed !== null && typeof parsed === "object") {
        return parsed;
      }
    } catch {
      // This span is not valid JSON — fall through to the narrower pattern.
    }
  }
  return null;
}
103
// Slug for a PDF note: up to 60 chars of the kebab-cased title plus the first 8
// hex chars of sha256(source path). Keying the suffix off the path (not the
// content hash) keeps the slug stable across content edits, so re-distilling a
// re-extracted PDF overwrites the same note instead of orphaning the old one
// (mirrors articleSlug). A title that kebabs to nothing yields `pdf-<suffix>`.
export function pdfSlug(parsed) {
  const titlePart = kebab(parsed.title).slice(0, 60);
  const hasher = createHash("sha256");
  hasher.update(parsed.filePath);
  const suffix = hasher.digest("hex").slice(0, 8);
  if (titlePart.length > 0) {
    return `${titlePart}-${suffix}`;
  }
  return `pdf-${suffix}`;
}
115
// Relative path of the note for `parsed`: `<PDFS_SUBDIR>/<pdfSlug>.md`, joined
// with the platform path separator.
export function pdfRelPath(parsed) {
  const fileName = `${pdfSlug(parsed)}.md`;
  return join(PDFS_SUBDIR, fileName);
}
118
// Renders the YAML frontmatter block for a distilled PDF note.
//
// parsed:    supplies filePath, title, pageCount and hash.
// distilled: supplies classification.category and classification.confidence.
//
// Returns the block as a string: opening `---`, one `key: value` line per
// field, closing `---`, and a trailing newline. `distilled_at` is stamped with
// the current time, so the output differs from call to call.
//
// NOTE(review): only source_title is quoted/escaped; source_path is written as
// a plain scalar — confirm paths containing ": " or " #" cannot occur.
export function buildPdfFrontmatter(parsed, distilled) {
  const fields = [
    "type: pdf",
    `category: ${distilled.classification.category}`,
    `source_path: ${parsed.filePath}`,
    `source_title: "${escapeYaml(parsed.title)}"`,
    `pages: ${parsed.pageCount}`,
    `distilled_at: ${new Date().toISOString()}`,
    `confidence: ${distilled.classification.confidence}`,
    `hash: ${parsed.hash}`,
  ];
  return ["---", ...fields, "---", ""].join("\n");
}
130
// Escapes `s` for use inside a double-quoted YAML scalar (the caller adds the
// surrounding quotes).
//
// Backslashes are escaped first so the escapes added afterwards are not doubled.
// Without that, a title ending in `\` would swallow the closing quote. Line
// breaks become `\n` / `\r` escapes so a multi-line PDF metadata title cannot
// spill over into the following frontmatter lines.
//
// s: string to escape.
// Returns the escaped string.
function escapeYaml(s) {
  return s
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r/g, "\\r")
    .replace(/\n/g, "\\n");
}
133
// Local copy of writer.kebab() — writer.ts depends on this module, so importing
// from it would create a cycle (mirrors articleDistiller.kebab).
// Lowercases `s` and joins its runs of [a-z0-9] with single hyphens; every other
// character acts as a separator, and no hyphen is left at either end.
function kebab(s) {
  const words = s
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((word) => word.length > 0);
  return words.join("-");
}
141
+ //# sourceMappingURL=pdfDistiller.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfDistiller.js","sourceRoot":"","sources":["../../src/pipeline/pdfDistiller.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EACL,OAAO,EACP,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAMtC,MAAM,CAAC,MAAM,cAAc,GAAkB;IAC3C,OAAO;IACP,WAAW;IACX,OAAO;IACP,OAAO;CACR,CAAC;AAEF,4EAA4E;AAC5E,gDAAgD;AAChD,MAAM,oBAAoB,GAAgB,OAAO,CAAC;AAElD,MAAM,CAAC,MAAM,WAAW,GAAG,MAAM,CAAC;AAElC,4EAA4E;AAC5E,yEAAyE;AACzE,6CAA6C;AAC7C,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,cAAc,GAAG,MAAM,CAAC;AAY9B,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAiB,EACjB,GAAW;IAEX,MAAM,MAAM,GAAG,oBAAoB,CAAC,GAAG,CAAC,CAAC;IACzC,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE1E,sEAAsE;IACtE,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,cAAc,CAAC,CAAC;IACzD,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5C,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,cAAc,CAAC,MAAM,EAAE,IAAI,CAAC;QACpC,KAAK,EAAE,aAAa;QACpB,SAAS,EAAE,GAAG;QACd,IAAI,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE;KAChC,CAAC,CACH,CAAC;IACF,MAAM,cAAc,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;IAEvD,MAAM,QAAQ,GAAG,CACf,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5B,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,aAAa,CAAC,MAAM,EAAE,IAAI,CAAC;QACnC,KAAK,EAAE,YAAY;QACnB,SAAS,EAAE,IAAI;QACf,IAAI,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE;KAC/B,CAAC,CACH,CACF,CAAC,IAAI,EAAE,CAAC;IACT,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC;AACtC,CAAC;AAED,SAAS,cAAc,CAAC,MAAiB,EAAE,IAAY;IACrD,OAAO;;;;;;;;;SASA,MAAM,CAAC,KAAK;;EAEnB,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,sBAAsB,CAAC,EAAE,CAAC;AAC1C,CAAC;AAED,SAAS,aAAa,CAAC,MAAiB,EAAE,IAAY;IACpD,OAAO;;;;;;;;;;;;;;;SAeA,MAAM,CAAC,KAAK;UACX,MAAM,CAAC,QAAQ;;EAEvB,IAAI,EAAE,CAAC;AACT,CAAC;AAED,MA
AM,UAAU,sBAAsB,CAAC,IAAY;IACjD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IACxC,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IACrE,IAAI,GAA4B,CAAC;IACjC,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAA4B,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IACD,MAAM,MAAM,GAAG,OAAO,GAAG,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;IACpE,MAAM,QAAQ,GAAI,cAA2B,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5D,CAAC,CAAE,MAAsB;QACzB,CAAC,CAAC,oBAAoB,CAAC;IACzB,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,UAAU,KAAK,QAAQ;QAChC,CAAC,CAAC,GAAG,CAAC,UAAU;QAChB,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAClC,MAAM,UAAU,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC;QACzC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QACnC,CAAC,CAAC,CAAC,CAAC;IACN,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC;AAClC,CAAC;AAED,iFAAiF;AACjF,+EAA+E;AAC/E,iFAAiF;AACjF,6CAA6C;AAC7C,MAAM,UAAU,OAAO,CAAC,MAAiB;IACvC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC;SAChC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;SACvB,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACf,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC,OAAO,MAAM,EAAE,CAAC;AACjE,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,MAAiB;IAC1C,OAAO,IAAI,CAAC,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACpD,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,MAAiB,EACjB,SAAuB;IAEvB,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC7D,KAAK,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC9C,KAAK,CAAC,IAAI,CAAC,kBAAkB,UAAU,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,iBAAiB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,eAAe,SAAS,CAAC,cAAc,CAAC,UAAU,EAAE,CAAC
,CAAC;IACjE,KAAK,CAAC,IAAI,CAAC,SAAS,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACtB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;AAChC,CAAC;AAED,gFAAgF;AAChF,iEAAiE;AACjE,SAAS,KAAK,CAAC,CAAS;IACtB,OAAO,CAAC;SACL,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;AAC7B,CAAC"}
@@ -0,0 +1,76 @@
1
import { createHash } from "node:crypto";
import { readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
import { basename, join } from "node:path";
import { extractText, getDocumentProxy } from "unpdf";
5
// Reads one PDF from disk and extracts what the distiller needs.
//
// filePath: path of the PDF to read.
//
// Resolves to `{ filePath, hash, title, text, pageCount }`:
//   hash      - sha256 hex digest of the raw file bytes
//   title     - metadata Title when non-blank, otherwise the file name
//   text      - extracted text (page arrays are joined with "\n")
//   pageCount - total pages reported by the extractor
//
// Rejects when the file cannot be read or the bytes are not a parsable PDF.
// The document proxy is destroyed before returning, whether or not extraction
// succeeded, so a long scan does not keep every parsed document alive.
export async function parsePdf(filePath) {
  const bytes = readFileSync(filePath);
  const hash = createHash("sha256").update(bytes).digest("hex");
  // unpdf bundles pdf.js; getDocumentProxy throws on corrupt/non-PDF bytes,
  // which the run layer catches per-file and records (never crashes the run).
  const proxy = await getDocumentProxy(new Uint8Array(bytes));
  try {
    const { totalPages, text } = await extractText(proxy, { mergePages: true });
    const metaTitle = await readMetadataTitle(proxy);
    return {
      filePath,
      hash,
      title: pdfTitle(metaTitle, filePath),
      text: Array.isArray(text) ? text.join("\n") : text,
      pageCount: totalPages,
    };
  } finally {
    try {
      // Best-effort release of the pdf.js document; a cleanup failure must not
      // mask the extraction result or the original error.
      await proxy.destroy?.();
    } catch {
      // Nothing useful to do if teardown itself fails.
    }
  }
}
21
// Lists every PDF under `dir` (recursively, via walk) with the sha256 hex digest
// of its bytes.
//
// dir: root directory to scan.
// Returns an array of `{ filePath, hash }`. Files that cannot be read or hashed
// are left out, so one bad file never fails the scan.
export function scanPdfs(dir) {
  const pdfPaths = [];
  walk(dir, pdfPaths);
  return pdfPaths.flatMap((filePath) => {
    try {
      const digest = createHash("sha256");
      digest.update(readFileSync(filePath));
      return [{ filePath, hash: digest.digest("hex") }];
    } catch {
      // unreadable / mid-write file — skip, never fail the whole scan
      return [];
    }
  });
}
36
// Pure title picker: the trimmed metadata Title when it is non-blank, otherwise
// the file's base name with a trailing ".pdf" (any letter case) removed.
export function pdfTitle(metaTitle, filePath) {
  const fromMetadata = (metaTitle ?? "").trim();
  return fromMetadata.length > 0
    ? fromMetadata
    : basename(filePath).replace(/\.pdf$/i, "");
}
43
// Reads `info.Title` from the document's metadata.
// Resolves to the title when it is a string, otherwise undefined. Any failure
// while fetching or reading the metadata also resolves to undefined, so a
// missing title never aborts the parse.
async function readMetadataTitle(proxy) {
  try {
    const { Title: candidate } = (await proxy.getMetadata())?.info ?? {};
    if (typeof candidate === "string") {
      return candidate;
    }
  } catch {
    // Metadata unavailable — treated the same as "no title".
  }
  return undefined;
}
53
// Recursively collects the paths of all `.pdf` files (any letter case) under
// `dir` into `acc`.
//
// dir:  directory to scan.
// acc:  array that receives the full path of every PDF found (mutated).
// seen: resolved paths of directories already visited. Callers normally omit
//       it; it is threaded through the recursion.
//
// statSync follows symlinks, so without `seen` a directory symlink pointing
// back at an ancestor would be re-entered until the OS refuses to resolve the
// path, and every PDF would be listed many times under different paths. Each
// directory is therefore visited at most once, keyed by its resolved path.
// Unreadable directories and entries that cannot be stat'ed are skipped
// silently so one bad entry never aborts the scan.
function walk(dir, acc, seen = new Set()) {
  let resolvedDir;
  try {
    resolvedDir = realpathSync(dir);
  } catch {
    return;
  }
  if (seen.has(resolvedDir)) {
    return;
  }
  seen.add(resolvedDir);

  let entries;
  try {
    entries = readdirSync(dir);
  } catch {
    return;
  }
  for (const name of entries) {
    const full = join(dir, name);
    let st;
    try {
      st = statSync(full);
    } catch {
      continue;
    }
    if (st.isDirectory()) {
      walk(full, acc, seen);
    } else if (st.isFile() && name.toLowerCase().endsWith(".pdf")) {
      acc.push(full);
    }
  }
}
76
+ //# sourceMappingURL=pdfReader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfReader.js","sourceRoot":"","sources":["../../src/pipeline/pdfReader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC3C,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,MAAM,OAAO,CAAC;AAwBtD,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,QAAgB;IAC7C,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAE9D,0EAA0E;IAC1E,4EAA4E;IAC5E,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5D,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5E,MAAM,SAAS,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,CAAC;IAEjD,OAAO;QACL,QAAQ;QACR,IAAI;QACJ,KAAK,EAAE,QAAQ,CAAC,SAAS,EAAE,QAAQ,CAAC;QACpC,IAAI,EAAE,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI;QAClD,SAAS,EAAE,UAAU;KACtB,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACjB,MAAM,GAAG,GAAgB,EAAE,CAAC;IAC5B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACxE,GAAG,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;QAAC,MAAM,CAAC;YACP,gEAAgE;QAClE,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,gFAAgF;AAChF,MAAM,UAAU,QAAQ,CAAC,SAA6B,EAAE,QAAgB;IACtE,MAAM,CAAC,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACnC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC3B,OAAO,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACnD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,KAAmD;IAEnD,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,WAAW,EAAE,CAEtC,CAAC;QACF,MAAM,KAAK,GAAG,IAAI,EAAE,IAAI,EAAE,KAAK,CAAC;QAChC,OAAO,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IAC
nB,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,GAAW,EAAE,GAAa;IACtC,IAAI,OAAiB,CAAC;IACtB,IAAI,CAAC;QACH,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;IACT,CAAC;IACD,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QAC7B,IAAI,EAAE,CAAC;QACP,IAAI,CAAC;YACH,EAAE,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QACD,IAAI,EAAE,CAAC,WAAW,EAAE;YAAE,IAAI,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;aACjC,IAAI,EAAE,CAAC,MAAM,EAAE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC"}
@@ -10,6 +10,8 @@ import { parseSession } from "./parser.js";
10
10
  import { scanSessions } from "./scanner.js";
11
11
  import { scanArticles } from "./articleReader.js";
12
12
  import { distillArticle } from "./articleDistiller.js";
13
+ import { parsePdf, scanPdfs } from "./pdfReader.js";
14
+ import { distillPdf } from "./pdfDistiller.js";
13
15
  import { scrub } from "./scrubber.js";
14
16
  import { summarizeProject } from "./summarizer.js";
15
17
  import { filterToolCalls } from "./toolCallFilter.js";
@@ -32,6 +34,10 @@ export async function runPipeline(cfg, opts = {}) {
32
34
  articlesDistilled: 0,
33
35
  articlesSkipped: 0,
34
36
  articlesErrored: 0,
37
+ pdfsScanned: 0,
38
+ pdfsDistilled: 0,
39
+ pdfsSkipped: 0,
40
+ pdfsErrored: 0,
35
41
  };
36
42
  const interactive = !opts.quiet;
37
43
  // File-only logging — used for the daemon run.log regardless of UI mode.
@@ -52,9 +58,11 @@ export async function runPipeline(cfg, opts = {}) {
52
58
  ? "run --rewrite-only"
53
59
  : opts.articlesOnly
54
60
  ? "run --articles-only"
55
- : opts.full
56
- ? "run --full"
57
- : "run");
61
+ : opts.pdfsOnly
62
+ ? "run --pdfs-only"
63
+ : opts.full
64
+ ? "run --full"
65
+ : "run");
58
66
  ui.blank();
59
67
  }
60
68
  fileLog(`vir run start (full=${opts.full ? "true" : "false"} rewriteOnly=${opts.rewriteOnly ? "true" : "false"})`);
@@ -147,6 +155,34 @@ export async function runPipeline(cfg, opts = {}) {
147
155
  db.close();
148
156
  return summary;
149
157
  }
158
+ // --pdfs-only: skip the session AND article pipelines.
159
+ if (opts.pdfsOnly) {
160
+ if (!cfg.pdfsDir) {
161
+ if (interactive) {
162
+ ui.row(ui.warn(ui.WARN_GLYPH), ui.text("pdfsDir is not set — nothing to distill"));
163
+ }
164
+ fileLog("pdfs-only run but pdfsDir is unset");
165
+ db.close();
166
+ return summary;
167
+ }
168
+ await runPdfPhase(cfg, db, writer, summary, fileLog, interactive);
169
+ if (interactive) {
170
+ ui.blank();
171
+ ui.divider();
172
+ ui.summary({
173
+ pdfs: { value: summary.pdfsScanned, color: ui.info },
174
+ distilled: { value: summary.pdfsDistilled, color: ui.success },
175
+ skipped: { value: summary.pdfsSkipped, color: ui.warn },
176
+ errored: {
177
+ value: summary.pdfsErrored,
178
+ color: summary.pdfsErrored > 0 ? ui.errorColor : ui.dim,
179
+ },
180
+ });
181
+ ui.divider();
182
+ }
183
+ db.close();
184
+ return summary;
185
+ }
150
186
  const distiller = new Distiller(cfg, {
151
187
  forceDistillModel: opts.forceDistillModel,
152
188
  });
@@ -193,7 +229,8 @@ export async function runPipeline(cfg, opts = {}) {
193
229
  // the end-of-run sweep back-fills. The sweep heals them when Ollama is up.
194
230
  const pendingEmbedding = db.listEmbeddingTargets().length +
195
231
  db.listTopicEmbeddingTargets().length +
196
- db.listArticleEmbeddingTargets().length;
232
+ db.listArticleEmbeddingTargets().length +
233
+ db.listPdfEmbeddingTargets().length;
197
234
  if (interactive) {
198
235
  ui.line(ui.dim(` ${discovered.length} files found · ${cached} cached · ${preflightNew} new` +
199
236
  (pendingEmbedding > 0
@@ -260,6 +297,20 @@ export async function runPipeline(cfg, opts = {}) {
260
297
  ui.divider();
261
298
  ui.line(ui.dim(" estimates assume typical output sizes; actuals may vary ±30%"));
262
299
  }
300
+ // PDFs are estimated separately: papers exceed the 24k-char distill cap, so
301
+ // the per-PDF input is the cap (an accurate figure, not just an upper bound).
302
+ // No text extraction here — only count new PDFs by their cheap byte hash.
303
+ if (cfg.pdfsDir && cfg.distillPdfs) {
304
+ const newPdfs = scanPdfs(cfg.pdfsDir).filter((f) => opts.full || !db.isPdfProcessed(f.filePath, f.hash)).length;
305
+ if (newPdfs > 0) {
306
+ const perPdf = computeCost(cfg.provider, classifyModel, Math.ceil(3000 / CHARS_PER_TOKEN), 200, cfg.pricing, cfg.kieTopUpTier) +
307
+ computeCost(cfg.provider, distillModel, Math.ceil(24_000 / CHARS_PER_TOKEN), 1500, cfg.pricing, cfg.kieTopUpTier);
308
+ if (interactive) {
309
+ ui.line(ui.dim(` ${newPdfs} new PDF(s): ~${ui.formatUsd(perPdf)} each (input capped at 24k chars) → ~${ui.formatUsd(perPdf * newPdfs)}`));
310
+ }
311
+ fileLog(`dry-run: newPdfs=${newPdfs} perPdf=${ui.formatUsd(perPdf)} estTotal=${ui.formatUsd(perPdf * newPdfs)}`);
312
+ }
313
+ }
263
314
  fileLog(`dry-run: sessions=${estimated} filtered=${filteredOut} estTotal=${ui.formatUsd(totalCost)}`);
264
315
  db.close();
265
316
  return summary;
@@ -384,6 +435,11 @@ export async function runPipeline(cfg, opts = {}) {
384
435
  if (cfg.articlesDir && cfg.distillArticles) {
385
436
  await runArticlePhase(cfg, db, writer, summary, fileLog, interactive);
386
437
  }
438
+ // Third input source: PDFs / papers. Gated identically; an install without
439
+ // pdfsDir skips this entirely (the article pattern, cloned).
440
+ if (cfg.pdfsDir && cfg.distillPdfs) {
441
+ await runPdfPhase(cfg, db, writer, summary, fileLog, interactive);
442
+ }
387
443
  // Self-heal: back-fill notes whose write-time embedding silently no-op'd
388
444
  // (Ollama down during distill). Without this a transient outage is permanent
389
445
  // — the note never enters the embedding-search candidate set. Best-effort and
@@ -411,7 +467,7 @@ export async function runPipeline(cfg, opts = {}) {
411
467
  catch (err) {
412
468
  fileLog(`embedding sweep failed: ${err.message}`);
413
469
  }
414
- fileLog(`vir run done — scanned=${summary.scanned} new=${summary.scanned - summary.alreadyProcessed} distilled=${summary.distilled} skipped=${summary.skippedByFilter} lowConf=${summary.lowConfidence} errored=${summary.errored} articles=${summary.articlesDistilled}`);
470
+ fileLog(`vir run done — scanned=${summary.scanned} new=${summary.scanned - summary.alreadyProcessed} distilled=${summary.distilled} skipped=${summary.skippedByFilter} lowConf=${summary.lowConfidence} errored=${summary.errored} articles=${summary.articlesDistilled} pdfs=${summary.pdfsDistilled}`);
415
471
  if (interactive) {
416
472
  ui.blank();
417
473
  ui.divider();
@@ -431,6 +487,9 @@ export async function runPipeline(cfg, opts = {}) {
431
487
  if (cfg.articlesDir && cfg.distillArticles) {
432
488
  stats.articles = { value: summary.articlesDistilled, color: ui.success };
433
489
  }
490
+ if (cfg.pdfsDir && cfg.distillPdfs) {
491
+ stats.pdfs = { value: summary.pdfsDistilled, color: ui.success };
492
+ }
434
493
  ui.summary(stats);
435
494
  ui.divider();
436
495
  }
@@ -522,6 +581,93 @@ async function runArticlePhase(cfg, db, writer, summary, fileLog, interactive) {
522
581
  }
523
582
  }
524
583
  }
584
+ // Third input source: PDFs / papers. Mirrors runArticlePhase, but scanPdfs
585
+ // returns cheap {path, hash} entries (PDF text extraction is expensive via
586
+ // pdf.js) and only files that aren't already processed get parsed — instead of
587
+ // extracting the whole directory up front. Each PDF is hashed for idempotency
588
+ // and wrapped in its own try/catch so one bad file never aborts the run.
589
/**
 * Distills every not-yet-processed PDF under `cfg.pdfsDir`.
 *
 * Flow per run:
 *  1. `scanPdfs` lists `{ filePath, hash }` entries; the count is stored in
 *     `summary.pdfsScanned`.
 *  2. For each entry that `db.isPdfProcessed` does not already know, the PDF
 *     is parsed, distilled, written through `writer.writePdf`, and recorded
 *     with `db.recordPdf`. A falsy distill result is recorded as skipped.
 *  3. Any failure for a single PDF is counted in `summary.pdfsErrored`,
 *     logged, and recorded against the scanned hash; the loop then continues
 *     with the next file.
 *
 * Error text is derived with `err?.message ?? String(err)` so that a thrown
 * non-Error value (including `null`/`undefined`) cannot raise inside the
 * handler and abort the loop.
 *
 * @param {object} cfg - Run configuration; `cfg.pdfsDir` is the scan root.
 * @param {object} db - Store exposing `isPdfProcessed` and `recordPdf`.
 * @param {object} writer - Note writer exposing async `writePdf`.
 * @param {object} summary - Mutable run counters (`pdfsScanned`,
 *   `pdfsDistilled`, `pdfsSkipped`, `pdfsErrored`, `notesWritten`).
 * @param {(msg: string) => void} fileLog - File-only logger.
 * @param {boolean} interactive - When true, progress is also shown via `ui`.
 * @returns {Promise<void>}
 */
async function runPdfPhase(cfg, db, writer, summary, fileLog, interactive) {
  if (!cfg.pdfsDir) {
    return;
  }
  const scanSpinner = interactive ? ui.spinner("scanning pdfs").start() : null;
  let sources;
  try {
    sources = scanPdfs(cfg.pdfsDir);
  } catch (err) {
    if (scanSpinner) {
      scanSpinner.fail(ui.errorColor("pdf scan failed"));
    }
    fileLog(`pdf scan failed: ${err?.message ?? String(err)}`);
    return;
  }
  summary.pdfsScanned = sources.length;
  if (scanSpinner) {
    scanSpinner.succeed(ui.text(`scanned ${ui.info(String(sources.length))} ${ui.dim("pdfs")}`));
  }
  fileLog(`scanned ${sources.length} pdfs`);
  for (const src of sources) {
    try {
      if (db.isPdfProcessed(src.filePath, src.hash)) {
        continue;
      }
      // Extraction is heavy and only happens for new files (gated above).
      const parsed = await parsePdf(src.filePath);
      const distilled = await distillPdf(parsed, cfg);
      if (!distilled) {
        summary.pdfsSkipped += 1;
        db.recordPdf({
          path: parsed.filePath,
          hash: parsed.hash,
          skipped: true,
        });
        continue;
      }
      const notePath = await writer.writePdf(parsed, distilled);
      summary.pdfsDistilled += 1;
      summary.notesWritten.push(notePath);
      db.recordPdf({
        path: parsed.filePath,
        hash: parsed.hash,
        skipped: false,
        notePath,
        content: distilled.markdown,
        category: distilled.classification.category,
        title: parsed.title,
        pages: parsed.pageCount,
        confidence: distilled.classification.confidence,
        distilledAt: new Date().toISOString(),
      });
      if (interactive) {
        ui.categoryRow(distilled.classification.category, parsed.title);
      }
      fileLog(`distilled pdf → ${distilled.classification.category}/${parsed.title}`);
      if (distilled.classification.confidence >= 0.8) {
        notify(`Vir — new ${distilled.classification.category}`, parsed.title);
      }
      // Fixed 2 s pause after each distilled PDF before moving on.
      await new Promise((r) => setTimeout(r, 2000));
    } catch (err) {
      summary.pdfsErrored += 1;
      // `err` may be any thrown value; never dereference it unguarded here.
      const msg = err?.message ?? String(err);
      if (interactive) {
        ui.row(ui.errorColor(ui.CROSS), ui.text(`pdf error: ${msg}`));
      }
      fileLog(`error on pdf ${src.filePath}: ${msg}`);
      try {
        // Record with the source hash so a corrupt PDF isn't retried every run
        // (same idempotency contract as articles).
        db.recordPdf({
          path: src.filePath,
          hash: src.hash,
          skipped: false,
          error: msg,
        });
      } catch {
        // ignore record errors
      }
    }
  }
}
525
671
  async function rewriteOne(writer, row) {
526
672
  const parsed = {
527
673
  path: row.path,