@djolex999/vir-cli 0.10.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ const CATEGORY_DIRS = {
15
15
  tools: "tool",
16
16
  articles: "article",
17
17
  topics: "topic",
18
+ pdfs: "pdf",
18
19
  };
19
20
  const WIRE_CATEGORIES = new Set([
20
21
  "pattern",
@@ -23,6 +24,7 @@ const WIRE_CATEGORIES = new Set([
23
24
  "tool",
24
25
  "article",
25
26
  "topic",
27
+ "pdf",
26
28
  ]);
27
29
  // Minimal YAML-block parser, kebab-flat. Mirrors mcp/server.ts deliberately —
28
30
  // the JSON contract is its own isolated surface and must not couple to the MCP
@@ -58,6 +60,10 @@ function categoryOf(fm, relPath, topicsDir) {
58
60
  return "article";
59
61
  if (fm.type === "topic")
60
62
  return "topic";
63
+ // PDF sub-taxonomy (paper/reference/notes/other) collapses to the single
64
+ // "pdf" wire bucket, exactly like articles collapse to "article".
65
+ if (fm.type === "pdf")
66
+ return "pdf";
61
67
  if (fm.category && WIRE_CATEGORIES.has(fm.category)) {
62
68
  return fm.category;
63
69
  }
@@ -1 +1 @@
1
- {"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AA2CrC,+EAA+E;AAC/E,4EAA4E;AAC5E,+EAA+E;AAC/E,MAAM,aAAa,GAAqC;IACtD,QAAQ,EAAE,SAAS;IACnB,OAAO,EAAE,QAAQ;IACjB,SAAS,EAAE,UAAU;IACrB,KAAK,EAAE,MAAM;IACb,QAAQ,EAAE,SAAS;IACnB,MAAM,EAAE,OAAO;CAChB,CAAC;AAEF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAS;IACtC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,OAAO;CACR,CAAC,CAAC;AAEH,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACrB,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,KAAK,CAAC,CAAC;YAAE,SAAS;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC/B,IAAI,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,IACE,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC1C,CAAC;YACD,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,OAAO,CAAC,OAAe;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;IAC3D,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU,CACjB,EAA0B,EAC1B,OAAe,EACf,SAAiB;IAEjB,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO;QAAE,OAAO,OAAO,CAAC;IACxC,IAAI,EAAE,CAAC,QAAQ,IAAI,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,OAAO,EAAE,CAAC,QAA4B,CAAC;IACzC,CAAC;IACD,2EAA2E;IAC3E,8EAA8E;IAC9E,MAAM,GA
AG,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,MAAM,GAAG,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;IAC/C,OAAO,aAAa,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;AACzC,CAAC;AAED,0EAA0E;AAC1E,8EAA8E;AAC9E,6EAA6E;AAC7E,iFAAiF;AACjF,gFAAgF;AAChF,2EAA2E;AAC3E,6EAA6E;AAC7E,MAAM,UAAU,iBAAiB,CAC/B,IAAiB,EACjB,SAAiB,EACjB,SAAS,GAAG,QAAQ;IAEpB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,EAAE,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC;QACnC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,QAAQ,EAAE,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC;YAC5C,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5C,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAC3B,OAAO,EAAE,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YAChE,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,IAAkB,EAClB,OAAe;IAEf,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAClC,CAAC;AAED,8EAA8E;AAC9E,+EAA+E;AAC/E,mEAAmE;AACnE,MAAM,UAAU,oBAAoB,CAClC,SAAkB,EAClB,UAAyB,EACzB,YAAoB,EACpB,MAAY,IAAI,IAAI,EAAE;IAEtB,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU;QAAE,OAAO,MAAM,CAAC;IAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACtC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,QAAQ,GAAG,CAAC,GAAG,YAAY,GAAG,SAAS,CAAC;IAC9C,OAAO,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC;AAC7D,CAAC;AAgBD,MAAM,UAAU,iBAAiB,CAAC,CAAe;IAC/C,OAAO;QACL,MAAM,EAAE,oBAAoB,CAC1B,CAAC,CAAC,eAAe,EACjB,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,YAAY,EACd,CAAC,CAAC,GAAG,CACN;QACD,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,QAAQ,EAAE,CAAC,CAAC,QAAQ;QACpB,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,MAAM,EAAE,EAAE,SAAS,EAAE,CAAC,C
AAC,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC,WAAW,EAAE;QAC9D,OAAO,EAAE,CAAC,CAAC,OAAO;KACnB,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/output/json.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AA4CrC,+EAA+E;AAC/E,4EAA4E;AAC5E,+EAA+E;AAC/E,MAAM,aAAa,GAAqC;IACtD,QAAQ,EAAE,SAAS;IACnB,OAAO,EAAE,QAAQ;IACjB,SAAS,EAAE,UAAU;IACrB,KAAK,EAAE,MAAM;IACb,QAAQ,EAAE,SAAS;IACnB,MAAM,EAAE,OAAO;IACf,IAAI,EAAE,KAAK;CACZ,CAAC;AAEF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAS;IACtC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAEH,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACrB,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,KAAK,CAAC,CAAC;YAAE,SAAS;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC/B,IAAI,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,IACE,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC1C,CAAC;YACD,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,OAAO,CAAC,OAAe;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;IAC3D,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAS,UAAU,CACjB,EAA0B,EAC1B,OAAe,EACf,SAAiB;IAEjB,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO;QAAE,OAAO,OAAO,CAAC;IACxC,yEAAyE;IACzE,kEAAkE;IAClE,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;QAAE,OAAO,KAAK,CAAC;IACpC,IAAI,EAAE,CAAC,QAAQ,IAAI,eAAe,CAAC,GAAG,CAAC,
EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,OAAO,EAAE,CAAC,QAA4B,CAAC;IACzC,CAAC;IACD,2EAA2E;IAC3E,8EAA8E;IAC9E,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,MAAM,GAAG,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;IAC/C,OAAO,aAAa,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;AACzC,CAAC;AAED,0EAA0E;AAC1E,8EAA8E;AAC9E,6EAA6E;AAC7E,iFAAiF;AACjF,gFAAgF;AAChF,2EAA2E;AAC3E,6EAA6E;AAC7E,MAAM,UAAU,iBAAiB,CAC/B,IAAiB,EACjB,SAAiB,EACjB,SAAS,GAAG,QAAQ;IAEpB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,EAAE,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC;QACnC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,QAAQ,EAAE,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC;YAC5C,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5C,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAC3B,OAAO,EAAE,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YAChE,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,IAAkB,EAClB,OAAe;IAEf,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAClC,CAAC;AAED,8EAA8E;AAC9E,+EAA+E;AAC/E,mEAAmE;AACnE,MAAM,UAAU,oBAAoB,CAClC,SAAkB,EAClB,UAAyB,EACzB,YAAoB,EACpB,MAAY,IAAI,IAAI,EAAE;IAEtB,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU;QAAE,OAAO,MAAM,CAAC;IAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACtC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,QAAQ,GAAG,CAAC,GAAG,YAAY,GAAG,SAAS,CAAC;IAC9C,OAAO,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC;AAC7D,CAAC;AAgBD,MAAM,UAAU,iBAAiB,CAAC,CAAe;IAC/C,OAAO;QACL,MAAM,EAAE,oBAAoB,CAC1B,CAAC,CAAC,eAAe,EACjB,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,YAAY,EACd,CAAC,CAAC,GAAG,CACN;QACD,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,QAAQ,EAAE,CAA
C,CAAC,QAAQ;QACpB,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,MAAM,EAAE,EAAE,SAAS,EAAE,CAAC,CAAC,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC,WAAW,EAAE;QAC9D,OAAO,EAAE,CAAC,CAAC,OAAO;KACnB,CAAC;AACJ,CAAC"}
@@ -35,6 +35,17 @@ export function selectArticleEmbeddingTargets(rows) {
35
35
  r.embedding === null &&
36
36
  r.notePath !== null);
37
37
  }
38
/**
 * PDF counterpart of the article selector, mirroring
 * db.listPdfEmbeddingTargets's SQL filter — the exact complement of
 * getPdfEmbeddings(). Same gates as articles; PDFs have no archived column.
 *
 * @param {Array<object>} rows PDF rows exposing `skipped`, `error`, `content`,
 *   `embedding` and `notePath`.
 * @returns {Array<object>} The rows that are not skipped, carry no error, have
 *   non-empty content, have a note path, and whose embedding is still null.
 */
export function selectPdfEmbeddingTargets(rows) {
    const isPending = ({ skipped, error, content, embedding, notePath }) => {
        // Skipped or failed rows are never embedded.
        if (skipped !== 0 || error !== null) {
            return false;
        }
        // Nothing to embed without content.
        if (content === null || content === "") {
            return false;
        }
        return embedding === null && notePath !== null;
    };
    return rows.filter(isPending);
}
38
49
  // Best-effort self-heal sweep. A write-time embedding miss (Ollama down when a
39
50
  // note was distilled) leaves `embedding = NULL`, which makes the note invisible
40
51
  // to the embedding-search path: it never enters getEmbeddings()'s candidate set,
@@ -52,12 +63,16 @@ export async function sweepEmbeddings(db) {
52
63
  const targets = db.listEmbeddingTargets();
53
64
  const topicTargets = db.listTopicEmbeddingTargets();
54
65
  const articleTargets = db.listArticleEmbeddingTargets();
66
+ const pdfTargets = db.listPdfEmbeddingTargets();
55
67
  if (!(await isOllamaAvailableCached())) {
56
68
  return {
57
69
  ran: false,
58
70
  embedded: 0,
59
71
  errors: 0,
60
- pending: targets.length + topicTargets.length + articleTargets.length,
72
+ pending: targets.length +
73
+ topicTargets.length +
74
+ articleTargets.length +
75
+ pdfTargets.length,
61
76
  };
62
77
  }
63
78
  let embedded = 0;
@@ -104,6 +119,21 @@ export async function sweepEmbeddings(db) {
104
119
  db.storeArticleEmbedding(a.path, vec);
105
120
  embedded += 1;
106
121
  }
122
+ // PDFs heal the same way (their own table). A paper distilled while Ollama was
123
+ // down left `embedding` NULL; back-fill keyed by the source path (the PK) via
124
+ // storePdfEmbedding — shipped WITH the getPdfEmbeddings read path so the new
125
+ // pdfs table can't reopen the NULL-embedding blind spot (the 0.8.2/0.8.3 trap).
126
+ for (const p of pdfTargets) {
127
+ if (!p.content || p.content.trim().length === 0)
128
+ continue;
129
+ const vec = await embeddingForNote(p.content);
130
+ if (!vec) {
131
+ errors += 1;
132
+ continue;
133
+ }
134
+ db.storePdfEmbedding(p.path, vec);
135
+ embedded += 1;
136
+ }
107
137
  return { ran: true, embedded, errors, pending: errors };
108
138
  }
109
139
  //# sourceMappingURL=embeddingSweep.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,GAKhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,uBAAuB,CAAC;AAE/B,+EAA+E;AAC/E,qEAAqE;AACrE,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,WAAW;AACX,MAAM,UAAU,sBAAsB,CACpC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,QAAQ,KAAK,IAAI;QACnB,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,CAC1B,CAAC;AACJ,CAAC;AAED,yDAAyD;AACzD,2EAA2E;AAC3E,8EAA8E;AAC9E,oCAAoC;AACpC,MAAM,UAAU,2BAA2B,CACzC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,EAAE,CACtE,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,8EAA8E;AAC9E,6EAA6E;AAC7E,oEAAoE;AACpE,MAAM,UAAU,6BAA6B,CAE3C,IAAS;IACT,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAaD,+EAA+E;AAC/E,gFAAgF;AAChF,iFAAiF;AACjF,uEAAuE;AACvE,2EAA2E;AAC3E,gFAAgF;AAChF,2EAA2E;AAC3E,wCAAwC;AACxC,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6EAA6E;AAC7E,oCAAoC;AACpC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,EAAW;IAC/C,MAAM,OAAO,GAAG,EAAE,CAAC,oBAAoB,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,EAAE,CAAC,yBAAyB,EAAE,CAAC;IACpD,MAAM,cAAc,GAAG,EAAE,CAAC,2BAA2B,EAAE,CAAC;IACxD,IAAI,CAAC,CAAC,MAAM,uBAAuB,EAAE,CAAC,EAAE,CAAC;QACvC,OAAO;YACL,GAAG,EAAE,KAAK;YACV,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,CAAC;YACT,OAAO,EAAE,OAAO,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,GAAG,cAAc,CAAC,MAAM;SACtE,CAAC;IACJ,CAAC;IACD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE
,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,cAAc,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAChD,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,0EAA0E;IAC1E,6EAA6E;IAC7E,2EAA2E;IAC3E,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,8EAA8E;IAC9E,6EAA6E;IAC7E,+EAA+E;IAC/E,8EAA8E;IAC9E,qCAAqC;IACrC,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACtC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AAC1D,CAAC"}
1
+ {"version":3,"file":"embeddingSweep.js","sourceRoot":"","sources":["../../src/pipeline/embeddingSweep.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,GAMhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,gBAAgB,EAChB,uBAAuB,GACxB,MAAM,uBAAuB,CAAC;AAE/B,+EAA+E;AAC/E,qEAAqE;AACrE,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAChF,WAAW;AACX,MAAM,UAAU,sBAAsB,CACpC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,QAAQ,KAAK,IAAI;QACnB,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,CAC1B,CAAC;AACJ,CAAC;AAED,yDAAyD;AACzD,2EAA2E;AAC3E,8EAA8E;AAC9E,oCAAoC;AACpC,MAAM,UAAU,2BAA2B,CACzC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,OAAO,KAAK,EAAE,CACtE,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,8EAA8E;AAC9E,6EAA6E;AAC7E,oEAAoE;AACpE,MAAM,UAAU,6BAA6B,CAE3C,IAAS;IACT,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAED,iFAAiF;AACjF,2EAA2E;AAC3E,iFAAiF;AACjF,MAAM,UAAU,yBAAyB,CACvC,IAAS;IAET,OAAO,IAAI,CAAC,MAAM,CAChB,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,KAAK,KAAK,IAAI;QAChB,CAAC,CAAC,OAAO,KAAK,IAAI;QAClB,CAAC,CAAC,OAAO,KAAK,EAAE;QAChB,CAAC,CAAC,SAAS,KAAK,IAAI;QACpB,CAAC,CAAC,QAAQ,KAAK,IAAI,CACtB,CAAC;AACJ,CAAC;AAaD,+EAA+E;AAC/E,gFAAgF;AAChF,iFAAiF;AACjF,uEAAuE;AACvE,2EAA2E;AAC3E,gFAAgF;AAChF,2EAA2E;AAC3E,wCAAwC;AACxC,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6EAA6E;AAC7E,oCAAoC;AACpC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,EAAW;IAC/C,MAAM,OAAO,GAAG,EAAE,CAAC,oBAAoB,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,EAAE,CAAC,yBAAyB,EAAE,CAAC;IACpD,MAAM,cAAc,GAAG,EAAE,CAAC,2BAA2B,EAAE,CAAC;IACxD,MAAM,UAAU,GAAG,EAAE,CAAC,uBAAuB,EAAE,CAAC;IAChD
,IAAI,CAAC,CAAC,MAAM,uBAAuB,EAAE,CAAC,EAAE,CAAC;QACvC,OAAO;YACL,GAAG,EAAE,KAAK;YACV,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,CAAC;YACT,OAAO,EACL,OAAO,CAAC,MAAM;gBACd,YAAY,CAAC,MAAM;gBACnB,cAAc,CAAC,MAAM;gBACrB,UAAU,CAAC,MAAM;SACpB,CAAC;IACJ,CAAC;IACD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,cAAc,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAChD,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,0EAA0E;IAC1E,6EAA6E;IAC7E,2EAA2E;IAC3E,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,8EAA8E;IAC9E,6EAA6E;IAC7E,+EAA+E;IAC/E,8EAA8E;IAC9E,qCAAqC;IACrC,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACtC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,+EAA+E;IAC/E,8EAA8E;IAC9E,6EAA6E;IAC7E,gFAAgF;IAChF,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC1D,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC9C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAClC,QAAQ,IAAI,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,EAAE,GAAG,EAAE,I
AAI,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AAC1D,CAAC"}
@@ -0,0 +1,141 @@
1
+ import { createHash } from "node:crypto";
2
+ import { join } from "node:path";
3
+ import { callLLM, maybeAnthropicClient, normalizeModelName, withRateLimitRetry, } from "./distiller.js";
4
+ import { scrub } from "./scrubber.js";
5
// The PDF sub-taxonomy the classify prompt asks the model to choose from.
export const PDF_CATEGORIES = ["paper", "reference", "notes", "other"];
// Neutral fallback when the model returns an unrecognized category: "other"
// claims the least about the document's intent.
const DEFAULT_PDF_CATEGORY = "other";
// Directory segment that pdfRelPath() joins in front of every PDF note name.
export const PDFS_SUBDIR = "pdfs";
// Papers are long → bound the distill input so a single paper can't blow up
// token cost (hybrid routing would otherwise push it to Sonnet on size).
// Mirrors articleDistiller's 24k-char bound.
const MAX_BODY_CHARS = 24_000;
// The classify prompt only includes this many leading characters of the body.
const CLASSIFY_EXCERPT_CHARS = 3_000;
20
/**
 * Classifies and distills one parsed PDF with the configured LLM.
 *
 * The extracted text is scrubbed and truncated to MAX_BODY_CHARS before either
 * prompt is built. Two calls are made in sequence, each wrapped in
 * withRateLimitRetry: a classify call (cfg.models.classify, 200 tokens) and a
 * distill call (cfg.models.distill, 1500 tokens).
 *
 * @param {object} parsed Parsed PDF; `text` is read here, the prompt builders
 *   read `title` and `filePath`.
 * @param {object} cfg Config providing `models.classify`, `models.distill` and
 *   `provider`.
 * @returns {Promise<{classification: object, markdown: string} | null>} `null`
 *   when the scrubbed text is blank or the distilled markdown is empty.
 */
export async function distillPdf(parsed, cfg) {
    const client = maybeAnthropicClient(cfg);
    const classifyModel = normalizeModelName(cfg.models.classify, cfg.provider);
    const distillModel = normalizeModelName(cfg.models.distill, cfg.provider);
    // Extracted text still flows to a provider — scrub keys/paths/emails.
    const body = scrub(parsed.text).slice(0, MAX_BODY_CHARS);
    if (body.trim().length === 0) {
        return null;
    }
    // One rate-limit-aware LLM round trip; the prompt is built inside the
    // retried closure, exactly as each stage needs it.
    const ask = (buildPrompt, model, maxTokens, stage) => withRateLimitRetry(() => callLLM(cfg, client, {
        prompt: buildPrompt(parsed, body),
        model,
        maxTokens,
        cost: { stage },
    }));
    const classifyReply = await ask(classifyPrompt, classifyModel, 200, "pdf-classify");
    const classification = parsePdfClassification(classifyReply);
    const distillReply = await ask(distillPrompt, distillModel, 1500, "pdf-distill");
    const markdown = distillReply.trim();
    return markdown.length === 0 ? null : { classification, markdown };
}
45
// Builds the classify-stage prompt: fixed instructions asking for a JSON object
// with `category` (paper | reference | notes | other) and `confidence` (0..1),
// followed by the document title and the first CLASSIFY_EXCERPT_CHARS
// characters of `body`.
+ function classifyPrompt(parsed, body) {
46
+ return `Classify this PDF document into exactly one category. Output JSON only:
47
+ { "category": "paper" | "reference" | "notes" | "other",
48
+ "confidence": number (0..1) }
49
+
50
+ paper = a research/academic paper (abstract, methods, results, contributions)
51
+ reference = documentation, a spec, manual, datasheet, or material to look up later
52
+ notes = lecture notes, slides, a course handout, or personal study notes
53
+ other = anything that doesn't fit the above
54
+
55
+ Title: ${parsed.title}
56
+
57
+ ${body.slice(0, CLASSIFY_EXCERPT_CHARS)}`;
58
+ }
59
// Builds the distill-stage prompt: asks for a markdown-only note starting at
// '## Summary' with Summary / Key Points / Methods & Findings / Related
// sections, states the copyright rule (never more than 15 consecutive words
// verbatim), then appends the title, the source path and the whole `body`.
+ function distillPrompt(parsed, body) {
60
+ return `Distill this PDF document into a durable knowledge note. Output markdown
61
+ only — no preamble, start with '## Summary'. Use these sections:
62
+
63
+ - ## Summary (2-3 sentences, in your own words)
64
+ - ## Key Points (bullet list: the main claims, findings, methods, or arguments)
65
+ - ## Methods & Findings (for a paper: the approach and what it concluded; omit
66
+ this section entirely if the document isn't a study)
67
+ - ## Related (plain-English topics this connects to, one per bullet — these get
68
+ turned into wikilinks automatically, so write them as short noun phrases)
69
+
70
+ COPYRIGHT — strict: this is someone else's IP. Never reproduce more than 15
71
+ consecutive words verbatim from the source. Paraphrase everything in your own
72
+ words. Do not reproduce figures, tables, equations, or full passages. Summarize
73
+ and cite by context, never quote at length.
74
+
75
+ Title: ${parsed.title}
76
+ Source: ${parsed.filePath}
77
+ Document:
78
+ ${body}`;
79
+ }
80
/**
 * Parses the classify-stage LLM reply into a `{ category, confidence }` pair.
 *
 * The reply is expected to contain one JSON object, possibly surrounded by
 * prose. Anything that cannot be parsed falls back to
 * `{ category: DEFAULT_PDF_CATEGORY, confidence: 0 }` rather than throwing.
 *
 * @param {string} text Raw model reply. A non-string value yields the fallback.
 * @returns {{category: string, confidence: number}} `category` is always one of
 *   PDF_CATEGORIES (matched case-insensitively, ignoring surrounding
 *   whitespace); `confidence` is clamped to [0, 1] and is 0 when not a finite
 *   number.
 */
export function parsePdfClassification(text) {
    const fallback = { category: DEFAULT_PDF_CATEGORY, confidence: 0 };
    if (typeof text !== "string") {
        return fallback;
    }
    // Greedy span first (covers a reply that is exactly one object); then the
    // lazy span, because trailing prose that contains a brace makes the greedy
    // span unparseable even though a valid flat object is present.
    const candidates = [text.match(/\{[\s\S]*\}/), text.match(/\{[\s\S]*?\}/)];
    let obj;
    for (const match of candidates) {
        if (!match) {
            continue;
        }
        try {
            const parsed = JSON.parse(match[0]);
            if (parsed !== null && typeof parsed === "object") {
                obj = parsed;
                break;
            }
        }
        catch {
            // Not valid JSON — try the next candidate span.
        }
    }
    if (obj === undefined) {
        return fallback;
    }
    // Models frequently answer "Paper" or " paper "; normalize before matching.
    const rawCat = typeof obj.category === "string" ? obj.category.trim().toLowerCase() : "";
    const category = PDF_CATEGORIES.includes(rawCat) ? rawCat : DEFAULT_PDF_CATEGORY;
    // Accept numeric strings ("0.8") as well as numbers.
    const confRaw = typeof obj.confidence === "number"
        ? obj.confidence
        : Number(obj.confidence ?? 0);
    const confidence = Number.isFinite(confRaw)
        ? Math.max(0, Math.min(1, confRaw))
        : 0;
    return { category, confidence };
}
103
/**
 * Note slug for a parsed PDF: at most 60 characters of the kebab-cased title
 * plus the first 8 hex characters of the SHA-256 of the source path, or
 * `pdf-<digest>` when the title kebabs to nothing.
 *
 * Stable across content edits: a re-extracted PDF (new hash/text) keeps the
 * same slug as long as its source path is unchanged, so a re-distill overwrites
 * the same note instead of orphaning the old one (mirrors articleSlug, keyed
 * off the source path rather than the content hash).
 *
 * @param {object} parsed Parsed PDF providing `title` and `filePath`.
 * @returns {string} The slug, without extension.
 */
export function pdfSlug(parsed) {
    const titlePart = kebab(parsed.title).slice(0, 60);
    const pathDigest = createHash("sha256").update(parsed.filePath).digest("hex");
    const suffix = pathDigest.slice(0, 8);
    if (titlePart.length === 0) {
        return `pdf-${suffix}`;
    }
    return `${titlePart}-${suffix}`;
}
115
/**
 * Relative path of the note written for a parsed PDF: PDFS_SUBDIR joined (with
 * the platform separator) to `<slug>.md`.
 *
 * @param {object} parsed Parsed PDF, passed through to pdfSlug().
 * @returns {string} The relative note path.
 */
export function pdfRelPath(parsed) {
    const fileName = `${pdfSlug(parsed)}.md`;
    return join(PDFS_SUBDIR, fileName);
}
118
/**
 * Renders the frontmatter block that heads a distilled PDF note.
 *
 * Emits, in order: type (always `pdf`), category, source_path, source_title
 * (double-quoted, passed through escapeYaml), pages, distilled_at (current time
 * as an ISO string), confidence and hash — wrapped in `---` fences and followed
 * by a trailing newline.
 *
 * @param {object} parsed Parsed PDF providing `filePath`, `title`, `pageCount`
 *   and `hash`.
 * @param {object} distilled Distill result; only `classification.category` and
 *   `classification.confidence` are read.
 * @returns {string} The frontmatter text.
 */
export function buildPdfFrontmatter(parsed, distilled) {
    const { classification } = distilled;
    const fields = [
        ["type", "pdf"],
        ["category", classification.category],
        ["source_path", parsed.filePath],
        ["source_title", `"${escapeYaml(parsed.title)}"`],
        ["pages", parsed.pageCount],
        ["distilled_at", new Date().toISOString()],
        ["confidence", classification.confidence],
        ["hash", parsed.hash],
    ];
    const entries = fields.map(([key, value]) => `${key}: ${value}`);
    return ["---", ...entries, "---", ""].join("\n");
}
130
/**
 * Escapes a string for use inside a YAML double-quoted scalar.
 *
 * Backslashes are doubled first (so a title such as `C:\docs` or one ending in
 * `\` cannot form an invalid escape or swallow the closing quote), then double
 * quotes are escaped, and finally line breaks become the two-character `\n`
 * escape so the value stays on a single frontmatter line.
 *
 * NOTE(review): a reader that only un-escapes `\"` will see the doubled
 * backslashes verbatim — confirm against the frontmatter readers in this
 * project.
 *
 * @param {string} s Raw value.
 * @returns {string} The escaped value, without the surrounding quotes.
 */
function escapeYaml(s) {
    return s
        .replace(/\\/g, "\\\\")
        .replace(/"/g, '\\"')
        // Done last so the backslash introduced here is not doubled above.
        .replace(/\r\n|\r|\n/g, "\\n");
}
133
/**
 * Lower-cases `s` and joins its runs of `a-z0-9` characters with single
 * hyphens; every other character acts as a separator, and no hyphen is left at
 * either end.
 *
 * Local copy of writer.kebab() — writer.ts depends on this module, so importing
 * from it would create a cycle (mirrors articleDistiller.kebab).
 *
 * @param {string} s Text to convert.
 * @returns {string} The kebab-cased text; empty when `s` has no `a-z0-9` characters.
 */
function kebab(s) {
    const words = s
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        // Leading/trailing separators leave empty pieces at the edges.
        .filter((word) => word.length > 0);
    return words.join("-");
}
141
+ //# sourceMappingURL=pdfDistiller.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfDistiller.js","sourceRoot":"","sources":["../../src/pipeline/pdfDistiller.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EACL,OAAO,EACP,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAMtC,MAAM,CAAC,MAAM,cAAc,GAAkB;IAC3C,OAAO;IACP,WAAW;IACX,OAAO;IACP,OAAO;CACR,CAAC;AAEF,4EAA4E;AAC5E,gDAAgD;AAChD,MAAM,oBAAoB,GAAgB,OAAO,CAAC;AAElD,MAAM,CAAC,MAAM,WAAW,GAAG,MAAM,CAAC;AAElC,4EAA4E;AAC5E,yEAAyE;AACzE,6CAA6C;AAC7C,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,cAAc,GAAG,MAAM,CAAC;AAY9B,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAiB,EACjB,GAAW;IAEX,MAAM,MAAM,GAAG,oBAAoB,CAAC,GAAG,CAAC,CAAC;IACzC,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,kBAAkB,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE1E,sEAAsE;IACtE,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,cAAc,CAAC,CAAC;IACzD,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5C,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,cAAc,CAAC,MAAM,EAAE,IAAI,CAAC;QACpC,KAAK,EAAE,aAAa;QACpB,SAAS,EAAE,GAAG;QACd,IAAI,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE;KAChC,CAAC,CACH,CAAC;IACF,MAAM,cAAc,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;IAEvD,MAAM,QAAQ,GAAG,CACf,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5B,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE;QACnB,MAAM,EAAE,aAAa,CAAC,MAAM,EAAE,IAAI,CAAC;QACnC,KAAK,EAAE,YAAY;QACnB,SAAS,EAAE,IAAI;QACf,IAAI,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE;KAC/B,CAAC,CACH,CACF,CAAC,IAAI,EAAE,CAAC;IACT,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC;AACtC,CAAC;AAED,SAAS,cAAc,CAAC,MAAiB,EAAE,IAAY;IACrD,OAAO;;;;;;;;;SASA,MAAM,CAAC,KAAK;;EAEnB,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,sBAAsB,CAAC,EAAE,CAAC;AAC1C,CAAC;AAED,SAAS,aAAa,CAAC,MAAiB,EAAE,IAAY;IACpD,OAAO;;;;;;;;;;;;;;;SAeA,MAAM,CAAC,KAAK;UACX,MAAM,CAAC,QAAQ;;EAEvB,IAAI,EAAE,CAAC;AACT,CAAC;AAED,MA
AM,UAAU,sBAAsB,CAAC,IAAY;IACjD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IACxC,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IACrE,IAAI,GAA4B,CAAC;IACjC,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAA4B,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IACD,MAAM,MAAM,GAAG,OAAO,GAAG,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;IACpE,MAAM,QAAQ,GAAI,cAA2B,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5D,CAAC,CAAE,MAAsB;QACzB,CAAC,CAAC,oBAAoB,CAAC;IACzB,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,UAAU,KAAK,QAAQ;QAChC,CAAC,CAAC,GAAG,CAAC,UAAU;QAChB,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAClC,MAAM,UAAU,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC;QACzC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QACnC,CAAC,CAAC,CAAC,CAAC;IACN,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC;AAClC,CAAC;AAED,iFAAiF;AACjF,+EAA+E;AAC/E,iFAAiF;AACjF,6CAA6C;AAC7C,MAAM,UAAU,OAAO,CAAC,MAAiB;IACvC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC;SAChC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;SACvB,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACf,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC,OAAO,MAAM,EAAE,CAAC;AACjE,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,MAAiB;IAC1C,OAAO,IAAI,CAAC,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACpD,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,MAAiB,EACjB,SAAuB;IAEvB,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,aAAa,SAAS,CAAC,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC7D,KAAK,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC9C,KAAK,CAAC,IAAI,CAAC,kBAAkB,UAAU,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,iBAAiB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,eAAe,SAAS,CAAC,cAAc,CAAC,UAAU,EAAE,CAAC
,CAAC;IACjE,KAAK,CAAC,IAAI,CAAC,SAAS,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,KAAK,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACtB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;AAChC,CAAC;AAED,gFAAgF;AAChF,iEAAiE;AACjE,SAAS,KAAK,CAAC,CAAS;IACtB,OAAO,CAAC;SACL,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;AAC7B,CAAC"}
@@ -0,0 +1,76 @@
1
+ import { createHash } from "node:crypto";
2
+ import { readFileSync, readdirSync, statSync } from "node:fs";
3
+ import { basename, join } from "node:path";
4
+ import { extractText, getDocumentProxy } from "unpdf";
5
/**
 * Reads one PDF from disk and extracts what the distill stage needs.
 *
 * @param {string} filePath Path of the PDF to read.
 * @returns {Promise<{filePath: string, hash: string, title: string, text: string, pageCount: number}>}
 *   `hash` is the SHA-256 hex digest of the file bytes; `title` comes from
 *   pdfTitle() (metadata title, else the file name); `text` is the merged page
 *   text (an array result is joined with newlines); `pageCount` is the total
 *   page count reported by the extractor.
 */
export async function parsePdf(filePath) {
    const bytes = readFileSync(filePath);
    const hash = createHash("sha256").update(bytes).digest("hex");
    // unpdf bundles pdf.js; getDocumentProxy throws on corrupt/non-PDF bytes,
    // which the run layer catches per-file and records (never crashes the run).
    const proxy = await getDocumentProxy(new Uint8Array(bytes));
    const extracted = await extractText(proxy, { mergePages: true });
    const metaTitle = await readMetadataTitle(proxy);
    const title = pdfTitle(metaTitle, filePath);
    const text = Array.isArray(extracted.text)
        ? extracted.text.join("\n")
        : extracted.text;
    return { filePath, hash, title, text, pageCount: extracted.totalPages };
}
21
/**
 * Finds every PDF under `dir` (via walk) and fingerprints it.
 *
 * @param {string} dir Directory to scan.
 * @returns {Array<{filePath: string, hash: string}>} One entry per readable
 *   PDF, in walk order; `hash` is the SHA-256 hex digest of the file bytes.
 *   Files that cannot be read are left out.
 */
export function scanPdfs(dir) {
    const pdfPaths = [];
    walk(dir, pdfPaths);
    return pdfPaths.flatMap((filePath) => {
        try {
            const hash = createHash("sha256").update(readFileSync(filePath)).digest("hex");
            return [{ filePath, hash }];
        }
        catch {
            // unreadable / mid-write file — skip, never fail the whole scan
            return [];
        }
    });
}
36
/**
 * Picks the display title for a PDF. Pure: the metadata Title wins when it is
 * non-blank (returned trimmed); otherwise the file name without its `.pdf`
 * extension (matched case-insensitively) is used.
 *
 * @param {string | null | undefined} metaTitle Title from the PDF metadata, if any.
 * @param {string} filePath Path of the PDF.
 * @returns {string} The chosen title.
 */
export function pdfTitle(metaTitle, filePath) {
    const fromMetadata = (metaTitle ?? "").trim();
    return fromMetadata.length > 0
        ? fromMetadata
        : basename(filePath).replace(/\.pdf$/i, "");
}
43
/**
 * Best-effort read of the `info.Title` field from a document proxy's metadata.
 *
 * @param {object} proxy Object exposing an async `getMetadata()`.
 * @returns {Promise<string | undefined>} The title when it is a string;
 *   `undefined` when it is missing, not a string, or reading metadata throws.
 */
async function readMetadataTitle(proxy) {
    try {
        const { info } = (await proxy.getMetadata()) ?? {};
        const candidate = info?.Title;
        if (typeof candidate === "string") {
            return candidate;
        }
    }
    catch {
        // Metadata is optional — fall through to "no title".
    }
    return undefined;
}
53
/**
 * Recursively appends to `acc` the path of every regular file under `dir`
 * whose name ends in `.pdf` (case-insensitive).
 *
 * Symlinks are followed (statSync resolves them), so linked PDF folders are
 * still scanned. To keep a link that points back at one of its own ancestors
 * from being re-walked over and over — which would report the same PDFs under
 * ever-longer paths — the device/inode identity of every directory on the
 * current descent is tracked and a directory that is already on it is skipped.
 * Unreadable directories and entries that cannot be stat'ed are skipped.
 *
 * @param {string} dir Directory to walk.
 * @param {string[]} acc Output array, mutated in place.
 * @param {Set<string>} [ancestors] Internal: identities of the directories on
 *   the current descent. Callers should omit it.
 * @returns {void}
 */
function walk(dir, acc, ancestors = undefined) {
    let trail = ancestors;
    if (trail === undefined) {
        // Top-level call: seed the trail with the root so a link back to the
        // root itself is recognised as a cycle.
        trail = new Set();
        try {
            const rootKey = directoryKey(statSync(dir, { bigint: true }));
            if (rootKey !== null)
                trail.add(rootKey);
        }
        catch {
            // Leave the trail empty; readdirSync below decides whether to go on.
        }
    }
    let entries;
    try {
        entries = readdirSync(dir);
    }
    catch {
        return;
    }
    for (const name of entries) {
        const full = join(dir, name);
        let st;
        try {
            // bigint stats keep 64-bit inode numbers exact.
            st = statSync(full, { bigint: true });
        }
        catch {
            continue;
        }
        if (st.isDirectory()) {
            const key = directoryKey(st);
            if (key === null) {
                // No usable identity on this filesystem — walk without cycle tracking.
                walk(full, acc, trail);
            }
            else if (!trail.has(key)) {
                trail.add(key);
                walk(full, acc, trail);
                // Only ancestors count: a sibling link to the same target is still walked.
                trail.delete(key);
            }
        }
        else if (st.isFile() && name.toLowerCase().endsWith(".pdf")) {
            acc.push(full);
        }
    }
}
// Identity of a directory as "dev:ino", or null when the inode number is 0
// (treated as "not reported") and therefore cannot tell directories apart.
function directoryKey(st) {
    return st.ino ? `${st.dev}:${st.ino}` : null;
}
76
+ //# sourceMappingURL=pdfReader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfReader.js","sourceRoot":"","sources":["../../src/pipeline/pdfReader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC3C,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,MAAM,OAAO,CAAC;AAwBtD,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,QAAgB;IAC7C,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAE9D,0EAA0E;IAC1E,4EAA4E;IAC5E,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5D,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5E,MAAM,SAAS,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,CAAC;IAEjD,OAAO;QACL,QAAQ;QACR,IAAI;QACJ,KAAK,EAAE,QAAQ,CAAC,SAAS,EAAE,QAAQ,CAAC;QACpC,IAAI,EAAE,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI;QAClD,SAAS,EAAE,UAAU;KACtB,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACjB,MAAM,GAAG,GAAgB,EAAE,CAAC;IAC5B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACxE,GAAG,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;QAAC,MAAM,CAAC;YACP,gEAAgE;QAClE,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,gFAAgF;AAChF,MAAM,UAAU,QAAQ,CAAC,SAA6B,EAAE,QAAgB;IACtE,MAAM,CAAC,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACnC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC3B,OAAO,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACnD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,KAAmD;IAEnD,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,WAAW,EAAE,CAEtC,CAAC;QACF,MAAM,KAAK,GAAG,IAAI,EAAE,IAAI,EAAE,KAAK,CAAC;QAChC,OAAO,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IAC
nB,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,GAAW,EAAE,GAAa;IACtC,IAAI,OAAiB,CAAC;IACtB,IAAI,CAAC;QACH,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;IACT,CAAC;IACD,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QAC7B,IAAI,EAAE,CAAC;QACP,IAAI,CAAC;YACH,EAAE,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QACD,IAAI,EAAE,CAAC,WAAW,EAAE;YAAE,IAAI,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;aACjC,IAAI,EAAE,CAAC,MAAM,EAAE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC"}