@askalf/deepdive 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -1
- package/dist/agent.d.ts +22 -1
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +99 -12
- package/dist/agent.js.map +1 -1
- package/dist/browser.d.ts +4 -0
- package/dist/browser.d.ts.map +1 -1
- package/dist/browser.js +54 -0
- package/dist/browser.js.map +1 -1
- package/dist/cli.d.ts +2 -0
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +49 -1
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +6 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +12 -0
- package/dist/config.js.map +1 -1
- package/dist/doctor.d.ts.map +1 -1
- package/dist/doctor.js +41 -1
- package/dist/doctor.js.map +1 -1
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/local.d.ts +16 -0
- package/dist/local.d.ts.map +1 -0
- package/dist/local.js +160 -0
- package/dist/local.js.map +1 -0
- package/dist/pdf.d.ts +28 -0
- package/dist/pdf.d.ts.map +1 -0
- package/dist/pdf.js +195 -0
- package/dist/pdf.js.map +1 -0
- package/dist/plan.d.ts +4 -3
- package/dist/plan.d.ts.map +1 -1
- package/dist/plan.js +8 -4
- package/dist/plan.js.map +1 -1
- package/dist/pricing.d.ts +29 -0
- package/dist/pricing.d.ts.map +1 -0
- package/dist/pricing.js +138 -0
- package/dist/pricing.js.map +1 -0
- package/dist/synthesize.d.ts +2 -1
- package/dist/synthesize.d.ts.map +1 -1
- package/dist/synthesize.js +9 -5
- package/dist/synthesize.js.map +1 -1
- package/package.json +2 -1
package/dist/local.js
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
// Local file source ingestion — `--include=<path>` injects files and
|
|
2
|
+
// directories as pre-fetched sources alongside whatever the search /
|
|
3
|
+
// browser pipeline finds on the web.
|
|
4
|
+
//
|
|
5
|
+
// Supports:
|
|
6
|
+
// .pdf — extracted via src/pdf.ts (requires pdfjs-dist)
|
|
7
|
+
// .md, .markdown, .txt, .text — read as plain text
|
|
8
|
+
// .html, .htm — stripped of tags, then treated as text
|
|
9
|
+
//
|
|
10
|
+
// A local source's URL is the canonical `file://` URI for its absolute
|
|
11
|
+
// path; the synthesizer treats them like any other source and the
|
|
12
|
+
// citation footer renders a clickable file:// link.
|
|
13
|
+
import { promises as fs } from "node:fs";
|
|
14
|
+
import { resolve, basename, extname, sep } from "node:path";
|
|
15
|
+
import { pathToFileURL } from "node:url";
|
|
16
|
+
import { extractPdfText, PdfExtractorMissingError, } from "./pdf.js";
|
|
17
|
+
// Extension groups (lower-case, with the leading dot) that decide how a
// local file is read: as plain text, as HTML whose tags get stripped, or
// as a PDF handed to the PDF extractor.
const TEXT_EXTS = new Set([
    ".md",
    ".markdown",
    ".txt",
    ".text",
]);
const HTML_EXTS = new Set([
    ".html",
    ".htm",
]);
const PDF_EXTS = new Set([
    ".pdf",
]);
|
|
20
|
+
// Public entry point. Expands every input path (file or directory) with
// expandPaths, extracts the content of each resulting file, and returns
// `{ sources, skipped }`.
//
// - `sources` entries carry a `file://` URL, the file's base name as the
//   title, the ingestion timestamp, and the content capped at
//   `opts.maxWordsPerSource` words.
// - `skipped` entries carry the absolute path and a human-readable reason
//   (unsupported extension, missing pdfjs-dist, or the extraction error).
//
// A failure on a single file is recorded in `skipped`; the rest of the
// batch still ingests.
export async function ingestLocalPaths(inputs, opts) {
    const sources = [];
    const skipped = [];
    const files = await expandPaths(inputs);
    for (const abs of files) {
        const ext = extname(abs).toLowerCase();
        try {
            const content = await readAndExtract(abs, ext, opts);
            if (content === null) {
                // readAndExtract signals "no handler for this extension" with null.
                skipped.push({ path: abs, reason: `unsupported extension: ${ext}` });
                continue;
            }
            sources.push({
                url: pathToFileURL(abs).href,
                title: basename(abs),
                fetchedAt: Date.now(),
                content: clampWords(content, opts.maxWordsPerSource),
            });
        }
        catch (err) {
            // Anything can be thrown in JavaScript, including null/undefined, so
            // read `.message` defensively: a failure inside this handler would
            // abort the whole batch instead of skipping one file.
            const reason = err instanceof PdfExtractorMissingError
                ? "pdfjs-dist not installed"
                : err?.message ?? "extraction failed";
            skipped.push({ path: abs, reason });
        }
    }
    return { sources, skipped };
}
|
|
51
|
+
// Reads one file and turns it into plain text according to its
// (lower-cased) extension:
//   - text extensions: file contents as UTF-8, unchanged
//   - HTML extensions: UTF-8 contents run through stripTags
//   - PDF extensions:  bytes handed to extractPdfText, limited to
//                      `opts.pdfMaxPages` pages
// Resolves to null when the extension belongs to none of those groups.
async function readAndExtract(abs, ext, opts) {
    const isPlainText = TEXT_EXTS.has(ext);
    if (isPlainText || HTML_EXTS.has(ext)) {
        const raw = await fs.readFile(abs, "utf-8");
        return isPlainText ? raw : stripTags(raw);
    }
    if (!PDF_EXTS.has(ext)) {
        return null;
    }
    const bytes = new Uint8Array(await fs.readFile(abs));
    const { text } = await extractPdfText(bytes, { maxPages: opts.pdfMaxPages });
    return text;
}
|
|
67
|
+
// Expands a list of file / directory paths into an ordered, de-duplicated
// list of absolute file paths.
//
// - Each input is resolved and stat'ed literally; glob patterns are not
//   expanded here.
// - A file input is kept as-is, whatever its extension.
// - A directory input is walked one level deep only (no recursion), and
//   only entries with a supported extension are kept, so pointing at a
//   large directory tree does not trigger a thousand-file ingestion.
// - Inputs that cannot be stat'ed are skipped silently; inputs that are
//   neither a regular file nor a directory are ignored.
//
// Exported for unit tests.
export async function expandPaths(inputs) {
    const out = [];
    const seen = new Set();
    // Appends `file` once, preserving first-seen order.
    const add = (file) => {
        if (!seen.has(file)) {
            seen.add(file);
            out.push(file);
        }
    };
    for (const raw of inputs) {
        const abs = resolve(raw);
        let stat;
        try {
            stat = await fs.stat(abs);
        }
        catch {
            continue; // Missing path — skip silently; caller handles via no-results.
        }
        if (stat.isFile()) {
            add(abs);
            continue;
        }
        if (!stat.isDirectory()) {
            continue;
        }
        const entries = await fs.readdir(abs, { withFileTypes: true });
        for (const e of entries) {
            if (!e.isFile())
                continue;
            const ext = extname(e.name).toLowerCase();
            if (!TEXT_EXTS.has(ext) &&
                !HTML_EXTS.has(ext) &&
                !PDF_EXTS.has(ext)) {
                continue;
            }
            // resolve() rather than `abs + sep + name`: the result is always
            // normalized (no "//name" when `abs` is the filesystem root), so
            // the de-dup also matches the same file passed explicitly.
            add(resolve(abs, e.name));
        }
    }
    return out;
}
|
|
112
|
+
// Exported for unit tests. Minimal HTML→text conversion for offline,
// trusted files: removes <script>/<style> elements, comments and all
// remaining tags, decodes a small fixed set of entities, and collapses
// whitespace. The browser's text extraction is far better; this is NOT a
// sanitizer for untrusted input.
export function stripTags(html) {
    // The closing-tag part is `<\/script\b[^>]*>` (likewise for style) so
    // that browser-tolerated forms such as </script foo> or </style xx> are
    // matched too. `[^>]*` has no nested quantifier, so it cannot backtrack
    // polynomially.
    const withoutScripts = html.replace(/<script\b[\s\S]*?<\/script\b[^>]*>/gi, " ");
    const withoutStyles = withoutScripts.replace(/<style\b[\s\S]*?<\/style\b[^>]*>/gi, " ");
    const withoutComments = withoutStyles.replace(/<!--[\s\S]*?-->/g, " ");
    const textOnly = withoutComments.replace(/<[^>]+>/g, " ");
    // Entities are decoded in ONE pass. Chained per-entity replaces would
    // double-unescape "&amp;lt;" into "<"; a single pass keeps it as the
    // literal "&lt;" the author wrote.
    const decoded = textOnly.replace(/&(?:amp|lt|gt|quot|nbsp|#39);/gi, decodeEntity);
    return decoded.replace(/\s+/g, " ").trim();
}
|
|
135
|
+
// Replacement callback for stripTags' entity regex. Receives the full
// matched entity (e.g. "&amp;", case-insensitive) and returns the
// character it stands for. Anything outside the supported set is returned
// unchanged.
function decodeEntity(match) {
    switch (match.toLowerCase()) {
        case "&amp;":
            return "&";
        case "&lt;":
            return "<";
        case "&gt;":
            return ">";
        case "&quot;":
            return '"';
        case "&nbsp;":
            // Decoded to a plain space; stripTags collapses whitespace afterwards.
            return " ";
        case "&#39;":
            return "'";
        default:
            return match;
    }
}
|
|
153
|
+
// Word-cap, mirrors what extract.ts does for web sources.
//
// Returns `text` trimmed when it has at most `maxWords` whitespace-
// separated words. Otherwise returns the first `maxWords` words joined by
// single spaces, followed by " …" to mark the truncation.
//
// A missing or NaN `maxWords` means "no cap". Without this guard the
// `<=` comparison is false for undefined/NaN, and every text would be
// whitespace-collapsed and wrongly marked as truncated.
function clampWords(text, maxWords) {
    if (maxWords === undefined || maxWords === null || Number.isNaN(maxWords)) {
        return text.trim();
    }
    const words = text.split(/\s+/).filter(Boolean);
    if (words.length <= maxWords)
        return text.trim();
    return words.slice(0, maxWords).join(" ") + " …";
}
|
|
160
|
+
//# sourceMappingURL=local.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"local.js","sourceRoot":"","sources":["../src/local.ts"],"names":[],"mappings":"AAAA,qEAAqE;AACrE,qEAAqE;AACrE,qCAAqC;AACrC,EAAE;AACF,YAAY;AACZ,iEAAiE;AACjE,qCAAqC;AACrC,yDAAyD;AACzD,EAAE;AACF,uEAAuE;AACvE,kEAAkE;AAClE,oDAAoD;AAEpD,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,MAAM,WAAW,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EACL,cAAc,EACd,wBAAwB,GAEzB,MAAM,UAAU,CAAC;AAiBlB,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AACjE,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC;AAC7C,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;AAEnC,2EAA2E;AAC3E,qEAAqE;AACrE,8DAA8D;AAC9D,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,MAAgB,EAChB,IAAwB;IAExB,MAAM,OAAO,GAAoC,EAAE,CAAC;IACpD,MAAM,OAAO,GAAuC,EAAE,CAAC;IACvD,MAAM,KAAK,GAAG,MAAM,WAAW,CAAC,MAAM,CAAC,CAAC;IAExC,KAAK,MAAM,GAAG,IAAI,KAAK,EAAE,CAAC;QACxB,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;QACvC,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,CAAC,CAAC;YACrD,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;gBACrB,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,0BAA0B,GAAG,EAAE,EAAE,CAAC,CAAC;gBACrE,SAAS;YACX,CAAC;YACD,OAAO,CAAC,IAAI,CAAC;gBACX,GAAG,EAAE,aAAa,CAAC,GAAG,CAAC,CAAC,IAAI;gBAC5B,KAAK,EAAE,QAAQ,CAAC,GAAG,CAAC;gBACpB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;gBACrB,OAAO,EAAE,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,iBAAiB,CAAC;aACrD,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,MAAM,GACV,GAAG,YAAY,wBAAwB;gBACrC,CAAC,CAAC,0BAA0B;gBAC5B,CAAC,CAAE,GAAa,CAAC,OAAO,IAAI,mBAAmB,CAAC;YACpD,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,GAAW,EACX,GAAW,EACX,IAAwB;IAExB,IAAI,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,MAAM,EAAE,CAAC,QAAQ,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC;IACD,IAAI,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,GAAG,MAAM,EAA
E,CAAC,QAAQ,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAC7C,OAAO,SAAS,CAAC,IAAI,CAAC,CAAC;IACzB,CAAC;IACD,IAAI,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QACnC,MAAM,OAAO,GAAsB,EAAE,QAAQ,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC;QAClE,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,IAAI,UAAU,CAAC,GAAG,CAAC,EAAE,OAAO,CAAC,CAAC;QAClE,OAAO,MAAM,CAAC,IAAI,CAAC;IACrB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,wEAAwE;AACxE,2EAA2E;AAC3E,kEAAkE;AAClE,qEAAqE;AACrE,2BAA2B;AAC3B,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,MAAgB;IAChD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC;QACzB,IAAI,IAAI,CAAC;QACT,IAAI,CAAC;YACH,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5B,CAAC;QAAC,MAAM,CAAC;YACP,SAAS,CAAC,+DAA+D;QAC3E,CAAC;QACD,IAAI,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;YAClB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBACd,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAChB,CAAC;YACD,SAAS;QACX,CAAC;QACD,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;YACvB,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;YAC/D,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;gBACxB,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE;oBAAE,SAAS;gBAC1B,MAAM,GAAG,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;gBAC1C,IACE,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;oBACnB,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;oBACnB,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,EAClB,CAAC;oBACD,SAAS;gBACX,CAAC;gBACD,MAAM,IAAI,GAAG,GAAG,GAAG,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC;gBAChC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACpB,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBACf,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,sEAAsE;AACtE,gEAAgE;AAChE,oEAAoE;AACpE,+BAA+B;AAC/B,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,OAAO,IAAI;QACT,oEAAoE;QACpE,iEAAiE;QACjE,mEAAmE;QACnE,kEAAkE;QAClE,kEAAkE;QAClE,mEAAmE;SAClE,OAAO,CAAC,sCAAsC,EAAE,GAAG,CAAC;SACpD,OAAO,CAAC,oC
AAoC,EAAE,GAAG,CAAC;SAClD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC;SAChC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;QACzB,+DAA+D;QAC/D,gEAAgE;QAChE,sDAAsD;SACrD,OAAO,CAAC,iCAAiC,EAAE,YAAY,CAAC;SACxD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,YAAY,CAAC,KAAa;IACjC,QAAQ,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;QAC5B,KAAK,OAAO;YACV,OAAO,GAAG,CAAC;QACb,KAAK,MAAM;YACT,OAAO,GAAG,CAAC;QACb,KAAK,MAAM;YACT,OAAO,GAAG,CAAC;QACb,KAAK,QAAQ;YACX,OAAO,GAAG,CAAC;QACb,KAAK,QAAQ;YACX,OAAO,GAAG,CAAC;QACb,KAAK,OAAO;YACV,OAAO,GAAG,CAAC;QACb;YACE,OAAO,KAAK,CAAC;IACjB,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,SAAS,UAAU,CAAC,IAAY,EAAE,QAAgB;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAChD,IAAI,KAAK,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;IACjD,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC;AACnD,CAAC"}
|
package/dist/pdf.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Options accepted by `extractPdfText`.
 * `maxPages` caps how many pages are parsed; the implementation defaults
 * it to 50 and never uses a value below 1.
 */
export interface PdfExtractOptions {
|
|
2
|
+
maxPages?: number;
|
|
3
|
+
}
|
|
4
|
+
/**
 * Result of `extractPdfText`.
 * `text` is the extracted text of the parsed pages, `pageCount` the total
 * number of pages in the document, `parsedPages` how many were actually
 * read, and `truncated` is true when `parsedPages` is less than `pageCount`.
 */
export interface PdfExtractResult {
|
|
5
|
+
text: string;
|
|
6
|
+
pageCount: number;
|
|
7
|
+
parsedPages: number;
|
|
8
|
+
truncated: boolean;
|
|
9
|
+
}
|
|
10
|
+
/** Thrown by `extractPdfText` when the optional `pdfjs-dist` module cannot be imported. */
export declare class PdfExtractorMissingError extends Error {
|
|
11
|
+
constructor();
|
|
12
|
+
}
|
|
13
|
+
/** Resolves to true iff the optional `pdfjs-dist` module can be imported. */
export declare function isPdfExtractorAvailable(): Promise<boolean>;
|
|
14
|
+
/** Clears the cached `pdfjs-dist` lookup so the next use re-imports it. Intended for tests. */
export declare function _resetPdfjsCache(): void;
|
|
15
|
+
/**
 * Extracts text from a PDF held in memory.
 * @param bytes Raw PDF bytes; the implementation parses a copy of them.
 * @param opts Optional limits; `maxPages` defaults to 50.
 * @returns The extracted text plus page counts and a truncation flag.
 * @throws PdfExtractorMissingError when `pdfjs-dist` cannot be imported.
 */
export declare function extractPdfText(bytes: Uint8Array, opts?: PdfExtractOptions): Promise<PdfExtractResult>;
|
|
16
|
+
/**
 * The subset of a page text-content item that `joinTextItems` reads:
 * the item's text (`str`) and whether it ends a line (`hasEOL`).
 */
interface PdfTextItem {
|
|
17
|
+
str?: string;
|
|
18
|
+
hasEOL?: boolean;
|
|
19
|
+
}
|
|
20
|
+
/**
 * Joins one page's text items into a string: a newline follows items
 * flagged `hasEOL`, a space follows the others, and whitespace is then
 * normalized.
 */
export declare function joinTextItems(items: PdfTextItem[]): string;
|
|
21
|
+
/**
 * Removes lines that repeat on at least 60% of the pages (running
 * headers / footers). Inputs with fewer than 3 pages are returned unchanged.
 */
export declare function dedupeRunningHeadersFooters(pages: string[]): string[];
|
|
22
|
+
/**
 * Returns true when the content type starts with `application/pdf`, or
 * when `finalUrl` or `url` (ignoring query string and fragment) ends in `.pdf`.
 */
export declare function looksLikePdf(args: {
|
|
23
|
+
url?: string;
|
|
24
|
+
finalUrl?: string;
|
|
25
|
+
contentType?: string;
|
|
26
|
+
}): boolean;
|
|
27
|
+
export {};
|
|
28
|
+
//# sourceMappingURL=pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../src/pdf.ts"],"names":[],"mappings":"AAkBA,MAAM,WAAW,iBAAiB;IAGhC,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,OAAO,CAAC;CACpB;AAED,qBAAa,wBAAyB,SAAQ,KAAK;;CAOlD;AAwBD,wBAAsB,uBAAuB,IAAI,OAAO,CAAC,OAAO,CAAC,CAEhE;AAGD,wBAAgB,gBAAgB,IAAI,IAAI,CAEvC;AAED,wBAAsB,cAAc,CAClC,KAAK,EAAE,UAAU,EACjB,IAAI,GAAE,iBAAsB,GAC3B,OAAO,CAAC,gBAAgB,CAAC,CA+D3B;AAUD,UAAU,WAAW;IACnB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAKD,wBAAgB,aAAa,CAAC,KAAK,EAAE,WAAW,EAAE,GAAG,MAAM,CAc1D;AAKD,wBAAgB,2BAA2B,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAyBrE;AAgCD,wBAAgB,YAAY,CAAC,IAAI,EAAE;IACjC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CASV"}
|
package/dist/pdf.js
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
// PDF text extraction — optional path, lazy-loaded.
|
|
2
|
+
//
|
|
3
|
+
// Real research questions hit PDFs constantly (academic papers, RFCs,
|
|
4
|
+
// standards bodies). Without this module, deepdive returns near-empty
|
|
5
|
+
// content for those URLs because the headless browser only extracts the
|
|
6
|
+
// rendered viewer chrome.
|
|
7
|
+
//
|
|
8
|
+
// Architectural choice: pdfjs-dist is NOT a runtime dependency of
|
|
9
|
+
// deepdive. It is dynamically imported the first time a PDF is fetched,
|
|
10
|
+
// and a clear "install pdfjs-dist to enable PDF support" message is
|
|
11
|
+
// surfaced if it's missing. This keeps the headline "one runtime
|
|
12
|
+
// dependency" accurate for default installs while making the feature
|
|
13
|
+
// available to anyone who wants it via:
|
|
14
|
+
//
|
|
15
|
+
// npm install -g pdfjs-dist
|
|
16
|
+
//
|
|
17
|
+
// `deepdive doctor` reports the install state.
|
|
18
|
+
// Error raised when PDF extraction is requested but pdfjs-dist cannot be
// loaded. The message doubles as the install hint shown to the user.
export class PdfExtractorMissingError extends Error {
    constructor() {
        const installHint = "pdfjs-dist not installed — install with `npm install -g pdfjs-dist` to enable PDF source support";
        super(installHint);
        this.name = "PdfExtractorMissingError";
    }
}
|
|
24
|
+
// Lazy, memoized resolver for pdfjs-dist. `pdfjsCache` is `undefined`
// until the first attempt, then holds either the imported module or
// `null` (import failed), so a multi-PDF run imports at most once.
// Callers are expected to surface a clear message on `null`.
let pdfjsCache = undefined;
async function loadPdfjs() {
    if (pdfjsCache === undefined) {
        let resolved = null;
        try {
            // The legacy build is the only one that runs cleanly in Node without
            // configuring a worker. The specifier stays a string literal so
            // bundlers / tsc don't try to resolve it at compile time.
            resolved = await import(
            /* @vite-ignore */ "pdfjs-dist/legacy/build/pdf.mjs");
        }
        catch {
            // Not importable — `resolved` stays null.
        }
        pdfjsCache = resolved;
    }
    return pdfjsCache;
}
|
|
45
|
+
// Reports whether pdfjs-dist can be imported (shares loadPdfjs's cache).
// Exported for doctor.
export async function isPdfExtractorAvailable() {
    const mod = await loadPdfjs();
    return mod !== null;
}
|
|
49
|
+
// Forgets the memoized pdfjs-dist lookup so the next loadPdfjs() call
// retries the import. Exported for tests; not used in production.
export function _resetPdfjsCache() {
    pdfjsCache = void 0;
}
|
|
53
|
+
// Extracts text from a PDF held in memory.
//
//   bytes — the raw PDF bytes (a copy is parsed; the input is untouched)
//   opts.maxPages — page cap; defaults to 50, floored to an integer, and
//                   never below 1
//
// Resolves to `{ text, pageCount, parsedPages, truncated }`, where `text`
// is the per-page text joined by blank lines after running headers and
// footers have been removed. Throws PdfExtractorMissingError when
// pdfjs-dist cannot be loaded; errors from pdfjs itself propagate.
export async function extractPdfText(bytes, opts = {}) {
    const pdfjs = (await loadPdfjs());
    if (!pdfjs)
        throw new PdfExtractorMissingError();
    // pdfjs-dist's legacy build for Node still wants a worker source path
    // even when `disableWorker: true` is set — it uses it to dynamically
    // import the worker module for the "fake worker" inline path. Resolving
    // the bundled worker file once and assigning it satisfies that
    // requirement without spawning a real Worker thread.
    if (pdfjs.GlobalWorkerOptions && !pdfjs.GlobalWorkerOptions.workerSrc) {
        try {
            const { createRequire } = await import("node:module");
            const req = createRequire(import.meta.url);
            pdfjs.GlobalWorkerOptions.workerSrc = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
        }
        catch {
            // Best-effort — if resolution fails, the getDocument call below
            // will throw a clearer error with the original message.
        }
    }
    // A NaN cap would make the page loop run zero times and a fractional
    // cap would be reported as a fractional `parsedPages`, so normalize.
    const requested = Number(opts.maxPages ?? 50);
    const maxPages = Number.isNaN(requested) ? 50 : Math.max(1, Math.floor(requested));
    // pdfjs-dist mutates the buffer it parses; pass a copy so cache
    // round-trips don't corrupt across uses.
    const buf = bytes.slice();
    const doc = await pdfjs
        .getDocument({
        data: buf,
        disableWorker: true,
        isEvalSupported: false,
        useSystemFonts: false,
    })
        .promise;
    try {
        const pageCount = doc.numPages;
        const parsedPages = Math.min(pageCount, maxPages);
        const pageTexts = [];
        for (let i = 1; i <= parsedPages; i++) {
            const page = await doc.getPage(i);
            const content = await page.getTextContent();
            pageTexts.push(joinTextItems(content.items));
        }
        const text = dedupeRunningHeadersFooters(pageTexts).join("\n\n");
        return {
            text,
            pageCount,
            parsedPages,
            truncated: parsedPages < pageCount,
        };
    }
    finally {
        // Release the document even when a page fails to parse; a failure
        // while destroying is deliberately ignored.
        await doc.destroy().catch(() => undefined);
    }
}
|
|
102
|
+
// Exported for unit tests. Concatenates one page's text-content items:
// each item's text is followed by a newline when the item is flagged
// `hasEOL`, otherwise by a single space. Items with no text and no
// `hasEOL` flag contribute nothing. The result is whitespace-normalized
// by collapseWhitespace.
export function joinTextItems(items) {
    let joined = "";
    for (const item of items) {
        const piece = item.str ?? "";
        const endsLine = item.hasEOL;
        if (!piece && !endsLine) {
            continue;
        }
        joined += piece + (endsLine ? "\n" : " ");
    }
    return collapseWhitespace(joined);
}
|
|
122
|
+
// Exported for unit tests. Removes lines that occur on at least 60% of
// the pages (running headers / footers). Matching is frequency-based on
// the normalized line text, so lines that differ per page (e.g. distinct
// page numbers) are kept. Inputs with fewer than 3 pages, or with no
// repeated line, are returned unchanged.
export function dedupeRunningHeadersFooters(pages) {
    if (pages.length < 3) {
        return pages;
    }
    // Normalized line → number of pages it appears on (once per page).
    const pagesPerLine = new Map();
    for (const page of pages) {
        const distinct = new Set();
        for (const rawLine of page.split(/\n+/)) {
            const key = normalizeForDedup(rawLine);
            if (key) {
                distinct.add(key);
            }
        }
        for (const key of distinct) {
            pagesPerLine.set(key, (pagesPerLine.get(key) ?? 0) + 1);
        }
    }
    const cutoff = Math.ceil(pages.length * 0.6);
    const drop = new Set();
    for (const [key, hits] of pagesPerLine) {
        if (hits >= cutoff) {
            drop.add(key);
        }
    }
    if (drop.size === 0) {
        return pages;
    }
    const keep = (line) => !drop.has(normalizeForDedup(line));
    return pages.map((page) => page.split(/\n+/).filter((line) => keep(line)).join("\n"));
}
|
|
154
|
+
// Canonical form used to compare lines across pages: trimmed, inner
// whitespace runs squeezed to one space, lower-cased. Digits are kept, so
// "Page 5" and "Page 12" stay distinct.
function normalizeForDedup(line) {
    const squeezed = line.trim().replace(/\s+/g, " ");
    return squeezed.toLowerCase();
}
|
|
159
|
+
// Normalizes whitespace line by line: horizontal whitespace runs become
// one space, each line is trimmed, consecutive blank lines collapse to a
// single blank line, and the whole result is trimmed.
function collapseWhitespace(s) {
    // Working per line (split → replace → trim) keeps every step a
    // single linear pass; a pattern such as ` *\n *` applied to the
    // whole string can backtrack polynomially on long runs of spaces.
    const kept = [];
    let previousWasBlank = false;
    for (const rawLine of s.split(/\r?\n/)) {
        const line = rawLine.replace(/[ \t\f\v]+/g, " ").trim();
        const isBlank = line === "";
        if (!(isBlank && previousWasBlank)) {
            kept.push(line);
        }
        previousWasBlank = isBlank;
    }
    return kept.join("\n").trim();
}
|
|
180
|
+
// Heuristic PDF check. True when the content type (case-insensitive)
// starts with "application/pdf", or when `finalUrl` / `url` — lower-cased
// and cut at the first "?" or "#" — ends in ".pdf".
// Exported for unit tests and for browser.ts integration.
export function looksLikePdf(args) {
    const contentType = (args.contentType ?? "").toLowerCase();
    if (contentType.startsWith("application/pdf")) {
        return true;
    }
    return [args.finalUrl, args.url].some((candidate) => {
        if (!candidate) {
            return false;
        }
        const [withoutQueryOrHash] = candidate.toLowerCase().split(/[?#]/);
        return withoutQueryOrHash.endsWith(".pdf");
    });
}
|
|
195
|
+
//# sourceMappingURL=pdf.js.map
|
package/dist/pdf.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../src/pdf.ts"],"names":[],"mappings":"AAAA,oDAAoD;AACpD,EAAE;AACF,sEAAsE;AACtE,sEAAsE;AACtE,wEAAwE;AACxE,0BAA0B;AAC1B,EAAE;AACF,kEAAkE;AAClE,wEAAwE;AACxE,oEAAoE;AACpE,iEAAiE;AACjE,qEAAqE;AACrE,wCAAwC;AACxC,EAAE;AACF,8BAA8B;AAC9B,EAAE;AACF,+CAA+C;AAe/C,MAAM,OAAO,wBAAyB,SAAQ,KAAK;IACjD;QACE,KAAK,CACH,kGAAkG,CACnG,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF;AAED,wEAAwE;AACxE,yEAAyE;AACzE,uEAAuE;AACvE,IAAI,UAAU,GAAY,SAAS,CAAC;AACpC,KAAK,UAAU,SAAS;IACtB,IAAI,UAAU,KAAK,SAAS;QAAE,OAAO,UAAU,CAAC;IAChD,IAAI,CAAC;QACH,qEAAqE;QACrE,qEAAqE;QACrE,0DAA0D;QAC1D,MAAM,GAAG,GAAG,MAAM,MAAM;QACtB,kBAAkB,CAAC,iCAAiC,CACrD,CAAC;QACF,UAAU,GAAG,GAAG,CAAC;QACjB,OAAO,GAAG,CAAC;IACb,CAAC;IAAC,MAAM,CAAC;QACP,UAAU,GAAG,IAAI,CAAC;QAClB,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,MAAM,CAAC,KAAK,UAAU,uBAAuB;IAC3C,OAAO,CAAC,MAAM,SAAS,EAAE,CAAC,KAAK,IAAI,CAAC;AACtC,CAAC;AAED,gEAAgE;AAChE,MAAM,UAAU,gBAAgB;IAC9B,UAAU,GAAG,SAAS,CAAC;AACzB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAiB,EACjB,OAA0B,EAAE;IAE5B,MAAM,KAAK,GAAG,CAAC,MAAM,SAAS,EAAE,CAU3B,CAAC;IACN,IAAI,CAAC,KAAK;QAAE,MAAM,IAAI,wBAAwB,EAAE,CAAC;IAEjD,sEAAsE;IACtE,qEAAqE;IACrE,wEAAwE;IACxE,oEAAoE;IACpE,qDAAqD;IACrD,IAAI,KAAK,CAAC,mBAAmB,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,SAAS,EAAE,CAAC;QACtE,IAAI,CAAC;YACH,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;YACtD,MAAM,GAAG,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC3C,KAAK,CAAC,mBAAmB,CAAC,SAAS,GAAG,GAAG,CAAC,OAAO,CAC/C,wCAAwC,CACzC,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,gEAAgE;YAChE,wDAAwD;QAC1D,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IAElD,gEAAgE;IAChE,yCAAyC;IACzC,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC;IAC1B,MAAM,GAAG,GAAG,MAAM,KAAK;SACpB,WAAW,CAAC;QACX,IAAI,EAAE,GAAG;QACT,aAAa,EAAE,IAAI;QACnB,eAAe,EAAE,KAAK;QACtB,cAAc,EAAE,KAAK;KACtB,CAAC;SACD,OAAO,CAAC;IAEX,MAAM,SAAS,GAAG,GAAG,CAAC,QAAQ,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,QA
AQ,CAAC,CAAC;IAClD,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;QAC5C,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;IAC/C,CAAC;IACD,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,CAAC;IAE3C,MAAM,IAAI,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACjE,OAAO;QACL,IAAI;QACJ,SAAS;QACT,WAAW;QACX,SAAS,EAAE,WAAW,GAAG,SAAS;KACnC,CAAC;AACJ,CAAC;AAeD,kEAAkE;AAClE,wEAAwE;AACxE,0CAA0C;AAC1C,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC;QACzB,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,SAAS;QACjC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACZ,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACZ,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC;IACH,CAAC;IACD,OAAO,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;AAC1C,CAAC;AAED,sEAAsE;AACtE,qEAAqE;AACrE,+BAA+B;AAC/B,MAAM,UAAU,2BAA2B,CAAC,KAAe;IACzD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACnC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;QACrC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YACrC,MAAM,IAAI,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC;YACrC,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,SAAS;YACnC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrB,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACxD,CAAC;IACH,CAAC;IACD,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC;IAC7C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,UAAU,EAAE,CAAC;QACvC,IAAI,KAAK,IAAI,MAAM;YAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACtC,CAAC;IACD,IAAI,IAAI,CAAC
,IAAI,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAClC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CACrB,CAAC;SACE,KAAK,CAAC,KAAK,CAAC;SACZ,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;SACpD,IAAI,CAAC,IAAI,CAAC,CACd,CAAC;AACJ,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY;IACrC,kEAAkE;IAClE,sDAAsD;IACtD,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;AACxD,CAAC;AAED,SAAS,kBAAkB,CAAC,CAAS;IACnC,iEAAiE;IACjE,gEAAgE;IAChE,gEAAgE;IAChE,+DAA+D;IAC/D,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC1C,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CACxC,CAAC;IACF,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,KAAK,EAAE,EAAE,CAAC;YAChB,MAAM,EAAE,CAAC;YACT,IAAI,MAAM,IAAI,CAAC;gBAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,GAAG,CAAC,CAAC;YACX,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjB,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;AAC/B,CAAC;AAED,uEAAuE;AACvE,0DAA0D;AAC1D,MAAM,UAAU,YAAY,CAAC,IAI5B;IACC,MAAM,EAAE,GAAG,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAClD,IAAI,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IAClD,KAAK,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QAC1C,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,MAAM,KAAK,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,IAAI,CAAC;IAC1C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
|
package/dist/plan.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { type LLMConfig } from "./llm.js";
|
|
1
|
+
import { type LLMConfig, type LLMResult } from "./llm.js";
|
|
2
|
+
export type UsageSink = (usage: NonNullable<LLMResult["usage"]>) => void;
|
|
2
3
|
export interface Plan {
|
|
3
4
|
queries: string[];
|
|
4
5
|
reasoning: string;
|
|
@@ -8,8 +9,8 @@ export interface Critique {
|
|
|
8
9
|
reasoning: string;
|
|
9
10
|
queries: string[];
|
|
10
11
|
}
|
|
11
|
-
export declare function planQueries(question: string, config: LLMConfig, signal?: AbortSignal): Promise<Plan>;
|
|
12
|
+
export declare function planQueries(question: string, config: LLMConfig, signal?: AbortSignal, onUsage?: UsageSink): Promise<Plan>;
|
|
12
13
|
export declare function parsePlan(raw: string): Plan;
|
|
13
|
-
export declare function critique(question: string, draftAnswer: string, priorQueries: string[], config: LLMConfig, signal?: AbortSignal): Promise<Critique>;
|
|
14
|
+
export declare function critique(question: string, draftAnswer: string, priorQueries: string[], config: LLMConfig, signal?: AbortSignal, onUsage?: UsageSink): Promise<Critique>;
|
|
14
15
|
export declare function parseCritique(raw: string): Critique;
|
|
15
16
|
//# sourceMappingURL=plan.d.ts.map
|
package/dist/plan.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"plan.d.ts","sourceRoot":"","sources":["../src/plan.ts"],"names":[],"mappings":"AAOA,OAAO,EAAW,KAAK,SAAS,EAAE,MAAM,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"plan.d.ts","sourceRoot":"","sources":["../src/plan.ts"],"names":[],"mappings":"AAOA,OAAO,EAAW,KAAK,SAAS,EAAE,KAAK,SAAS,EAAE,MAAM,UAAU,CAAC;AAEnE,MAAM,MAAM,SAAS,GAAG,CAAC,KAAK,EAAE,WAAW,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,CAAC;AAEzE,MAAM,WAAW,IAAI;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,OAAO,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAaD,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,SAAS,EACjB,MAAM,CAAC,EAAE,WAAW,EACpB,OAAO,CAAC,EAAE,SAAS,GAClB,OAAO,CAAC,IAAI,CAAC,CASf;AAGD,wBAAgB,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,CAiB3C;AAqBD,wBAAsB,QAAQ,CAC5B,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,MAAM,EACnB,YAAY,EAAE,MAAM,EAAE,EACtB,MAAM,EAAE,SAAS,EACjB,MAAM,CAAC,EAAE,WAAW,EACpB,OAAO,CAAC,EAAE,SAAS,GAClB,OAAO,CAAC,QAAQ,CAAC,CAenB;AAGD,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,QAAQ,CAsBnD"}
|
package/dist/plan.js
CHANGED
|
@@ -15,8 +15,10 @@ Rules:
|
|
|
15
15
|
|
|
16
16
|
Output FORMAT (strict): one JSON object, no prose before or after, matching:
|
|
17
17
|
{"reasoning": "<1-2 sentences on your decomposition>", "queries": ["q1", "q2", ...]}`;
|
|
18
|
-
export async function planQueries(question, config, signal) {
|
|
19
|
-
const { text } = await callLLM([{ role: "user", content: question }], PLANNER_SYSTEM, config, signal);
|
|
18
|
+
export async function planQueries(question, config, signal, onUsage) {
|
|
19
|
+
const { text, usage } = await callLLM([{ role: "user", content: question }], PLANNER_SYSTEM, config, signal);
|
|
20
|
+
if (usage && onUsage)
|
|
21
|
+
onUsage(usage);
|
|
20
22
|
return parsePlan(text);
|
|
21
23
|
}
|
|
22
24
|
// Exported for unit tests.
|
|
@@ -58,13 +60,15 @@ Rules:
|
|
|
58
60
|
|
|
59
61
|
Output FORMAT (strict): one JSON object, no prose before or after, matching:
|
|
60
62
|
{"done": bool, "reasoning": "<1-2 sentences>", "queries": ["q1", "q2", ...]}`;
|
|
61
|
-
export async function critique(question, draftAnswer, priorQueries, config, signal) {
|
|
63
|
+
export async function critique(question, draftAnswer, priorQueries, config, signal, onUsage) {
|
|
62
64
|
const userMessage = `Question: ${question}\n\n` +
|
|
63
65
|
`Draft answer:\n${draftAnswer}\n\n` +
|
|
64
66
|
`Queries already run (${priorQueries.length}):\n` +
|
|
65
67
|
priorQueries.map((q) => `- ${q}`).join("\n") +
|
|
66
68
|
`\n\nReview the draft and propose follow-up queries if needed.`;
|
|
67
|
-
const { text } = await callLLM([{ role: "user", content: userMessage }], CRITIC_SYSTEM, config, signal);
|
|
69
|
+
const { text, usage } = await callLLM([{ role: "user", content: userMessage }], CRITIC_SYSTEM, config, signal);
|
|
70
|
+
if (usage && onUsage)
|
|
71
|
+
onUsage(usage);
|
|
68
72
|
return parseCritique(text);
|
|
69
73
|
}
|
|
70
74
|
// Exported for unit tests.
|
package/dist/plan.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"plan.js","sourceRoot":"","sources":["../src/plan.ts"],"names":[],"mappings":"AAAA,2EAA2E;AAC3E,4DAA4D;AAC5D,EAAE;AACF,4EAA4E;AAC5E,2EAA2E;AAC3E,0DAA0D;AAE1D,OAAO,EAAE,OAAO,
|
|
1
|
+
{"version":3,"file":"plan.js","sourceRoot":"","sources":["../src/plan.ts"],"names":[],"mappings":"AAAA,2EAA2E;AAC3E,4DAA4D;AAC5D,EAAE;AACF,4EAA4E;AAC5E,2EAA2E;AAC3E,0DAA0D;AAE1D,OAAO,EAAE,OAAO,EAAkC,MAAM,UAAU,CAAC;AAenE,MAAM,cAAc,GAAG;;;;;;;;;qFAS8D,CAAC;AAEtF,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,MAAiB,EACjB,MAAoB,EACpB,OAAmB;IAEnB,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,OAAO,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,EACrC,cAAc,EACd,MAAM,EACN,MAAM,CACP,CAAC;IACF,IAAI,KAAK,IAAI,OAAO;QAAE,OAAO,CAAC,KAAK,CAAC,CAAC;IACrC,OAAO,SAAS,CAAC,IAAI,CAAC,CAAC;AACzB,CAAC;AAED,2BAA2B;AAC3B,MAAM,UAAU,SAAS,CAAC,GAAW;IACnC,MAAM,IAAI,GAAG,sBAAsB,CAAC,GAAG,CAAC,CAAC;IACzC,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IAChF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAA8C,CAAC;IAC7E,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClE,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,CAAC;IACD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO;SAC3B,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;SACjD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;SAC3B,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACf,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC5E,OAAO;QACL,OAAO;QACP,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;KACxE,CAAC;AACJ,CAAC;AAED,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;6EAiBuD,CAAC;AAE9E,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,QAAgB,EAChB,WAAmB,EACnB,YAAsB,EACtB,MAAiB,EACjB,MAAoB,EACpB,OAAmB;IAEnB,MAAM,WAAW,GACf,aAAa,QAAQ,MAAM;QAC3B,kBAAkB,WAAW,MAAM;QACnC,wBAAwB,YAAY,CAAC,MAAM,MAAM;QACjD,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;QAC5C,+DAA+D,CAAC;IAClE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,OAAO,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WA
AW,EAAE,CAAC,EACxC,aAAa,EACb,MAAM,EACN,MAAM,CACP,CAAC;IACF,IAAI,KAAK,IAAI,OAAO;QAAE,OAAO,CAAC,KAAK,CAAC,CAAC;IACrC,OAAO,aAAa,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED,2BAA2B;AAC3B,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,MAAM,IAAI,GAAG,sBAAsB,CAAC,GAAG,CAAC,CAAC;IACzC,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAI7B,CAAC;IACF,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC;QAC3C,CAAC,CAAC,MAAM,CAAC,OAAO;aACX,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;aACjD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;QAChB,CAAC,CAAC,EAAE,CAAC;IACP,MAAM,IAAI,GACR,OAAO,MAAM,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC;IACxE,OAAO;QACL,IAAI;QACJ,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;QACvE,OAAO;KACR,CAAC;AACJ,CAAC;AAED,SAAS,sBAAsB,CAAC,CAAS;IACvC,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAC7B,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAC9B,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,QAAQ,GAAG,KAAK,CAAC;IACrB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACf,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,GAAG,KAAK,CAAC;YACf,SAAS;QACX,CAAC;QACD,IAAI,CAAC,KAAK,IAAI,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,GAAG,IAAI,CAAC;YACd,SAAS;QACX,CAAC;QACD,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;YACd,QAAQ,GAAG,CAAC,QAAQ,CAAC;YACrB,SAAS;QACX,CAAC;QACD,IAAI,QAAQ;YAAE,SAAS;QACvB,IAAI,CAAC,KAAK,GAAG;YAAE,KAAK,EAAE,CAAC;aAClB,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;YACnB,KAAK,EAAE,CAAC;YACR,IAAI,KAAK,KAAK,CAAC;gBAAE,OAAO,CAAC,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
export interface ModelPrice {
|
|
2
|
+
inputPerMTok: number;
|
|
3
|
+
outputPerMTok: number;
|
|
4
|
+
}
|
|
5
|
+
export interface TokenUsage {
|
|
6
|
+
inputTokens: number;
|
|
7
|
+
outputTokens: number;
|
|
8
|
+
}
|
|
9
|
+
export interface CostEstimate {
|
|
10
|
+
amountUsd: number;
|
|
11
|
+
knownModel: boolean;
|
|
12
|
+
inputTokens: number;
|
|
13
|
+
outputTokens: number;
|
|
14
|
+
calls: number;
|
|
15
|
+
}
|
|
16
|
+
export declare const PRICE_TABLE: Record<string, ModelPrice>;
|
|
17
|
+
export declare const PRICE_TABLE_VERIFIED_AT = "2026-05-05";
|
|
18
|
+
export declare const PRICE_TABLE_STALE_AFTER_DAYS = 90;
|
|
19
|
+
export declare const DARIO_DEFAULT_BASE_URL = "http://localhost:3456";
|
|
20
|
+
export declare function priceFor(model: string, env?: Record<string, string | undefined>): ModelPrice | undefined;
|
|
21
|
+
export declare function estimateCost(usage: TokenUsage & {
|
|
22
|
+
calls: number;
|
|
23
|
+
}, model: string, env?: Record<string, string | undefined>): CostEstimate;
|
|
24
|
+
export declare function formatCostLine(estimate: CostEstimate, model: string): string;
|
|
25
|
+
export declare function looksLikeDario(baseUrl: string): boolean;
|
|
26
|
+
export declare function formatUsd(amount: number): string;
|
|
27
|
+
export declare function formatTokens(n: number): string;
|
|
28
|
+
export declare function daysAgo(isoDate: string, now?: number): number;
|
|
29
|
+
//# sourceMappingURL=pricing.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pricing.d.ts","sourceRoot":"","sources":["../src/pricing.ts"],"names":[],"mappings":"AAaA,MAAM,WAAW,UAAU;IACzB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,UAAU;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,OAAO,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,KAAK,EAAE,MAAM,CAAC;CACf;AASD,eAAO,MAAM,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAIlD,CAAC;AAEF,eAAO,MAAM,uBAAuB,eAAe,CAAC;AAIpD,eAAO,MAAM,4BAA4B,KAAK,CAAC;AAK/C,eAAO,MAAM,sBAAsB,0BAA0B,CAAC;AAK9D,wBAAgB,QAAQ,CACtB,KAAK,EAAE,MAAM,EACb,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,SAAS,CAAC,GACvC,UAAU,GAAG,SAAS,CAUxB;AAED,wBAAgB,YAAY,CAC1B,KAAK,EAAE,UAAU,GAAG;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,EACrC,KAAK,EAAE,MAAM,EACb,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,SAAS,CAAC,GACvC,YAAY,CAqBd;AAOD,wBAAgB,cAAc,CAAC,QAAQ,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CAQ5E;AAID,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAEvD;AASD,wBAAgB,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAKhD;AAGD,wBAAgB,YAAY,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAK9C;AAID,wBAAgB,OAAO,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,GAAE,MAAmB,GAAG,MAAM,CAMzE"}
|