lobster-cli 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/agent/core.js +63 -0
- package/dist/agent/core.js.map +1 -1
- package/dist/agent/index.js +63 -0
- package/dist/agent/index.js.map +1 -1
- package/dist/browser/index.js +294 -0
- package/dist/browser/index.js.map +1 -1
- package/dist/doc/index.js +31715 -0
- package/dist/doc/index.js.map +1 -0
- package/dist/index.js +32070 -221
- package/dist/index.js.map +1 -1
- package/dist/lib.js +352 -2
- package/dist/lib.js.map +1 -1
- package/dist/llm/client.js +63 -0
- package/dist/llm/client.js.map +1 -1
- package/dist/llm/index.js +63 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/openai-client.js +63 -0
- package/dist/llm/openai-client.js.map +1 -1
- package/dist/router/index.js +63 -0
- package/dist/router/index.js.map +1 -1
- package/package.json +15 -2
package/dist/lib.js
CHANGED
|
@@ -1,3 +1,211 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
// Bundler-generated runtime helpers (lazy module init + export tables).
// ---------------------------------------------------------------------------
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

/**
 * Wraps a lazily-initialised module.
 *
 * `fn` is an object whose first own property is the module initialiser. The
 * returned `__init` runs that initialiser on the first call only and returns
 * the cached result on every later call.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initialiser = fn[__getOwnPropNames(fn)[0]];
    // Drop the reference before invoking so the initialiser never runs twice,
    // even if it throws or re-enters __init.
    fn = 0;
    res = initialiser(0);
  }
  return res;
};

/**
 * Defines every key of `all` on `target` as an enumerable getter, so each
 * export is resolved at access time rather than at registration time.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    __defProp(target, exportName, { get: all[exportName], enumerable: true });
  }
};

// src/browser/pdf.ts
var pdf_exports = {};
__export(pdf_exports, {
  extractPdf: () => extractPdf,
  isPdfResponse: () => isPdfResponse,
  isPdfUrl: () => isPdfUrl,
  tryExtractPdf: () => tryExtractPdf
});
|
|
19
|
+
import { readFileSync as readFileSync3 } from "fs";
|
|
20
|
+
/**
 * Lazily loads the "pdf-parse" module and memoises a parser function in the
 * module-level `pdfParseFn`.
 *
 * When the module exposes a `PDFParse` function/class, the memoised adapter
 * constructs it with the buffer and returns `parse()`'s result if such a
 * method exists, otherwise the instance itself. Without `PDFParse`, the
 * module's default export (or the module namespace) is used directly.
 *
 * NOTE(review): whether `PDFParse` accepts a raw buffer and exposes `parse()`
 * depends on the installed pdf-parse version — confirm against the dependency.
 *
 * @returns {Promise<Function>} the memoised parser.
 */
async function getPdfParser() {
  if (pdfParseFn) {
    return pdfParseFn;
  }

  const pdfModule = await import("pdf-parse");
  const ParserCtor = pdfModule.PDFParse;

  if (typeof ParserCtor === "function") {
    pdfParseFn = async (buffer) => {
      const instance = new ParserCtor(buffer);
      if (instance.parse) {
        return await instance.parse();
      }
      return instance;
    };
  } else {
    pdfParseFn = pdfModule.default || pdfModule;
  }

  return pdfParseFn;
}
|
|
35
|
+
/**
 * Heuristically decides whether a URL or file path points at a PDF.
 *
 * Matches (case-insensitively):
 *   - anything ending in ".pdf", including when a query string or fragment
 *     follows the path ("report.pdf?download=1", "paper.pdf#page=3");
 *   - any "/pdf/" path segment (this also covers "arxiv.org/pdf/...");
 *   - a "format=pdf" or "type=pdf" query parameter.
 *
 * @param {string} urlOrPath - URL or local path to inspect.
 * @returns {boolean} true when the input looks like a PDF location; false for
 *   non-string input.
 */
function isPdfUrl(urlOrPath) {
  if (typeof urlOrPath !== "string") {
    return false;
  }
  const lower = urlOrPath.toLowerCase();
  // Path portion only, so a trailing "?query" or "#fragment" cannot hide the
  // ".pdf" extension.
  const pathOnly = lower.split(/[?#]/, 1)[0];
  return (
    lower.endsWith(".pdf") ||
    pathOnly.endsWith(".pdf") ||
    lower.includes("/pdf/") ||
    /[?&](?:format|type)=pdf/.test(lower)
  );
}
|
|
44
|
+
/**
 * Tells whether a Content-Type header value denotes a PDF.
 *
 * Media types are case-insensitive, so the comparison is done on the
 * lower-cased value; parameters such as "; charset=binary" are tolerated.
 *
 * @param {string|null|undefined} contentType - raw Content-Type header value.
 * @returns {boolean} true when the value contains "application/pdf"; false
 *   for a missing or non-string value.
 */
function isPdfResponse(contentType) {
  return (
    typeof contentType === "string" &&
    contentType.toLowerCase().includes("application/pdf")
  );
}
|
|
47
|
+
/**
 * Downloads a remote document and returns its bytes.
 *
 * Sends a browser-like User-Agent and an Accept header preferring PDF, and
 * follows redirects. The response Content-Type is intentionally not enforced
 * here: the previous check had an empty body (dead code), and the caller
 * validates the PDF magic bytes on the returned buffer.
 *
 * @param {string} url - absolute http(s) URL.
 * @returns {Promise<Buffer>} the response body.
 * @throws {Error} when the response status is not 2xx.
 */
async function downloadPdf(url) {
  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "application/pdf,*/*"
    },
    redirect: "follow"
  });
  if (!response.ok) {
    throw new Error(`Failed to download PDF: ${response.status} ${response.statusText}`);
  }
  return Buffer.from(await response.arrayBuffer());
}
|
|
64
|
+
/**
 * Reads a local PDF synchronously.
 *
 * Accepts a plain filesystem path or a "file://" URL string; the latter is
 * converted with `fileURLToPath` (already imported from "url" by this bundle)
 * because `readFileSync` does not understand URL strings.
 *
 * @param {string} filePath - filesystem path or file:// URL.
 * @returns {Buffer} the raw file contents.
 * @throws {Error} propagated from `readFileSync` (e.g. ENOENT) or from
 *   `fileURLToPath` for a malformed file URL.
 */
function readLocalPdf(filePath) {
  const isFileUrl = typeof filePath === "string" && /^file:\/\//i.test(filePath);
  return readFileSync3(isFileUrl ? fileURLToPath(filePath) : filePath);
}
|
|
67
|
+
/**
 * Converts plain text extracted from a PDF into lightweight Markdown.
 *
 * Line-by-line heuristics, applied in this order:
 *   1. numbered headings ("2.1 Related Work")        -> "##" / "###" / "####"
 *   2. short ALL-CAPS lines starting with a letter   -> "## Sentence case"
 *   3. well-known section names (Abstract, ...)      -> "## <line>"
 *   4. bullet glyphs (-, •, ∙, ◦, ▪)                 -> "- item"
 *   5. "(1)" / "a)" enumerations                     -> "- <line>"
 *   6. numbered entries after References/Bibliography -> "- <line>"
 *   7. anything else is accumulated into a paragraph.
 *
 * A paragraph line ending in "-" (and not the last line of the text) is
 * treated as a hyphenated line break: the hyphen is dropped and the next
 * paragraph line is appended directly, so "exam-\nple" becomes "example"
 * rather than "exam ple".
 *
 * @param {string} text - raw extracted text; lines are split on "\n".
 * @param {{title?: string, author?: string}} [metadata] - optional document
 *   metadata; a title other than "untitled" produces a header block.
 * @returns {string} the Markdown document, trimmed, with runs of three or
 *   more newlines collapsed to two.
 */
function textToMarkdown(text, metadata) {
  const meta = metadata ?? {};
  const lines = String(text ?? "").split("\n");
  const mdLines = [];

  if (meta.title && meta.title !== "untitled") {
    mdLines.push(`# ${meta.title}`, "");
    if (meta.author) {
      mdLines.push(`**Authors:** ${meta.author}`, "");
    }
    mdLines.push("---", "");
  }

  // Patterns are compiled once here instead of once per input line.
  const numberedHeadingRe = /^(\d+\.?\d*\.?\d*)\s+([A-Z][A-Za-z\s:&-]+)$/;
  // Must start with a letter so dash rules ("----") and all-caps bullets
  // ("- ITEM") are not mistaken for headings.
  const upperHeadingRe = /^[A-Z][A-Z\s:&-]+$/;
  const knownHeadingRe = /^(Abstract|Introduction|Conclusion|Discussion|Results|Methods|Methodology|Background|Related Work|Acknowledgments|Acknowledgements|References|Bibliography|Appendix)/i;
  const referencesRe = /^(References|Bibliography)/i;
  const bulletRe = /^[-•∙◦▪]/;
  const enumeratedRe = /^\(\d+\)|^[a-z]\)/;
  const citationRe = /^\[?\d+\]?\.?\s/;

  let inReferences = false;
  let prevWasBlank = false;
  let paragraphBuffer = [];
  // True when the last buffered fragment ended with a line-break hyphen and
  // the next paragraph line must be glued onto it without a space.
  let joinToPrevious = false;

  const flushParagraph = () => {
    if (paragraphBuffer.length > 0) {
      mdLines.push(paragraphBuffer.join(" "), "");
      paragraphBuffer = [];
    }
    joinToPrevious = false;
  };

  // Ends the current paragraph, then emits the given output lines verbatim.
  const emitBlock = (...blockLines) => {
    flushParagraph();
    mdLines.push(...blockLines);
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();

    if (!line) {
      if (!prevWasBlank) {
        flushParagraph();
      }
      prevWasBlank = true;
      continue;
    }
    prevWasBlank = false;

    const numberedHeading = numberedHeadingRe.exec(line);
    if (numberedHeading && line.length < 80) {
      const depth = numberedHeading[1].split(".").filter(Boolean).length;
      const prefix = depth <= 1 ? "##" : depth === 2 ? "###" : "####";
      emitBlock(`${prefix} ${line}`, "");
      continue;
    }

    if (line.length > 3 && line.length < 60 && upperHeadingRe.test(line)) {
      emitBlock(`## ${line.charAt(0) + line.slice(1).toLowerCase()}`, "");
      continue;
    }

    if (knownHeadingRe.test(line) && line.length < 40) {
      if (referencesRe.test(line)) {
        inReferences = true;
      }
      emitBlock(`## ${line}`, "");
      continue;
    }

    if (bulletRe.test(line)) {
      emitBlock(`- ${line.replace(/^[-•∙◦▪]\s*/, "")}`);
      continue;
    }

    if (enumeratedRe.test(line) || (inReferences && citationRe.test(line))) {
      emitBlock(`- ${line}`);
      continue;
    }

    // Regular paragraph text. The last line of the input keeps its hyphen
    // because there is nothing left to join it with.
    const hyphenated = line.endsWith("-") && i + 1 < lines.length;
    const fragment = hyphenated ? line.slice(0, -1) : line;
    if (joinToPrevious && paragraphBuffer.length > 0) {
      paragraphBuffer[paragraphBuffer.length - 1] += fragment;
    } else {
      paragraphBuffer.push(fragment);
    }
    joinToPrevious = hyphenated;
  }

  flushParagraph();
  return mdLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
149
|
+
/**
 * Parses a PDF buffer and assembles the extraction result.
 *
 * Uses the parser from `getPdfParser()`, reads `info`, `numpages` and `text`
 * from its result, and renders Markdown via `textToMarkdown`. The text is
 * split on form feeds; those chunks are used as pages only when their count
 * equals `numpages`, otherwise the whole text is returned as a single page.
 *
 * @param {Buffer} buffer - raw PDF bytes.
 * @returns {Promise<{metadata: object, text: string, markdown: string,
 *   pages: string[], wordCount: number, charCount: number}>}
 */
async function extractFromBuffer(buffer) {
  const parsePdf = await getPdfParser();
  const parsed = await parsePdf(buffer);

  const docInfo = parsed.info || {};
  const metadata = {
    title: docInfo.Title || "untitled",
    author: docInfo.Author || "",
    pages: parsed.numpages,
    creator: docInfo.Creator || "",
    producer: docInfo.Producer || "",
    creationDate: docInfo.CreationDate || ""
  };

  const text = parsed.text || "";
  const formFeedChunks = text.split(/\f/).filter(Boolean);
  const pageList = formFeedChunks.length === parsed.numpages ? formFeedChunks : [text];

  return {
    metadata,
    text,
    markdown: textToMarkdown(text, metadata),
    pages: pageList.map((pageText) => pageText.trim()),
    wordCount: text.split(/\s+/).filter(Boolean).length,
    charCount: text.length
  };
}
|
|
175
|
+
/**
 * Loads a PDF from an http(s) URL or a local path and extracts its content.
 *
 * The scheme test is case-insensitive, so "HTTPS://…" is downloaded rather
 * than treated as a file path. The "%PDF" signature is searched for within
 * the first 1024 bytes instead of only at offset 0, because real-world files
 * may carry a few leading bytes (BOM, newlines) before the header.
 *
 * @param {string} urlOrPath - http(s) URL or local file path.
 * @returns {Promise<object>} the result of `extractFromBuffer`.
 * @throws {Error} "Not a valid PDF file (invalid magic bytes)" when no PDF
 *   header is found; download and read errors propagate unchanged.
 */
async function extractPdf(urlOrPath) {
  const isRemote = /^https?:\/\//i.test(urlOrPath);
  const buffer = isRemote ? await downloadPdf(urlOrPath) : readLocalPdf(urlOrPath);

  // 0x25 0x50 0x44 0x46 == "%PDF"
  const headerWindow = buffer.subarray(0, 1024);
  if (headerWindow.indexOf("%PDF", 0, "latin1") === -1) {
    throw new Error("Not a valid PDF file (invalid magic bytes)");
  }
  return extractFromBuffer(buffer);
}
|
|
187
|
+
/**
 * Extracts a PDF when the URL looks like one, otherwise probes the server.
 *
 * URLs accepted by `isPdfUrl` go straight to `extractPdf`. For anything else
 * a HEAD request is issued and extraction runs only when the reported
 * Content-Type passes `isPdfResponse`. A failing probe is treated as "not a
 * PDF"; errors from `extractPdf` itself are not caught here and reject the
 * returned promise.
 *
 * @param {string} url - URL to inspect.
 * @returns {Promise<object|null>} the extraction result, or null when the
 *   resource is not identified as a PDF.
 */
async function tryExtractPdf(url) {
  if (isPdfUrl(url)) {
    return extractPdf(url);
  }

  let servesPdf = false;
  try {
    const probe = await fetch(url, { method: "HEAD", redirect: "follow" });
    servesPdf = isPdfResponse(probe.headers.get("content-type") || "");
  } catch {
    // Best effort: an unreachable host or rejected HEAD just means "no PDF".
  }

  return servesPdf ? extractPdf(url) : null;
}
|
|
201
|
+
// Memoised parser function; assigned on demand by getPdfParser().
var pdfParseFn;

// Lazy initialiser for the src/browser/pdf.ts module: clears the parser cache
// the first time the module is initialised.
var init_pdf = __esm({
  "src/browser/pdf.ts": function () {
    "use strict";
    pdfParseFn = null;
  }
});
|
|
208
|
+
|
|
1
209
|
// src/brain/index.ts
|
|
2
210
|
var CLASSIFIER_PROMPT = `You are an intent classifier for a web automation tool. Given a user's question about a webpage, decide what data sources are needed to answer it.
|
|
3
211
|
|
|
@@ -263,6 +471,69 @@ var OpenAIClient = class {
|
|
|
263
471
|
} : void 0
|
|
264
472
|
};
|
|
265
473
|
}
|
|
474
|
+
/**
|
|
475
|
+
* Simple vision call — send a screenshot + text prompt, get text back.
|
|
476
|
+
* Used by PDF Doctor for targeted issue resolution.
|
|
477
|
+
*/
|
|
478
|
+
async chatWithVision(prompt, screenshotBase64) {
|
|
479
|
+
const headers = this.buildHeaders();
|
|
480
|
+
if (this.config.provider === "anthropic") {
|
|
481
|
+
const body2 = {
|
|
482
|
+
model: this.config.model,
|
|
483
|
+
max_tokens: 1024,
|
|
484
|
+
temperature: 0.1,
|
|
485
|
+
messages: [{
|
|
486
|
+
role: "user",
|
|
487
|
+
content: [
|
|
488
|
+
{
|
|
489
|
+
type: "image",
|
|
490
|
+
source: {
|
|
491
|
+
type: "base64",
|
|
492
|
+
media_type: "image/jpeg",
|
|
493
|
+
data: screenshotBase64
|
|
494
|
+
}
|
|
495
|
+
},
|
|
496
|
+
{ type: "text", text: prompt }
|
|
497
|
+
]
|
|
498
|
+
}]
|
|
499
|
+
};
|
|
500
|
+
const resp2 = await fetch(`${this.config.baseURL}/messages`, {
|
|
501
|
+
method: "POST",
|
|
502
|
+
headers,
|
|
503
|
+
body: JSON.stringify(body2)
|
|
504
|
+
});
|
|
505
|
+
if (!resp2.ok) throw new Error(`Anthropic vision error: ${resp2.status}`);
|
|
506
|
+
const json2 = await resp2.json();
|
|
507
|
+
const content = json2.content;
|
|
508
|
+
return content?.[0]?.text || "";
|
|
509
|
+
}
|
|
510
|
+
const body = {
|
|
511
|
+
model: this.config.model,
|
|
512
|
+
max_tokens: 1024,
|
|
513
|
+
temperature: 0.1,
|
|
514
|
+
messages: [{
|
|
515
|
+
role: "user",
|
|
516
|
+
content: [
|
|
517
|
+
{
|
|
518
|
+
type: "image_url",
|
|
519
|
+
image_url: {
|
|
520
|
+
url: `data:image/jpeg;base64,${screenshotBase64}`
|
|
521
|
+
}
|
|
522
|
+
},
|
|
523
|
+
{ type: "text", text: prompt }
|
|
524
|
+
]
|
|
525
|
+
}]
|
|
526
|
+
};
|
|
527
|
+
const resp = await fetch(`${this.config.baseURL}/chat/completions`, {
|
|
528
|
+
method: "POST",
|
|
529
|
+
headers,
|
|
530
|
+
body: JSON.stringify(body)
|
|
531
|
+
});
|
|
532
|
+
if (!resp.ok) throw new Error(`Vision API error: ${resp.status}`);
|
|
533
|
+
const json = await resp.json();
|
|
534
|
+
const choice = json.choices?.[0];
|
|
535
|
+
return choice?.message?.content || "";
|
|
536
|
+
}
|
|
266
537
|
};
|
|
267
538
|
|
|
268
539
|
// src/llm/utils.ts
|
|
@@ -3274,6 +3545,43 @@ async function lobsterFetch(url, options) {
|
|
|
3274
3545
|
const timeout = options?.timeout || 3e4;
|
|
3275
3546
|
const dump = options?.dump || "markdown";
|
|
3276
3547
|
const start = Date.now();
|
|
3548
|
+
const { isPdfUrl: isPdfUrl2, isPdfResponse: isPdfResponse2, extractPdf: extractPdf2 } = await Promise.resolve().then(() => (init_pdf(), pdf_exports));
|
|
3549
|
+
if (isPdfUrl2(url)) {
|
|
3550
|
+
const pdfResult = await extractPdf2(url);
|
|
3551
|
+
const duration2 = Date.now() - start;
|
|
3552
|
+
let content2;
|
|
3553
|
+
switch (dump) {
|
|
3554
|
+
case "markdown":
|
|
3555
|
+
content2 = pdfResult.markdown;
|
|
3556
|
+
break;
|
|
3557
|
+
case "text":
|
|
3558
|
+
content2 = pdfResult.text;
|
|
3559
|
+
break;
|
|
3560
|
+
case "html":
|
|
3561
|
+
content2 = `<pre>${pdfResult.text}</pre>`;
|
|
3562
|
+
break;
|
|
3563
|
+
case "snapshot":
|
|
3564
|
+
content2 = `[PDF] ${pdfResult.metadata.title} (${pdfResult.metadata.pages} pages, ${pdfResult.wordCount} words)
|
|
3565
|
+
|
|
3566
|
+
${pdfResult.text.slice(0, 5e3)}`;
|
|
3567
|
+
break;
|
|
3568
|
+
case "links":
|
|
3569
|
+
content2 = "";
|
|
3570
|
+
break;
|
|
3571
|
+
default:
|
|
3572
|
+
content2 = pdfResult.markdown;
|
|
3573
|
+
}
|
|
3574
|
+
return {
|
|
3575
|
+
url,
|
|
3576
|
+
finalUrl: url,
|
|
3577
|
+
title: pdfResult.metadata.title,
|
|
3578
|
+
content: content2,
|
|
3579
|
+
links: [],
|
|
3580
|
+
engine: "pdf",
|
|
3581
|
+
duration: duration2,
|
|
3582
|
+
statusCode: 200
|
|
3583
|
+
};
|
|
3584
|
+
}
|
|
3277
3585
|
const resp = await fetch(url, {
|
|
3278
3586
|
headers: {
|
|
3279
3587
|
"User-Agent": "LobsterCLI/0.1 (+https://github.com/iexcalibur/lobster-cli)",
|
|
@@ -3287,6 +3595,48 @@ async function lobsterFetch(url, options) {
|
|
|
3287
3595
|
if (!resp.ok) {
|
|
3288
3596
|
throw new Error(`HTTP ${resp.status} ${resp.statusText}`);
|
|
3289
3597
|
}
|
|
3598
|
+
const contentType = resp.headers.get("content-type") || "";
|
|
3599
|
+
if (isPdfResponse2(contentType)) {
|
|
3600
|
+
const arrayBuffer = await resp.arrayBuffer();
|
|
3601
|
+
const buffer = Buffer.from(arrayBuffer);
|
|
3602
|
+
const pdfMod = await import("pdf-parse");
|
|
3603
|
+
const pdfParseFn2 = pdfMod.PDFParse || pdfMod.default || pdfMod;
|
|
3604
|
+
const pdfResult = await pdfParseFn2(buffer);
|
|
3605
|
+
const info = pdfResult.info || {};
|
|
3606
|
+
const metadata = {
|
|
3607
|
+
title: info.Title || "untitled",
|
|
3608
|
+
author: info.Author || "",
|
|
3609
|
+
pages: pdfResult.numpages
|
|
3610
|
+
};
|
|
3611
|
+
const duration2 = Date.now() - start;
|
|
3612
|
+
const text = pdfResult.text || "";
|
|
3613
|
+
let content2;
|
|
3614
|
+
switch (dump) {
|
|
3615
|
+
case "text":
|
|
3616
|
+
content2 = text;
|
|
3617
|
+
break;
|
|
3618
|
+
case "html":
|
|
3619
|
+
content2 = `<pre>${text}</pre>`;
|
|
3620
|
+
break;
|
|
3621
|
+
case "snapshot":
|
|
3622
|
+
content2 = `[PDF] ${metadata.title} (${metadata.pages} pages)
|
|
3623
|
+
|
|
3624
|
+
${text.slice(0, 5e3)}`;
|
|
3625
|
+
break;
|
|
3626
|
+
default:
|
|
3627
|
+
content2 = text;
|
|
3628
|
+
}
|
|
3629
|
+
return {
|
|
3630
|
+
url,
|
|
3631
|
+
finalUrl: resp.url || url,
|
|
3632
|
+
title: metadata.title,
|
|
3633
|
+
content: content2,
|
|
3634
|
+
links: [],
|
|
3635
|
+
engine: "pdf",
|
|
3636
|
+
duration: duration2,
|
|
3637
|
+
statusCode: 200
|
|
3638
|
+
};
|
|
3639
|
+
}
|
|
3290
3640
|
const html = await resp.text();
|
|
3291
3641
|
const duration = Date.now() - start;
|
|
3292
3642
|
const finalUrl = resp.url || url;
|
|
@@ -4268,7 +4618,7 @@ function makeRoutingDecision(request) {
|
|
|
4268
4618
|
}
|
|
4269
4619
|
|
|
4270
4620
|
// src/agent/core.ts
|
|
4271
|
-
import { readFileSync as
|
|
4621
|
+
import { readFileSync as readFileSync4 } from "fs";
|
|
4272
4622
|
import { join as join4, dirname } from "path";
|
|
4273
4623
|
import { fileURLToPath } from "url";
|
|
4274
4624
|
|
|
@@ -4661,7 +5011,7 @@ var AgentCore = class {
|
|
|
4661
5011
|
const macroTool = packMacroTool(tools);
|
|
4662
5012
|
let systemPrompt;
|
|
4663
5013
|
try {
|
|
4664
|
-
systemPrompt =
|
|
5014
|
+
systemPrompt = readFileSync4(join4(__dirname, "prompts", "system.md"), "utf-8");
|
|
4665
5015
|
} catch {
|
|
4666
5016
|
systemPrompt = "You are an AI web agent that navigates web pages to complete tasks.";
|
|
4667
5017
|
}
|