lobster-cli 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +148 -268
- package/dist/agent/core.js +63 -0
- package/dist/agent/core.js.map +1 -1
- package/dist/agent/index.js +63 -0
- package/dist/agent/index.js.map +1 -1
- package/dist/browser/chrome-attach.js +102 -0
- package/dist/browser/chrome-attach.js.map +1 -0
- package/dist/browser/dom/compact-snapshot.js +162 -0
- package/dist/browser/dom/compact-snapshot.js.map +1 -0
- package/dist/browser/dom/index.js +160 -0
- package/dist/browser/dom/index.js.map +1 -1
- package/dist/browser/index.js +1201 -70
- package/dist/browser/index.js.map +1 -1
- package/dist/browser/manager.js +443 -11
- package/dist/browser/manager.js.map +1 -1
- package/dist/browser/page-adapter.js +370 -1
- package/dist/browser/page-adapter.js.map +1 -1
- package/dist/browser/profiles.js +238 -0
- package/dist/browser/profiles.js.map +1 -0
- package/dist/browser/semantic-find.js +152 -0
- package/dist/browser/semantic-find.js.map +1 -0
- package/dist/browser/stealth.js +187 -0
- package/dist/browser/stealth.js.map +1 -0
- package/dist/config/index.js +8 -1
- package/dist/config/index.js.map +1 -1
- package/dist/config/schema.js +8 -1
- package/dist/config/schema.js.map +1 -1
- package/dist/doc/index.js +31715 -0
- package/dist/doc/index.js.map +1 -0
- package/dist/domain-guard.js +103 -0
- package/dist/domain-guard.js.map +1 -0
- package/dist/index.js +32914 -262
- package/dist/index.js.map +1 -1
- package/dist/lib.js +1488 -241
- package/dist/lib.js.map +1 -1
- package/dist/llm/client.js +63 -0
- package/dist/llm/client.js.map +1 -1
- package/dist/llm/index.js +63 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/openai-client.js +63 -0
- package/dist/llm/openai-client.js.map +1 -1
- package/dist/router/index.js +925 -61
- package/dist/router/index.js.map +1 -1
- package/package.json +16 -2
package/dist/browser/index.js
CHANGED
|
@@ -1,6 +1,214 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
3
|
+
var __esm = (fn, res) => function __init() {
|
|
4
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
5
|
+
};
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
// src/browser/pdf.ts
|
|
12
|
+
var pdf_exports = {};
|
|
13
|
+
__export(pdf_exports, {
|
|
14
|
+
extractPdf: () => extractPdf,
|
|
15
|
+
isPdfResponse: () => isPdfResponse,
|
|
16
|
+
isPdfUrl: () => isPdfUrl,
|
|
17
|
+
tryExtractPdf: () => tryExtractPdf
|
|
18
|
+
});
|
|
19
|
+
import { readFileSync as readFileSync3 } from "fs";
|
|
20
|
+
/**
 * Lazily loads the "pdf-parse" module and memoizes a parse function in the
 * module-level `pdfParseFn` slot.
 *
 * Two module shapes are handled:
 *  - the module exposes a `PDFParse` constructor: the buffer is wrapped in an
 *    instance and its `parse()` method is awaited when present, otherwise the
 *    instance itself is returned;
 *  - anything else: the default export (or the module namespace) is used as
 *    the parse function directly.
 *
 * @returns {Promise<Function>} the cached parse function.
 */
async function getPdfParser() {
  // Already resolved on an earlier call: reuse it.
  if (pdfParseFn) {
    return pdfParseFn;
  }

  const pdfModule = await import("pdf-parse");
  const ParserCtor = pdfModule.PDFParse;

  if (!ParserCtor || typeof ParserCtor !== "function") {
    pdfParseFn = pdfModule.default || pdfModule;
    return pdfParseFn;
  }

  pdfParseFn = async (buffer) => {
    const instance = new ParserCtor(buffer);
    if (instance.parse) {
      return await instance.parse();
    }
    return instance;
  };
  return pdfParseFn;
}
|
|
35
|
+
/**
 * Heuristically decides whether a URL or file path points at a PDF.
 *
 * Returns true when any of the following holds (case-insensitively):
 *  - the whole string ends with ".pdf";
 *  - the part before the first "?" or "#" ends with ".pdf"
 *    (e.g. "paper.pdf?download=1", "paper.pdf#page=3");
 *  - the string contains a "/pdf/" path segment (this also covers
 *    "arxiv.org/pdf/...");
 *  - the query string contains "format=pdf" or "type=pdf".
 *
 * @param {string} urlOrPath - URL or local path to inspect.
 * @returns {boolean} true when the input looks like a PDF location.
 */
function isPdfUrl(urlOrPath) {
  const lower = urlOrPath.toLowerCase();
  if (lower.endsWith(".pdf")) return true;

  // Ignore the query string and fragment when checking the extension.
  const withoutQueryOrHash = lower.split(/[?#]/, 1)[0];
  if (withoutQueryOrHash.endsWith(".pdf")) return true;

  if (/\/pdf\//.test(lower)) return true;
  if (/[?&](?:format|type)=pdf/.test(lower)) return true;
  return false;
}
|
|
44
|
+
/**
 * Tells whether a Content-Type header value denotes a PDF document.
 *
 * Media types are case-insensitive, so the comparison is done on a
 * lower-cased copy. Non-string input (for example a missing header passed
 * through as null) yields false instead of throwing.
 *
 * @param {string} contentType - raw Content-Type header value.
 * @returns {boolean} true when the value contains "application/pdf".
 */
function isPdfResponse(contentType) {
  if (typeof contentType !== "string") return false;
  return contentType.toLowerCase().includes("application/pdf");
}
|
|
47
|
+
/**
 * Downloads a resource and returns its body as a Node.js Buffer.
 *
 * The request follows redirects and sends a desktop-browser User-Agent plus
 * an Accept header that prefers PDF. The response Content-Type is not
 * enforced here; the previous version had an empty branch that inspected it
 * without acting, which has been removed.
 *
 * @param {string} url - location to fetch.
 * @returns {Promise<Buffer>} the raw response body.
 * @throws {Error} when the response status is not in the 2xx range.
 */
async function downloadPdf(url) {
  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "application/pdf,*/*"
    },
    redirect: "follow"
  });
  if (!response.ok) {
    throw new Error(`Failed to download PDF: ${response.status} ${response.statusText}`);
  }
  const arrayBuffer = await response.arrayBuffer();
  return Buffer.from(arrayBuffer);
}
|
|
64
|
+
/**
 * Synchronously reads a file from disk and returns its bytes.
 *
 * @param {string} filePath - path of the file to read.
 * @returns {Buffer} the file contents (no encoding is applied).
 */
function readLocalPdf(filePath) {
  const fileBytes = readFileSync3(filePath);
  return fileBytes;
}
|
|
67
|
+
/**
 * Converts plain text extracted from a PDF into lightweight Markdown.
 *
 * Line-based heuristics, applied in this order to each non-blank line:
 *  1. numbered headings ("2.1 Related Work") become ##/###/#### by depth;
 *  2. short ALL-CAPS lines become "## Sentence case" headings;
 *  3. short lines starting with a well-known section name (Abstract,
 *     References, ...) become "##" headings; References/Bibliography also
 *     switch on reference-list mode;
 *  4. lines starting with a bullet glyph become "- " list items;
 *  5. lines starting with "(1)" or "a)" become list items;
 *  6. in reference-list mode, lines starting with "[1]" / "1." become items;
 *  7. everything else is accumulated into a paragraph that is flushed at the
 *     next blank line or structural element.
 *
 * A paragraph line ending in "-" that is followed by another input line is
 * treated as a hyphenated line break: the hyphen is dropped and the next
 * paragraph line is appended without a separating space ("exam-" + "ple"
 * gives "example"). If the paragraph ends before a continuation arrives, the
 * hyphen is put back so no character of the source text is lost.
 *
 * @param {string} text - raw text with "\n" line separators.
 * @param {{title?: string, author?: string}} [metadata] - optional document
 *   metadata; a title other than "untitled" produces a header block.
 * @returns {string} Markdown with runs of 3+ newlines collapsed, trimmed.
 */
function textToMarkdown(text, metadata) {
  // Tolerate a missing metadata object instead of throwing on property access.
  const meta = metadata ?? {};
  const lines = text.split("\n");
  const mdLines = [];

  if (meta.title && meta.title !== "untitled") {
    mdLines.push(`# ${meta.title}`);
    mdLines.push("");
    if (meta.author) {
      mdLines.push(`**Authors:** ${meta.author}`);
      mdLines.push("");
    }
    mdLines.push("---");
    mdLines.push("");
  }

  // Hoisted out of the loop: the pattern never changes between lines.
  const knownHeadings = /^(Abstract|Introduction|Conclusion|Discussion|Results|Methods|Methodology|Background|Related Work|Acknowledgments|Acknowledgements|References|Bibliography|Appendix)/i;

  let inReferences = false;
  let prevWasBlank = false;
  let paragraphBuffer = [];
  // True while the last buffered fragment had its trailing hyphen removed
  // and is waiting for the continuation line.
  let pendingHyphen = false;

  function flushParagraph() {
    if (paragraphBuffer.length > 0) {
      if (pendingHyphen) {
        // No continuation line arrived: restore the stripped hyphen.
        paragraphBuffer[paragraphBuffer.length - 1] += "-";
      }
      mdLines.push(paragraphBuffer.join(" "));
      mdLines.push("");
      paragraphBuffer = [];
    }
    pendingHyphen = false;
  }

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();

    if (!line) {
      if (!prevWasBlank) {
        flushParagraph();
      }
      prevWasBlank = true;
      continue;
    }
    prevWasBlank = false;

    const numberedHeading = line.match(/^(\d+\.?\d*\.?\d*)\s+([A-Z][A-Za-z\s:&-]+)$/);
    if (numberedHeading && line.length < 80) {
      flushParagraph();
      // "2" -> depth 1, "2.1" -> depth 2, "2.1.3" -> depth 3.
      const depth = numberedHeading[1].split(".").filter(Boolean).length;
      const prefix = depth <= 1 ? "##" : depth === 2 ? "###" : "####";
      mdLines.push(`${prefix} ${line}`);
      mdLines.push("");
      continue;
    }

    if (line === line.toUpperCase() && line.length > 3 && line.length < 60 && /^[A-Z\s:&-]+$/.test(line)) {
      flushParagraph();
      mdLines.push(`## ${line.charAt(0) + line.slice(1).toLowerCase()}`);
      mdLines.push("");
      continue;
    }

    if (knownHeadings.test(line) && line.length < 40) {
      flushParagraph();
      if (/^(References|Bibliography)/i.test(line)) {
        inReferences = true;
      }
      mdLines.push(`## ${line}`);
      mdLines.push("");
      continue;
    }

    if (/^[-•∙◦▪]/.test(line)) {
      flushParagraph();
      mdLines.push(`- ${line.replace(/^[-•∙◦▪]\s*/, "")}`);
      continue;
    }

    if (/^\(\d+\)|^[a-z]\)/.test(line)) {
      flushParagraph();
      mdLines.push(`- ${line}`);
      continue;
    }

    if (inReferences && /^\[?\d+\]?\.?\s/.test(line)) {
      flushParagraph();
      mdLines.push(`- ${line}`);
      continue;
    }

    // Ordinary paragraph text.
    const continuesOnNextLine = line.endsWith("-") && i + 1 < lines.length;
    const fragment = continuesOnNextLine ? line.slice(0, -1) : line;
    if (pendingHyphen && paragraphBuffer.length > 0) {
      // Glue the continuation onto the de-hyphenated previous fragment.
      paragraphBuffer[paragraphBuffer.length - 1] += fragment;
    } else {
      paragraphBuffer.push(fragment);
    }
    pendingHyphen = continuesOnNextLine;
  }

  flushParagraph();
  return mdLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
149
|
+
/**
 * Parses a PDF buffer and assembles the extraction result.
 *
 * The parser output is read through its `info`, `numpages` and `text`
 * fields. Per-page text is derived by splitting on form-feed characters;
 * that split is only trusted when the number of non-empty chunks equals
 * `numpages`, otherwise the whole text is reported as a single page.
 *
 * @param {Buffer} buffer - raw PDF bytes.
 * @returns {Promise<{metadata: object, text: string, markdown: string,
 *   pages: string[], wordCount: number, charCount: number}>}
 */
async function extractFromBuffer(buffer) {
  const parsePdf = await getPdfParser();
  const parsed = await parsePdf(buffer);

  const docInfo = parsed.info || {};
  const metadata = {
    title: docInfo.Title || "untitled",
    author: docInfo.Author || "",
    pages: parsed.numpages,
    creator: docInfo.Creator || "",
    producer: docInfo.Producer || "",
    creationDate: docInfo.CreationDate || ""
  };

  const text = parsed.text || "";

  const formFeedChunks = text.split(/\f/).filter(Boolean);
  let pageList;
  if (formFeedChunks.length === parsed.numpages) {
    pageList = formFeedChunks;
  } else {
    pageList = [text];
  }

  const markdown = textToMarkdown(text, metadata);
  const wordCount = text.split(/\s+/).filter(Boolean).length;

  return {
    metadata,
    text,
    markdown,
    pages: pageList.map((pageText) => pageText.trim()),
    wordCount,
    charCount: text.length
  };
}
|
|
175
|
+
/**
 * Loads a PDF from an http(s) URL or a local path and extracts its content.
 *
 * Inputs starting with "http://" or "https://" are downloaded; anything else
 * is read from disk. The first four bytes must spell "%PDF" before the
 * buffer is handed to the parser.
 *
 * @param {string} urlOrPath - remote URL or local file path.
 * @returns {Promise<object>} the result of `extractFromBuffer`.
 * @throws {Error} "Not a valid PDF file (invalid magic bytes)" when the
 *   signature check fails.
 */
async function extractPdf(urlOrPath) {
  const isRemote = urlOrPath.startsWith("http://") || urlOrPath.startsWith("https://");
  const buffer = isRemote ? await downloadPdf(urlOrPath) : readLocalPdf(urlOrPath);

  // ASCII codes for "%", "P", "D", "F".
  const pdfSignature = [37, 80, 68, 70];
  const signatureMatches = pdfSignature.every((byte, offset) => buffer[offset] === byte);
  if (!signatureMatches) {
    throw new Error("Not a valid PDF file (invalid magic bytes)");
  }

  return extractFromBuffer(buffer);
}
|
|
187
|
+
/**
 * Extracts a PDF from `url` if it appears to be one, otherwise returns null.
 *
 * The URL heuristic is tried first. When it is inconclusive, a HEAD request
 * is issued and the Content-Type header is inspected. A failing HEAD probe
 * is deliberately ignored (best effort) and leads to a null result; errors
 * raised by the extraction itself are not swallowed and reach the caller.
 *
 * @param {string} url - candidate URL.
 * @returns {Promise<object|null>} extraction result, or null when the
 *   resource does not look like a PDF.
 */
async function tryExtractPdf(url) {
  if (isPdfUrl(url)) {
    return extractPdf(url);
  }

  let servedAsPdf = false;
  try {
    const headResponse = await fetch(url, { method: "HEAD", redirect: "follow" });
    const contentType = headResponse.headers.get("content-type") || "";
    servedAsPdf = isPdfResponse(contentType);
  } catch {
    // Probe failures are non-fatal: fall through to the null result.
  }

  return servedAsPdf ? extractPdf(url) : null;
}
|
|
201
|
+
var pdfParseFn;
|
|
202
|
+
var init_pdf = __esm({
|
|
203
|
+
"src/browser/pdf.ts"() {
|
|
204
|
+
"use strict";
|
|
205
|
+
pdfParseFn = null;
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
|
|
1
209
|
// src/browser/manager.ts
|
|
2
210
|
import puppeteer from "puppeteer-core";
|
|
3
|
-
import { existsSync } from "fs";
|
|
211
|
+
import { existsSync as existsSync3 } from "fs";
|
|
4
212
|
|
|
5
213
|
// src/utils/logger.ts
|
|
6
214
|
import chalk from "chalk";
|
|
@@ -16,20 +224,509 @@ var log = {
|
|
|
16
224
|
dim: (msg) => console.log(chalk.dim(msg))
|
|
17
225
|
};
|
|
18
226
|
|
|
227
|
+
// src/browser/profiles.ts
|
|
228
|
+
import { existsSync as existsSync2, mkdirSync as mkdirSync2, readFileSync as readFileSync2, writeFileSync as writeFileSync2, readdirSync, rmSync, statSync } from "fs";
|
|
229
|
+
import { join as join2 } from "path";
|
|
230
|
+
|
|
231
|
+
// src/config/index.ts
|
|
232
|
+
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
|
|
233
|
+
import { join } from "path";
|
|
234
|
+
import { homedir } from "os";
|
|
235
|
+
import yaml from "js-yaml";
|
|
236
|
+
|
|
237
|
+
// src/config/schema.ts
|
|
238
|
+
import { z } from "zod";
|
|
239
|
+
var configSchema = z.object({
|
|
240
|
+
llm: z.object({
|
|
241
|
+
provider: z.enum(["openai", "anthropic", "gemini", "ollama"]).default("openai"),
|
|
242
|
+
baseURL: z.string().default("https://api.openai.com/v1"),
|
|
243
|
+
model: z.string().default("gpt-4o"),
|
|
244
|
+
apiKey: z.string().default(""),
|
|
245
|
+
temperature: z.number().min(0).max(2).default(0.1),
|
|
246
|
+
maxRetries: z.number().int().min(0).default(3)
|
|
247
|
+
}).default({}),
|
|
248
|
+
browser: z.object({
|
|
249
|
+
executablePath: z.string().default(""),
|
|
250
|
+
headless: z.boolean().default(true),
|
|
251
|
+
connectTimeout: z.number().default(30),
|
|
252
|
+
commandTimeout: z.number().default(60),
|
|
253
|
+
cdpEndpoint: z.string().default(""),
|
|
254
|
+
profile: z.string().default(""),
|
|
255
|
+
stealth: z.boolean().default(false)
|
|
256
|
+
}).default({}),
|
|
257
|
+
agent: z.object({
|
|
258
|
+
maxSteps: z.number().int().default(40),
|
|
259
|
+
stepDelay: z.number().default(0.4)
|
|
260
|
+
}).default({}),
|
|
261
|
+
domains: z.object({
|
|
262
|
+
allow: z.array(z.string()).default([]),
|
|
263
|
+
block: z.array(z.string()).default([]),
|
|
264
|
+
blockMessage: z.string().default("")
|
|
265
|
+
}).default({}),
|
|
266
|
+
output: z.object({
|
|
267
|
+
defaultFormat: z.enum(["table", "json", "yaml", "markdown", "csv"]).default("table"),
|
|
268
|
+
color: z.boolean().default(true)
|
|
269
|
+
}).default({})
|
|
270
|
+
});
|
|
271
|
+
|
|
272
|
+
// src/config/index.ts
|
|
273
|
+
var CONFIG_DIR = join(homedir(), ".lobster");
|
|
274
|
+
var CONFIG_FILE = join(CONFIG_DIR, "config.yaml");
|
|
275
|
+
/**
 * Returns the directory that holds lobster's configuration
 * (the module-level `CONFIG_DIR` value).
 *
 * @returns {string} configuration directory path.
 */
function getConfigDir() {
  const configDirectory = CONFIG_DIR;
  return configDirectory;
}
|
|
278
|
+
|
|
279
|
+
// src/browser/profiles.ts
|
|
280
|
+
var PROFILES_DIR = () => join2(getConfigDir(), "profiles");
|
|
281
|
+
var META_FILE = ".lobster-meta.json";
|
|
282
|
+
var VALID_NAME = /^[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}$/;
|
|
283
|
+
var RESERVED_NAMES = /* @__PURE__ */ new Set([
|
|
284
|
+
"default",
|
|
285
|
+
"system",
|
|
286
|
+
"con",
|
|
287
|
+
"prn",
|
|
288
|
+
"aux",
|
|
289
|
+
"nul",
|
|
290
|
+
"com1",
|
|
291
|
+
"com2",
|
|
292
|
+
"com3",
|
|
293
|
+
"com4",
|
|
294
|
+
"com5",
|
|
295
|
+
"com6",
|
|
296
|
+
"com7",
|
|
297
|
+
"com8",
|
|
298
|
+
"com9",
|
|
299
|
+
"lpt1",
|
|
300
|
+
"lpt2",
|
|
301
|
+
"lpt3",
|
|
302
|
+
"lpt4",
|
|
303
|
+
"lpt5",
|
|
304
|
+
"lpt6",
|
|
305
|
+
"lpt7",
|
|
306
|
+
"lpt8",
|
|
307
|
+
"lpt9"
|
|
308
|
+
]);
|
|
309
|
+
var CACHE_DIRS = [
|
|
310
|
+
"Cache",
|
|
311
|
+
"Code Cache",
|
|
312
|
+
"GPUCache",
|
|
313
|
+
"GrShaderCache",
|
|
314
|
+
"ShaderCache",
|
|
315
|
+
"Service Worker",
|
|
316
|
+
"Sessions",
|
|
317
|
+
"Session Storage",
|
|
318
|
+
"blob_storage"
|
|
319
|
+
];
|
|
320
|
+
/**
 * Creates the profiles root directory (including missing parents) when it
 * does not exist yet; does nothing otherwise.
 */
function ensureProfilesDir() {
  const profilesRoot = PROFILES_DIR();
  if (existsSync2(profilesRoot)) {
    return;
  }
  mkdirSync2(profilesRoot, { recursive: true });
}
|
|
324
|
+
/**
 * Validates a profile name, throwing when it is unusable.
 *
 * A name must match `VALID_NAME` and, compared case-insensitively, must not
 * appear in `RESERVED_NAMES`.
 *
 * @param {string} name - candidate profile name.
 * @throws {Error} when the name is malformed or reserved.
 */
function validateName(name) {
  const isWellFormed = VALID_NAME.test(name);
  if (!isWellFormed) {
    throw new Error(`Invalid profile name "${name}". Use only letters, numbers, hyphens, underscores (max 64 chars).`);
  }

  const lowered = name.toLowerCase();
  const isReserved = RESERVED_NAMES.has(lowered);
  if (isReserved) {
    throw new Error(`"${name}" is a reserved name. Choose a different profile name.`);
  }
}
|
|
332
|
+
/**
 * Builds the directory path for a profile by joining the profiles root with
 * `name`. The name is used as-is; no validation happens here.
 *
 * @param {string} name - profile name.
 * @returns {string} path of the profile directory.
 */
function getProfileDir(name) {
  const profilesRoot = PROFILES_DIR();
  return join2(profilesRoot, name);
}
|
|
335
|
+
/**
 * Reads and parses the metadata file stored inside a profile directory.
 *
 * @param {string} profileDir - profile directory path.
 * @returns {*} the parsed JSON value, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
function readMeta(profileDir) {
  const metaPath = join2(profileDir, META_FILE);
  if (existsSync2(metaPath)) {
    try {
      const rawJson = readFileSync2(metaPath, "utf-8");
      return JSON.parse(rawJson);
    } catch {
      // Unreadable or corrupt metadata is treated the same as missing.
    }
  }
  return null;
}
|
|
344
|
+
/**
 * Writes `meta` as pretty-printed JSON (2-space indent) to the metadata file
 * inside `profileDir`, replacing any previous content.
 *
 * @param {string} profileDir - profile directory path.
 * @param {object} meta - metadata to serialize.
 */
function writeMeta(profileDir, meta) {
  const metaPath = join2(profileDir, META_FILE);
  const serialized = JSON.stringify(meta, null, 2);
  writeFileSync2(metaPath, serialized);
}
|
|
347
|
+
/**
 * Computes the total size of all regular files below `dirPath`, in MiB,
 * rounded to one decimal place.
 *
 * Bytes are accumulated through the whole tree and rounded exactly once at
 * the end. The previous implementation rounded every subdirectory to 0.1 MiB
 * before adding it to its parent, so any subdirectory under roughly 52 KB
 * counted as zero and trees made of many small directories were badly
 * under-reported.
 *
 * Directories that cannot be read contribute whatever was counted before the
 * failure (best effort); a missing `dirPath` yields 0.
 *
 * @param {string} dirPath - directory to measure.
 * @returns {number} size in MiB with one decimal of precision.
 */
function getDirSizeMB(dirPath) {
  const BYTES_PER_MB = 1024 * 1024;

  // Recursively sums file sizes in bytes; never throws.
  const sumBytes = (currentDir) => {
    let bytes = 0;
    try {
      const entries = readdirSync(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join2(currentDir, entry.name);
        if (entry.isFile()) {
          bytes += statSync(fullPath).size;
        } else if (entry.isDirectory() && entry.name !== ".lobster-meta.json") {
          bytes += sumBytes(fullPath);
        }
      }
    } catch {
      // Best effort: keep what was counted so far for this directory.
    }
    return bytes;
  };

  return Math.round(sumBytes(dirPath) / BYTES_PER_MB * 10) / 10;
}
|
|
363
|
+
/**
 * Creates a new, empty browser profile directory with a metadata file.
 *
 * @param {string} name - profile name; validated before use.
 * @returns {{name: string, createdAt: string, lastUsed: string}} the
 *   metadata that was written (both timestamps are ISO-8601 strings).
 * @throws {Error} when the name is invalid/reserved or the profile
 *   directory already exists.
 */
function createProfile(name) {
  validateName(name);
  ensureProfilesDir();

  const profilePath = getProfileDir(name);
  if (existsSync2(profilePath)) {
    throw new Error(`Profile "${name}" already exists.`);
  }
  mkdirSync2(profilePath, { recursive: true });

  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const lastUsed = (/* @__PURE__ */ new Date()).toISOString();
  const meta = { name, createdAt, lastUsed };

  writeMeta(profilePath, meta);
  log.success(`Profile "${name}" created at ${profilePath}`);
  return meta;
}
|
|
380
|
+
/**
 * Lists every profile directory under the profiles root, sorted by name.
 *
 * Each subdirectory yields one entry. When its metadata file can be read,
 * the stored metadata is returned with a freshly computed `sizeMB`;
 * otherwise a placeholder entry is built from the directory name with
 * "unknown" timestamps. Errors while scanning are ignored and whatever was
 * collected so far is returned.
 *
 * @returns {Array<object>} profile entries sorted with `localeCompare`.
 */
function listProfiles() {
  ensureProfilesDir();
  const profilesRoot = PROFILES_DIR();
  const collected = [];

  try {
    for (const entry of readdirSync(profilesRoot, { withFileTypes: true })) {
      if (entry.isDirectory()) {
        const profilePath = join2(profilesRoot, entry.name);
        const storedMeta = readMeta(profilePath);
        if (!storedMeta) {
          collected.push({
            name: entry.name,
            createdAt: "unknown",
            lastUsed: "unknown",
            sizeMB: getDirSizeMB(profilePath)
          });
        } else {
          storedMeta.sizeMB = getDirSizeMB(profilePath);
          collected.push(storedMeta);
        }
      }
    }
  } catch {
    // Best effort: return the entries gathered before the failure.
  }

  return collected.sort((left, right) => left.name.localeCompare(right.name));
}
|
|
406
|
+
/**
 * Permanently deletes a profile directory and everything inside it.
 *
 * The name is validated first, exactly as `createProfile` and
 * `getProfileDataDir` do. Without that check a name such as "../.." would be
 * joined onto the profiles root and the recursive delete below would remove
 * a directory outside of it.
 *
 * @param {string} name - profile name.
 * @throws {Error} when the name is invalid/reserved or the profile does not
 *   exist.
 */
function removeProfile(name) {
  // Reject path separators, "..", etc. before touching the filesystem.
  validateName(name);
  const dir = getProfileDir(name);
  if (!existsSync2(dir)) {
    throw new Error(`Profile "${name}" does not exist.`);
  }
  rmSync(dir, { recursive: true, force: true });
  log.success(`Profile "${name}" deleted.`);
}
|
|
414
|
+
/**
 * Resolves the data directory for a profile, creating the profile on first
 * use and refreshing its `lastUsed` timestamp otherwise.
 *
 * When the directory exists but its metadata cannot be read, a replacement
 * metadata record with `createdAt: "unknown"` is written.
 *
 * @param {string} name - profile name; validated before use.
 * @returns {string} path of the profile directory.
 * @throws {Error} when the name is invalid or reserved.
 */
function getProfileDataDir(name) {
  validateName(name);
  const profilePath = getProfileDir(name);

  if (existsSync2(profilePath)) {
    const storedMeta = readMeta(profilePath);
    const meta = storedMeta || { name, createdAt: "unknown", lastUsed: "" };
    meta.lastUsed = (/* @__PURE__ */ new Date()).toISOString();
    writeMeta(profilePath, meta);
  } else {
    createProfile(name);
  }

  return profilePath;
}
|
|
426
|
+
/**
 * Deletes the cache directories of a profile while keeping the profile
 * itself.
 *
 * Every entry of `CACHE_DIRS` is looked up both directly inside the profile
 * directory and inside its "Default" subdirectory; each one found is removed
 * recursively.
 *
 * The name is validated first, consistent with `createProfile` and
 * `getProfileDataDir`, so a crafted name cannot point the recursive deletes
 * at a location outside the profiles root.
 *
 * @param {string} name - profile name.
 * @throws {Error} when the name is invalid/reserved or the profile does not
 *   exist.
 */
function resetProfileCache(name) {
  // Reject path separators, "..", etc. before touching the filesystem.
  validateName(name);
  const dir = getProfileDir(name);
  if (!existsSync2(dir)) {
    throw new Error(`Profile "${name}" does not exist.`);
  }

  const searchRoots = [dir, join2(dir, "Default")];
  let cleaned = 0;
  for (const cacheDir of CACHE_DIRS) {
    for (const base of searchRoots) {
      const target = join2(base, cacheDir);
      if (existsSync2(target)) {
        rmSync(target, { recursive: true, force: true });
        cleaned++;
      }
    }
  }
  log.success(`Profile "${name}" cache reset (${cleaned} directories cleaned).`);
}
|
|
443
|
+
|
|
444
|
+
// src/browser/chrome-attach.ts
|
|
445
|
+
import http from "http";
|
|
446
|
+
var DEFAULT_PORTS = [9222, 9229, 9333, 9515];
|
|
447
|
+
var PROBE_TIMEOUT = 1500;
|
|
448
|
+
/**
 * Probes a local port for a Chrome DevTools endpoint.
 *
 * Issues GET http://127.0.0.1:<port>/json/version with a `PROBE_TIMEOUT`
 * timeout and parses the response body as JSON. The returned promise never
 * rejects: request errors, timeouts, unparsable bodies and bodies without a
 * `webSocketDebuggerUrl` all resolve to null.
 *
 * @param {number} port - TCP port on 127.0.0.1 to probe.
 * @returns {Promise<{wsEndpoint: string, port: number, version: string,
 *   browser: string}|null>} endpoint details, or null when nothing usable
 *   answered.
 */
function probePort(port) {
  return new Promise((resolve) => {
    const onResponse = (res) => {
      let body = "";
      res.on("data", (chunk) => {
        body += chunk;
      });
      res.on("end", () => {
        let outcome = null;
        try {
          const info = JSON.parse(body);
          if (info.webSocketDebuggerUrl) {
            outcome = {
              wsEndpoint: info.webSocketDebuggerUrl,
              port,
              version: info["Protocol-Version"] || "",
              browser: info.Browser || ""
            };
          }
        } catch {
          // Invalid JSON, or a JSON value without properties (e.g. null).
          outcome = null;
        }
        resolve(outcome);
      });
    };

    const requestOptions = { timeout: PROBE_TIMEOUT };
    const req = http.get(`http://127.0.0.1:${port}/json/version`, requestOptions, onResponse);

    req.on("error", () => resolve(null));
    req.on("timeout", () => {
      // The timeout event does not abort the request by itself.
      req.destroy();
      resolve(null);
    });
  });
}
|
|
482
|
+
/**
 * Scans a list of ports in parallel for a running Chrome debug endpoint.
 *
 * @param {number[]} [ports] - ports to probe; falls back to `DEFAULT_PORTS`
 *   when omitted or falsy.
 * @returns {Promise<object|null>} the probe result of the first port (in
 *   list order) that answered, or null when none did.
 */
async function discoverChrome(ports) {
  const candidates = ports || DEFAULT_PORTS;
  log.debug(`Scanning ports for Chrome: ${candidates.join(", ")}`);

  const probeResults = await Promise.all(candidates.map((candidate) => probePort(candidate)));

  let found = null;
  for (const probeResult of probeResults) {
    if (probeResult) {
      found = probeResult;
      break;
    }
  }

  if (!found) {
    log.debug("No running Chrome instance found on debug ports.");
  } else {
    log.info(`Found Chrome on port ${found.port}: ${found.browser}`);
  }
  return found;
}
|
|
494
|
+
/**
 * Returns the WebSocket debugger URL exposed on `port`, if any.
 *
 * @param {number} port - port to probe.
 * @returns {Promise<string|null>} the endpoint URL, or null when the probe
 *   found nothing.
 */
async function getWebSocketDebuggerUrl(port) {
  const probeResult = await probePort(port);
  if (!probeResult) {
    return null;
  }
  return probeResult.wsEndpoint || null;
}
|
|
498
|
+
/**
 * Turns an "attach" setting into a WebSocket debugger endpoint.
 *
 * Accepted targets:
 *  - `true` or "true": auto-discover Chrome on the default debug ports;
 *  - a "ws://" or "wss://" URL: returned unchanged;
 *  - a port, given as a number or as a string made only of digits
 *    (surrounding whitespace allowed), in the range 1-65535.
 *
 * Port strings are matched strictly: "9222abc" is rejected rather than
 * being read as 9222 by a lenient integer parse. Numeric ports are accepted
 * because the error text advertises "a port number" and configuration
 * values may arrive as numbers.
 *
 * @param {boolean|string|number} target - attach setting.
 * @returns {Promise<string>} WebSocket debugger URL.
 * @throws {Error} when no Chrome is found or the target is not recognised.
 */
async function resolveAttachTarget(target) {
  if (target === true || target === "true") {
    const result = await discoverChrome();
    if (!result) {
      throw new Error(
        "No running Chrome found. Start Chrome with:\n google-chrome --remote-debugging-port=9222\n # or on Mac:\n /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222"
      );
    }
    return result.wsEndpoint;
  }

  // Normalise numeric ports to their string form so one code path handles both.
  const candidate = typeof target === "number" ? String(target) : target;

  if (typeof candidate === "string") {
    if (candidate.startsWith("ws://") || candidate.startsWith("wss://")) {
      return candidate;
    }

    const trimmed = candidate.trim();
    // Digits only: no trailing garbage, signs, decimals or exponents.
    const port = /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN;
    if (!Number.isNaN(port) && port > 0 && port < 65536) {
      const url = await getWebSocketDebuggerUrl(port);
      if (!url) {
        throw new Error(`No Chrome found on port ${port}. Make sure Chrome is running with --remote-debugging-port=${port}`);
      }
      return url;
    }
    throw new Error(`Invalid attach target: "${candidate}". Use "true" for auto-discover, a port number, or a ws:// URL.`);
  }

  throw new Error("Invalid attach target.");
}
|
|
524
|
+
|
|
525
|
+
// src/browser/stealth.ts
|
|
526
|
+
var STEALTH_SCRIPT = `
|
|
527
|
+
(() => {
|
|
528
|
+
// \u2500\u2500 1. navigator.webdriver removal \u2500\u2500
|
|
529
|
+
// Most important: this is the #1 detection vector
|
|
530
|
+
Object.defineProperty(navigator, 'webdriver', {
|
|
531
|
+
get: () => undefined,
|
|
532
|
+
configurable: true,
|
|
533
|
+
});
|
|
534
|
+
|
|
535
|
+
// Also delete from prototype
|
|
536
|
+
delete Object.getPrototypeOf(navigator).webdriver;
|
|
537
|
+
|
|
538
|
+
// \u2500\u2500 2. CDP marker removal \u2500\u2500
|
|
539
|
+
// Chrome DevTools Protocol injects cdc_* properties on window
|
|
540
|
+
for (const key of Object.keys(window)) {
|
|
541
|
+
if (/^cdc_|^__webdriver|^__selenium|^__driver/.test(key)) {
|
|
542
|
+
try { delete window[key]; } catch {}
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
// \u2500\u2500 3. Chrome runtime spoofing \u2500\u2500
|
|
547
|
+
// Real Chrome has window.chrome with runtime, loadTimes, csi
|
|
548
|
+
if (!window.chrome) {
|
|
549
|
+
window.chrome = {};
|
|
550
|
+
}
|
|
551
|
+
if (!window.chrome.runtime) {
|
|
552
|
+
window.chrome.runtime = {
|
|
553
|
+
connect: function() {},
|
|
554
|
+
sendMessage: function() {},
|
|
555
|
+
onMessage: { addListener: function() {} },
|
|
556
|
+
id: undefined,
|
|
557
|
+
};
|
|
558
|
+
}
|
|
559
|
+
if (!window.chrome.loadTimes) {
|
|
560
|
+
window.chrome.loadTimes = function() {
|
|
561
|
+
return {
|
|
562
|
+
commitLoadTime: Date.now() / 1000 - 0.5,
|
|
563
|
+
connectionInfo: 'h2',
|
|
564
|
+
finishDocumentLoadTime: Date.now() / 1000 - 0.1,
|
|
565
|
+
finishLoadTime: Date.now() / 1000 - 0.05,
|
|
566
|
+
firstPaintAfterLoadTime: 0,
|
|
567
|
+
firstPaintTime: Date.now() / 1000 - 0.3,
|
|
568
|
+
navigationType: 'Other',
|
|
569
|
+
npnNegotiatedProtocol: 'h2',
|
|
570
|
+
requestTime: Date.now() / 1000 - 1,
|
|
571
|
+
startLoadTime: Date.now() / 1000 - 0.8,
|
|
572
|
+
wasAlternateProtocolAvailable: false,
|
|
573
|
+
wasFetchedViaSpdy: true,
|
|
574
|
+
wasNpnNegotiated: true,
|
|
575
|
+
};
|
|
576
|
+
};
|
|
577
|
+
}
|
|
578
|
+
if (!window.chrome.csi) {
|
|
579
|
+
window.chrome.csi = function() {
|
|
580
|
+
return {
|
|
581
|
+
onloadT: Date.now(),
|
|
582
|
+
startE: Date.now() - 500,
|
|
583
|
+
pageT: 500,
|
|
584
|
+
tran: 15,
|
|
585
|
+
};
|
|
586
|
+
};
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
// \u2500\u2500 4. Plugin array spoofing \u2500\u2500
|
|
590
|
+
// Headless Chrome reports empty plugins; real Chrome has at least 2
|
|
591
|
+
const fakePlugins = [
|
|
592
|
+
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1 },
|
|
593
|
+
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '', length: 1 },
|
|
594
|
+
{ name: 'Native Client', filename: 'internal-nacl-plugin', description: '', length: 2 },
|
|
595
|
+
];
|
|
596
|
+
|
|
597
|
+
Object.defineProperty(navigator, 'plugins', {
|
|
598
|
+
get: () => {
|
|
599
|
+
const arr = fakePlugins.map(p => {
|
|
600
|
+
const plugin = { ...p, item: (i) => plugin, namedItem: (n) => plugin };
|
|
601
|
+
return plugin;
|
|
602
|
+
});
|
|
603
|
+
arr.item = (i) => arr[i];
|
|
604
|
+
arr.namedItem = (n) => arr.find(p => p.name === n);
|
|
605
|
+
arr.refresh = () => {};
|
|
606
|
+
return arr;
|
|
607
|
+
},
|
|
608
|
+
});
|
|
609
|
+
|
|
610
|
+
// \u2500\u2500 5. Languages \u2500\u2500
|
|
611
|
+
Object.defineProperty(navigator, 'languages', {
|
|
612
|
+
get: () => ['en-US', 'en'],
|
|
613
|
+
});
|
|
614
|
+
Object.defineProperty(navigator, 'language', {
|
|
615
|
+
get: () => 'en-US',
|
|
616
|
+
});
|
|
617
|
+
|
|
618
|
+
// \u2500\u2500 6. Platform consistency \u2500\u2500
|
|
619
|
+
// Ensure platform matches user agent
|
|
620
|
+
const platform = navigator.userAgent.includes('Mac') ? 'MacIntel' :
|
|
621
|
+
navigator.userAgent.includes('Win') ? 'Win32' :
|
|
622
|
+
navigator.userAgent.includes('Linux') ? 'Linux x86_64' : navigator.platform;
|
|
623
|
+
Object.defineProperty(navigator, 'platform', { get: () => platform });
|
|
624
|
+
|
|
625
|
+
// \u2500\u2500 7. Hardware concurrency & device memory \u2500\u2500
|
|
626
|
+
// Headless often reports unusual values
|
|
627
|
+
if (navigator.hardwareConcurrency < 2) {
|
|
628
|
+
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
|
|
629
|
+
}
|
|
630
|
+
if (!navigator.deviceMemory || navigator.deviceMemory < 2) {
|
|
631
|
+
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// \u2500\u2500 8. WebGL vendor/renderer spoofing \u2500\u2500
|
|
635
|
+
// Headless reports "Google SwiftShader" which is a dead giveaway
|
|
636
|
+
const origGetParameter = WebGLRenderingContext.prototype.getParameter;
|
|
637
|
+
WebGLRenderingContext.prototype.getParameter = function(param) {
|
|
638
|
+
// UNMASKED_VENDOR_WEBGL
|
|
639
|
+
if (param === 0x9245) return 'Intel Inc.';
|
|
640
|
+
// UNMASKED_RENDERER_WEBGL
|
|
641
|
+
if (param === 0x9246) return 'Intel Iris OpenGL Engine';
|
|
642
|
+
return origGetParameter.call(this, param);
|
|
643
|
+
};
|
|
644
|
+
|
|
645
|
+
// Also for WebGL2
|
|
646
|
+
if (typeof WebGL2RenderingContext !== 'undefined') {
|
|
647
|
+
const origGetParameter2 = WebGL2RenderingContext.prototype.getParameter;
|
|
648
|
+
WebGL2RenderingContext.prototype.getParameter = function(param) {
|
|
649
|
+
if (param === 0x9245) return 'Intel Inc.';
|
|
650
|
+
if (param === 0x9246) return 'Intel Iris OpenGL Engine';
|
|
651
|
+
return origGetParameter2.call(this, param);
|
|
652
|
+
};
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
// \u2500\u2500 9. Canvas fingerprint noise \u2500\u2500
|
|
656
|
+
// Adds subtle deterministic noise to canvas output based on domain
|
|
657
|
+
const seed = location.hostname.split('').reduce((a, c) => a + c.charCodeAt(0), 0);
|
|
658
|
+
const origToDataURL = HTMLCanvasElement.prototype.toDataURL;
|
|
659
|
+
HTMLCanvasElement.prototype.toDataURL = function(type) {
|
|
660
|
+
const ctx = this.getContext('2d');
|
|
661
|
+
if (ctx && this.width > 0 && this.height > 0) {
|
|
662
|
+
try {
|
|
663
|
+
const imageData = ctx.getImageData(0, 0, 1, 1);
|
|
664
|
+
// Flip a single pixel with seeded noise
|
|
665
|
+
imageData.data[0] = (imageData.data[0] + seed) % 256;
|
|
666
|
+
ctx.putImageData(imageData, 0, 0);
|
|
667
|
+
} catch {}
|
|
668
|
+
}
|
|
669
|
+
return origToDataURL.apply(this, arguments);
|
|
670
|
+
};
|
|
671
|
+
|
|
672
|
+
// \u2500\u2500 10. Permissions API \u2500\u2500
|
|
673
|
+
// Headless returns 'denied' for notifications; real Chrome returns 'prompt'
|
|
674
|
+
const origQuery = navigator.permissions?.query?.bind(navigator.permissions);
|
|
675
|
+
if (origQuery) {
|
|
676
|
+
navigator.permissions.query = function(descriptor) {
|
|
677
|
+
if (descriptor.name === 'notifications') {
|
|
678
|
+
return Promise.resolve({ state: Notification.permission || 'prompt', onchange: null });
|
|
679
|
+
}
|
|
680
|
+
return origQuery(descriptor);
|
|
681
|
+
};
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
// \u2500\u2500 11. Notification constructor \u2500\u2500
|
|
685
|
+
if (!window.Notification) {
|
|
686
|
+
window.Notification = function() {};
|
|
687
|
+
window.Notification.permission = 'default';
|
|
688
|
+
window.Notification.requestPermission = () => Promise.resolve('default');
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
// \u2500\u2500 12. Connection type \u2500\u2500
|
|
692
|
+
if (navigator.connection) {
|
|
693
|
+
Object.defineProperty(navigator.connection, 'rtt', { get: () => 50 });
|
|
694
|
+
}
|
|
695
|
+
})()
|
|
696
|
+
`;
|
|
697
|
+
/**
 * Registers `STEALTH_SCRIPT` on a page through `evaluateOnNewDocument` and
 * waits for the registration to complete.
 *
 * @param {object} page - page object exposing `evaluateOnNewDocument`.
 * @returns {Promise<void>}
 */
async function injectStealth(page) {
  const registration = page.evaluateOnNewDocument(STEALTH_SCRIPT);
  await registration;
}
|
|
700
|
+
var STEALTH_ARGS = [
|
|
701
|
+
"--disable-blink-features=AutomationControlled",
|
|
702
|
+
"--disable-features=IsolateOrigins,site-per-process",
|
|
703
|
+
"--disable-infobars",
|
|
704
|
+
"--window-size=1920,1080"
|
|
705
|
+
];
|
|
706
|
+
|
|
19
707
|
// src/browser/manager.ts
|
|
20
708
|
var BrowserManager = class {
|
|
21
709
|
browser = null;
|
|
22
710
|
config;
|
|
711
|
+
isAttached = false;
|
|
23
712
|
constructor(config = {}) {
|
|
24
713
|
this.config = config;
|
|
25
714
|
}
|
|
26
715
|
async connect() {
|
|
27
716
|
if (this.browser?.connected) return this.browser;
|
|
717
|
+
if (this.config.attach) {
|
|
718
|
+
const wsEndpoint = await resolveAttachTarget(this.config.attach);
|
|
719
|
+
log.info(`Attaching to Chrome: ${wsEndpoint}`);
|
|
720
|
+
this.browser = await puppeteer.connect({ browserWSEndpoint: wsEndpoint });
|
|
721
|
+
this.isAttached = true;
|
|
722
|
+
return this.browser;
|
|
723
|
+
}
|
|
28
724
|
if (this.config.cdpEndpoint) {
|
|
29
725
|
log.debug(`Connecting to CDP endpoint: ${this.config.cdpEndpoint}`);
|
|
30
726
|
this.browser = await puppeteer.connect({
|
|
31
727
|
browserWSEndpoint: this.config.cdpEndpoint
|
|
32
728
|
});
|
|
729
|
+
this.isAttached = true;
|
|
33
730
|
return this.browser;
|
|
34
731
|
}
|
|
35
732
|
const executablePath = this.config.executablePath || findChrome();
|
|
@@ -38,27 +735,48 @@ var BrowserManager = class {
|
|
|
38
735
|
"Chrome/Chromium not found. Set LOBSTER_BROWSER_PATH or config browser.executablePath"
|
|
39
736
|
);
|
|
40
737
|
}
|
|
738
|
+
const args = [
|
|
739
|
+
"--no-sandbox",
|
|
740
|
+
"--disable-setuid-sandbox",
|
|
741
|
+
"--disable-dev-shm-usage",
|
|
742
|
+
"--disable-gpu"
|
|
743
|
+
];
|
|
744
|
+
if (this.config.stealth) {
|
|
745
|
+
args.push(...STEALTH_ARGS);
|
|
746
|
+
}
|
|
747
|
+
let userDataDir;
|
|
748
|
+
if (this.config.profile) {
|
|
749
|
+
userDataDir = getProfileDataDir(this.config.profile);
|
|
750
|
+
log.info(`Using profile "${this.config.profile}" \u2192 ${userDataDir}`);
|
|
751
|
+
}
|
|
41
752
|
log.debug(`Launching Chrome: ${executablePath}`);
|
|
42
753
|
this.browser = await puppeteer.launch({
|
|
43
754
|
executablePath,
|
|
44
755
|
headless: this.config.headless ?? true,
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
"--disable-setuid-sandbox",
|
|
48
|
-
"--disable-dev-shm-usage",
|
|
49
|
-
"--disable-gpu"
|
|
50
|
-
]
|
|
756
|
+
userDataDir,
|
|
757
|
+
args
|
|
51
758
|
});
|
|
759
|
+
this.isAttached = false;
|
|
52
760
|
return this.browser;
|
|
53
761
|
}
|
|
54
762
|
async newPage() {
|
|
55
763
|
const browser = await this.connect();
|
|
56
|
-
|
|
764
|
+
const page = await browser.newPage();
|
|
765
|
+
if (this.config.stealth) {
|
|
766
|
+
await injectStealth(page);
|
|
767
|
+
log.debug("Stealth mode enabled");
|
|
768
|
+
}
|
|
769
|
+
return page;
|
|
57
770
|
}
|
|
58
771
|
async close() {
|
|
59
772
|
if (this.browser) {
|
|
60
|
-
|
|
61
|
-
|
|
773
|
+
if (this.isAttached) {
|
|
774
|
+
this.browser.disconnect();
|
|
775
|
+
log.debug("Disconnected from Chrome (attached mode)");
|
|
776
|
+
} else {
|
|
777
|
+
await this.browser.close().catch(() => {
|
|
778
|
+
});
|
|
779
|
+
}
|
|
62
780
|
this.browser = null;
|
|
63
781
|
}
|
|
64
782
|
}
|
|
@@ -78,7 +796,7 @@ function findChrome() {
|
|
|
78
796
|
"/usr/bin/chromium",
|
|
79
797
|
"/snap/bin/chromium"
|
|
80
798
|
];
|
|
81
|
-
return paths.find((p) =>
|
|
799
|
+
return paths.find((p) => existsSync3(p));
|
|
82
800
|
}
|
|
83
801
|
|
|
84
802
|
// src/browser/dom/flat-tree.ts
|
|
@@ -584,6 +1302,164 @@ var SNAPSHOT_SCRIPT = `
|
|
|
584
1302
|
})()
|
|
585
1303
|
`;
|
|
586
1304
|
|
|
1305
|
+
// src/browser/dom/compact-snapshot.ts
|
|
1306
|
+
/**
 * Page-side script (evaluated as a string in the page) that produces a
 * compact, budget-limited text listing of the visible interactive elements.
 *
 * Output is a newline-joined string:
 *   - first line: `url: <href> | scroll: <pct>%`
 *   - `--- <Landmark> ---` whenever a new landmark tag/role is entered
 *   - `[<idx>] <role> "<name>" val="<value>"` for each interactive element
 *   - `... (<n> more elements)` when the character budget is exhausted
 *
 * Each emitted element is tagged with `data-ref="<idx>"`.
 *
 * The line `const TOKEN_BUDGET = 800;` must stay verbatim:
 * buildCompactSnapshotScript() rewrites the budget by replacing that exact text.
 */
var COMPACT_SNAPSHOT_SCRIPT = `
(() => {
  const TOKEN_BUDGET = 800;
  const CHARS_PER_TOKEN = 4;
  const CHAR_BUDGET = TOKEN_BUDGET * CHARS_PER_TOKEN;

  const INTERACTIVE_TAGS = new Set([
    'a','button','input','select','textarea','details','summary','label',
  ]);
  const INTERACTIVE_ROLES = new Set([
    'button','link','textbox','checkbox','radio','combobox','listbox',
    'menu','menuitem','tab','switch','slider','searchbox','spinbutton',
    'option','menuitemcheckbox','menuitemradio','treeitem',
  ]);
  const LANDMARK_TAGS = new Map([
    ['nav', 'Navigation'],
    ['main', 'Main Content'],
    ['header', 'Header'],
    ['footer', 'Footer'],
    ['aside', 'Sidebar'],
    ['form', 'Form'],
  ]);
  const LANDMARK_ROLES = new Map([
    ['navigation', 'Navigation'],
    ['main', 'Main Content'],
    ['banner', 'Header'],
    ['contentinfo', 'Footer'],
    ['complementary', 'Sidebar'],
    ['search', 'Search'],
    ['dialog', 'Dialog'],
  ]);

  function isVisible(el) {
    if (el.offsetWidth === 0 && el.offsetHeight === 0 && el.tagName !== 'INPUT') return false;
    const s = getComputedStyle(el);
    return s.display !== 'none' && s.visibility !== 'hidden' && s.opacity !== '0';
  }

  function isInteractive(el) {
    const tag = el.tagName.toLowerCase();
    if (INTERACTIVE_TAGS.has(tag)) {
      if (el.disabled) return false;
      if (tag === 'input' && el.type === 'hidden') return false;
      return true;
    }
    const role = el.getAttribute('role');
    if (role && INTERACTIVE_ROLES.has(role)) return true;
    if (el.contentEditable === 'true') return true;
    if (el.tabIndex >= 0 && el.getAttribute('tabindex') !== null) return true;
    return false;
  }

  function getRole(el) {
    const role = el.getAttribute('role');
    if (role) return role;
    const tag = el.tagName.toLowerCase();
    if (tag === 'a') return 'link';
    if (tag === 'button' || tag === 'summary') return 'button';
    if (tag === 'input') return el.type || 'text';
    if (tag === 'select') return 'select';
    if (tag === 'textarea') return 'textarea';
    if (tag === 'label') return 'label';
    return tag;
  }

  function getLabelText(el) {
    if (!el.id) return '';
    // Escape the id before building the selector: an id containing a quote or
    // a backslash would otherwise make querySelector throw and abort the
    // whole snapshot.
    const label = document.querySelector('label[for="' + CSS.escape(el.id) + '"]');
    return label?.textContent?.trim() || '';
  }

  function getName(el) {
    return (
      el.getAttribute('aria-label') ||
      el.getAttribute('alt') ||
      el.getAttribute('title') ||
      el.getAttribute('placeholder') ||
      (el.tagName === 'INPUT' && (el.type === 'submit' || el.type === 'button') ? el.value : '') ||
      getLabelText(el) ||
      (el.children.length <= 2 ? el.textContent?.trim() : '') ||
      ''
    ).slice(0, 60);
  }

  function getValue(el) {
    const tag = el.tagName.toLowerCase();
    if (tag === 'input') {
      const type = el.type || 'text';
      if (type === 'checkbox' || type === 'radio') return el.checked ? 'checked' : 'unchecked';
      if (type === 'password') return el.value ? '****' : '';
      return el.value ? el.value.slice(0, 30) : '';
    }
    if (tag === 'textarea') return el.value ? el.value.slice(0, 30) : '';
    if (tag === 'select' && el.selectedOptions?.length) return el.selectedOptions[0].text.slice(0, 30);
    return '';
  }

  // Collect elements
  let idx = 0;
  let charsUsed = 0;
  const lines = [];
  let lastLandmark = '';

  // Page header
  const scrollY = window.scrollY;
  const scrollMax = document.documentElement.scrollHeight - window.innerHeight;
  const scrollPct = scrollMax > 0 ? Math.round((scrollY / scrollMax) * 100) : 0;
  const header = 'url: ' + location.href + ' | scroll: ' + scrollPct + '%';
  lines.push(header);
  charsUsed += header.length;

  // Walk DOM
  const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
  let node;
  while ((node = walker.nextNode())) {
    if (!isVisible(node)) continue;

    const tag = node.tagName.toLowerCase();
    if (['script','style','noscript','svg','path','meta','link','head','template'].includes(tag)) continue;

    // Check for landmark
    const role = node.getAttribute('role');
    const landmark = LANDMARK_TAGS.get(tag) || (role ? LANDMARK_ROLES.get(role) : null);
    if (landmark && landmark !== lastLandmark) {
      const sectionLine = '--- ' + landmark + ' ---';
      if (charsUsed + sectionLine.length > CHAR_BUDGET) break;
      lines.push(sectionLine);
      charsUsed += sectionLine.length;
      lastLandmark = landmark;
    }

    // Only emit interactive elements
    if (!isInteractive(node)) continue;

    const elRole = getRole(node);
    const name = getName(node);
    const value = getValue(node);

    // Build compact line
    let line = '[' + idx + '] ' + elRole;
    if (name) line += ' "' + name.replace(/"/g, "'") + '"';
    if (value) line += ' val="' + value.replace(/"/g, "'") + '"';

    // Check token budget
    if (charsUsed + line.length > CHAR_BUDGET) {
      // The remaining count is only an estimate from a selector match; it can
      // be smaller than idx, so clamp it instead of printing a negative number.
      const estimate = document.querySelectorAll('a,button,input,select,textarea,[role]').length - idx;
      lines.push('... (' + Math.max(0, estimate) + ' more elements)');
      break;
    }

    // Annotate element with ref for clicking
    try { node.dataset.ref = String(idx); } catch {}

    lines.push(line);
    charsUsed += line.length;
    idx++;
  }

  return lines.join('\\n');
})()
`;
|
|
1459
|
+
/**
 * Returns the compact snapshot script with a caller-chosen token budget.
 *
 * The budget is substituted into the script source by replacing the exact
 * text `const TOKEN_BUDGET = 800;`.
 *
 * @param {number|string} [tokenBudget=800] - Budget in tokens. Numeric
 *   strings are accepted; anything that is not a number falls back to 800.
 * @returns {string} Script source ready to be evaluated in the page.
 */
function buildCompactSnapshotScript(tokenBudget = 800) {
  const parsed = Number(tokenBudget);
  // Only a genuine number may reach the script source: any other value would
  // be spliced in as code and could break or hijack the evaluated script.
  const budget = Number.isNaN(parsed) ? 800 : parsed;
  return COMPACT_SNAPSHOT_SCRIPT.replace("const TOKEN_BUDGET = 800;", `const TOKEN_BUDGET = ${budget};`);
}
|
|
1462
|
+
|
|
587
1463
|
// src/browser/dom/semantic-tree.ts
|
|
588
1464
|
var SEMANTIC_TREE_SCRIPT = `
|
|
589
1465
|
(() => {
|
|
@@ -1109,6 +1985,64 @@ var FORM_STATE_SCRIPT = `
|
|
|
1109
1985
|
})()
|
|
1110
1986
|
`;
|
|
1111
1987
|
|
|
1988
|
+
// src/browser/dom/interactive.ts
|
|
1989
|
+
var INTERACTIVE_ELEMENTS_SCRIPT = `
|
|
1990
|
+
(() => {
|
|
1991
|
+
const results = [];
|
|
1992
|
+
|
|
1993
|
+
function classify(el) {
|
|
1994
|
+
const tag = el.tagName.toLowerCase();
|
|
1995
|
+
const role = el.getAttribute('role');
|
|
1996
|
+
const types = [];
|
|
1997
|
+
|
|
1998
|
+
// Native interactive
|
|
1999
|
+
if (['a', 'button', 'input', 'select', 'textarea', 'details', 'summary'].includes(tag)) {
|
|
2000
|
+
types.push('native');
|
|
2001
|
+
}
|
|
2002
|
+
|
|
2003
|
+
// ARIA role interactive
|
|
2004
|
+
if (role && ['button', 'link', 'textbox', 'checkbox', 'radio', 'combobox', 'tab', 'switch', 'menuitem', 'slider'].includes(role)) {
|
|
2005
|
+
types.push('aria');
|
|
2006
|
+
}
|
|
2007
|
+
|
|
2008
|
+
// Contenteditable
|
|
2009
|
+
if (el.contentEditable === 'true') types.push('contenteditable');
|
|
2010
|
+
|
|
2011
|
+
// Focusable
|
|
2012
|
+
if (el.tabIndex >= 0 && el.getAttribute('tabindex') !== null) types.push('focusable');
|
|
2013
|
+
|
|
2014
|
+
// Has click listener (approximate)
|
|
2015
|
+
if (el.onclick) types.push('listener');
|
|
2016
|
+
|
|
2017
|
+
return types;
|
|
2018
|
+
}
|
|
2019
|
+
|
|
2020
|
+
let idx = 0;
|
|
2021
|
+
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
|
|
2022
|
+
let node;
|
|
2023
|
+
while (node = walker.nextNode()) {
|
|
2024
|
+
const types = classify(node);
|
|
2025
|
+
if (types.length === 0) continue;
|
|
2026
|
+
|
|
2027
|
+
const style = getComputedStyle(node);
|
|
2028
|
+
if (style.display === 'none' || style.visibility === 'hidden') continue;
|
|
2029
|
+
|
|
2030
|
+
const rect = node.getBoundingClientRect();
|
|
2031
|
+
results.push({
|
|
2032
|
+
index: idx++,
|
|
2033
|
+
tag: node.tagName.toLowerCase(),
|
|
2034
|
+
role: node.getAttribute('role') || '',
|
|
2035
|
+
text: (node.textContent || '').trim().slice(0, 100),
|
|
2036
|
+
types,
|
|
2037
|
+
ariaLabel: node.getAttribute('aria-label') || '',
|
|
2038
|
+
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
|
|
2039
|
+
});
|
|
2040
|
+
}
|
|
2041
|
+
|
|
2042
|
+
return results;
|
|
2043
|
+
})()
|
|
2044
|
+
`;
|
|
2045
|
+
|
|
1112
2046
|
// src/browser/interceptor.ts
|
|
1113
2047
|
function buildInterceptorScript(pattern) {
|
|
1114
2048
|
return `
|
|
@@ -1165,6 +2099,155 @@ var GET_INTERCEPTED_SCRIPT = `
|
|
|
1165
2099
|
})()
|
|
1166
2100
|
`;
|
|
1167
2101
|
|
|
2102
|
+
// src/browser/semantic-find.ts
|
|
2103
|
+
// Synonym table used by expandSynonyms(): each key is a token and its value
// lists the extra words added to a token set when that key is present.
// Lookups are one-directional (key -> values); a word that appears only as a
// value is not expanded back to its key.
// NOTE(review): tokenize() splits on hyphens, so hyphenated values such as
// "sign-in" or "e-mail" can never equal a token it produces — confirm whether
// these entries are intended.
var SYNONYMS = {
  btn: ["button"],
  button: ["btn", "submit", "click"],
  submit: ["go", "send", "ok", "confirm", "done", "button"],
  search: ["find", "lookup", "query", "filter"],
  login: ["signin", "sign-in", "log-in", "authenticate"],
  signup: ["register", "create-account", "sign-up", "join"],
  logout: ["signout", "sign-out", "log-out"],
  close: ["dismiss", "x", "cancel", "exit"],
  menu: ["nav", "navigation", "hamburger", "sidebar"],
  nav: ["navigation", "menu", "navbar"],
  input: ["field", "textbox", "text", "entry"],
  email: ["mail", "e-mail"],
  password: ["pass", "pwd", "secret"],
  next: ["continue", "forward", "proceed"],
  back: ["previous", "return", "go-back"],
  save: ["store", "keep", "persist"],
  delete: ["remove", "trash", "discard", "destroy"],
  edit: ["modify", "change", "update"],
  add: ["create", "new", "plus", "insert"],
  settings: ["preferences", "config", "options", "gear"],
  profile: ["account", "user", "avatar"],
  home: ["main", "dashboard", "start"],
  link: ["anchor", "href", "url"],
  select: ["dropdown", "combo", "picker", "choose"],
  checkbox: ["check", "toggle", "tick"],
  upload: ["attach", "file", "browse"],
  download: ["save", "export"]
};
|
|
2132
|
+
// Query words that name a kind of control. roleBoost() grants its bonus only
// for query tokens found in this set that also occur inside the element's
// role (or tag) string.
var ROLE_KEYWORDS = /* @__PURE__ */ new Set([
  "button",
  "link",
  "input",
  "textbox",
  "checkbox",
  "radio",
  "select",
  "dropdown",
  "tab",
  "menu",
  "menuitem",
  "switch",
  "slider",
  "combobox",
  "searchbox",
  "option"
]);
|
|
2150
|
+
/**
 * Splits text into lowercase word tokens.
 *
 * Every character that is not a letter, combining mark, digit, whitespace or
 * hyphen becomes a separator; the result is then split on runs of whitespace
 * and hyphens. Letters and digits are matched with Unicode properties, so
 * non-ASCII words (for example accented or Cyrillic text) survive intact.
 * ASCII input produces the same tokens as an `a-z0-9` filter would.
 *
 * @param {string|null|undefined} text - Text to tokenize; null/undefined is
 *   treated as empty.
 * @returns {string[]} Tokens in order of appearance, never containing "".
 */
function tokenize(text) {
  return String(text ?? "")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s-]/gu, " ")
    .split(/[\s-]+/)
    .filter((token) => token.length > 0);
}
|
|
2153
|
+
/**
 * Builds the set of tokens plus their direct synonyms from SYNONYMS.
 *
 * Expansion is a single level: synonyms of synonyms are not followed.
 *
 * @param {Iterable<string>} tokens - Tokens to expand.
 * @returns {Set<string>} The original tokens followed by any synonyms.
 */
function expandSynonyms(tokens) {
  const expanded = new Set(tokens);
  for (const token of tokens) {
    // Own-property check: a token such as "constructor" must not resolve to
    // an inherited Object.prototype member, which is not iterable and would
    // make the loop below throw.
    if (!Object.hasOwn(SYNONYMS, token)) continue;
    for (const synonym of SYNONYMS[token]) expanded.add(synonym);
  }
  return expanded;
}
|
|
2163
|
+
/**
 * Counts how many times each token occurs.
 *
 * @param {Iterable<string>} tokens - Tokens to count.
 * @returns {Map<string, number>} Token -> occurrence count, keyed in order of
 *   first appearance.
 */
function freqMap(tokens) {
  return [...tokens].reduce(
    (counts, token) => counts.set(token, (counts.get(token) ?? 0) + 1),
    new Map()
  );
}
|
|
2170
|
+
/**
 * Jaccard similarity over token counts.
 *
 * For every distinct token the smaller of its two counts goes to the
 * intersection and the larger to the union; the score is their ratio.
 *
 * @param {Iterable<string>} queryTokens - Tokens from the query.
 * @param {Iterable<string>} descTokens - Tokens describing the element.
 * @returns {number} Value in [0, 1]; 0 when both inputs are empty.
 */
function jaccardScore(queryTokens, descTokens) {
  const queryCounts = freqMap(queryTokens);
  const descCounts = freqMap(descTokens);
  let shared = 0;
  let total = 0;
  for (const [token, inQuery] of queryCounts) {
    const inDesc = descCounts.get(token) ?? 0;
    shared += Math.min(inQuery, inDesc);
    total += Math.max(inQuery, inDesc);
  }
  // Tokens seen only in the description add to the union alone.
  for (const [token, inDesc] of descCounts) {
    if (!queryCounts.has(token)) total += inDesc;
  }
  return total === 0 ? 0 : shared / total;
}
|
|
2184
|
+
/**
 * Partial credit for query tokens that share a prefix with a description
 * token (either one may be the prefix of the other).
 *
 * Each matching query token contributes 0.5; the sum is divided by the number
 * of query tokens and capped at 0.3. Tokens shorter than three characters are
 * ignored on BOTH sides, so a stray description token such as "a" or "in"
 * cannot match every query word that happens to start with it.
 *
 * @param {string[]} queryTokens - Tokens from the query.
 * @param {string[]} descTokens - Tokens describing the element.
 * @returns {number} Value in [0, 0.3].
 */
function prefixScore(queryTokens, descTokens) {
  if (queryTokens.length === 0 || descTokens.length === 0) return 0;
  const minPrefixLength = 3;
  let matches = 0;
  for (const qt of queryTokens) {
    if (qt.length < minPrefixLength) continue;
    const hasPrefixMatch = descTokens.some(
      (dt) => dt.length >= minPrefixLength && (dt.startsWith(qt) || qt.startsWith(dt))
    );
    if (hasPrefixMatch) matches += 0.5;
  }
  return Math.min(matches / queryTokens.length, 0.3);
}
|
|
2198
|
+
/**
 * Bonus for queries that name the element's kind of control.
 *
 * @param {string[]} queryTokens - Tokens from the query.
 * @param {string} elementRole - The element's role (or tag) string.
 * @returns {number} 0.2 when some query token is in ROLE_KEYWORDS and occurs
 *   inside the lowercased role string, otherwise 0.
 */
function roleBoost(queryTokens, elementRole) {
  const normalizedRole = elementRole.toLowerCase();
  const mentionsRole = queryTokens.some(
    (token) => ROLE_KEYWORDS.has(token) && normalizedRole.includes(token)
  );
  return mentionsRole ? 0.2 : 0;
}
|
|
2207
|
+
/**
 * Scores how well one element matches a query.
 *
 * The element is described by its text, role, tag and aria-label joined
 * together. The score is the sum of the Jaccard similarity of the
 * synonym-expanded token sets, the prefix score, the role boost, and a 0.3
 * bonus when the joined query tokens occur inside the joined description
 * tokens; the sum is capped at 1.
 *
 * @param {string[]} queryTokens - Tokens from the query.
 * @param {Set<string>} queryExpanded - Query tokens plus synonyms.
 * @param {{text?: string, role?: string, tag?: string, ariaLabel?: string}} element
 *   Element record to score.
 * @returns {number} Value in [0, 1]; 0 when the element yields no tokens.
 */
function scoreElement(queryTokens, queryExpanded, element) {
  const { text, role, tag, ariaLabel } = element;
  const description = [text, role, tag, ariaLabel].filter(Boolean).join(" ");
  const descTokens = tokenize(description);
  if (descTokens.length === 0) return 0;

  const jaccard = jaccardScore([...queryExpanded], [...expandSynonyms(descTokens)]);
  const prefix = prefixScore(queryTokens, descTokens);
  const roleBonus = roleBoost(queryTokens, role || tag);
  const containsQuery = descTokens.join(" ").includes(queryTokens.join(" "));
  const exactBonus = containsQuery ? 0.3 : 0;
  return Math.min(jaccard + prefix + roleBonus + exactBonus, 1);
}
|
|
2228
|
+
/**
 * Ranks elements against a free-text query.
 *
 * @param {Iterable<object>} elements - Element records with `index`, `tag`,
 *   `role`, `text` and `ariaLabel` fields.
 * @param {string} query - Free-text description of the wanted element.
 * @param {{maxResults?: number, minScore?: number}} [options] - `maxResults`
 *   defaults to 5 and `minScore` to 0.3.
 * @returns {Array<{ref: number, score: number, text: string, role: string, tag: string}>}
 *   Matches scoring at least `minScore`, best first, at most `maxResults`
 *   long. Scores are rounded to two decimals and `text` is cut to 60
 *   characters. Empty when the query yields no tokens.
 */
function semanticFind(elements, query, options) {
  const limit = options?.maxResults ?? 5;
  const threshold = options?.minScore ?? 0.3;
  const queryTokens = tokenize(query);
  if (queryTokens.length === 0) return [];
  const queryExpanded = expandSynonyms(queryTokens);

  return Array.from(elements, (el) => ({
    el,
    rawScore: scoreElement(queryTokens, queryExpanded, el),
  }))
    .filter(({ rawScore }) => rawScore >= threshold)
    .map(({ el, rawScore }) => ({
      ref: el.index,
      score: Math.round(rawScore * 100) / 100,
      text: (el.text || el.ariaLabel || "").slice(0, 60),
      role: el.role || el.tag,
      tag: el.tag,
    }))
    .sort((left, right) => right.score - left.score)
    .slice(0, limit);
}
|
|
2250
|
+
|
|
1168
2251
|
// src/browser/page-adapter.ts
|
|
1169
2252
|
var PuppeteerPage = class {
|
|
1170
2253
|
page;
|
|
@@ -1192,7 +2275,10 @@ var PuppeteerPage = class {
|
|
|
1192
2275
|
async evaluate(js) {
|
|
1193
2276
|
return this.page.evaluate(js);
|
|
1194
2277
|
}
|
|
1195
|
-
async snapshot(
|
|
2278
|
+
async snapshot(opts) {
|
|
2279
|
+
if (opts?.compact) {
|
|
2280
|
+
return this.page.evaluate(COMPACT_SNAPSHOT_SCRIPT);
|
|
2281
|
+
}
|
|
1196
2282
|
return this.page.evaluate(SNAPSHOT_SCRIPT);
|
|
1197
2283
|
}
|
|
1198
2284
|
async semanticTree(_opts) {
|
|
@@ -1464,69 +2550,15 @@ var PuppeteerPage = class {
|
|
|
1464
2550
|
active: p === this.page
|
|
1465
2551
|
}));
|
|
1466
2552
|
}
|
|
2553
|
+
async find(query, options) {
|
|
2554
|
+
const elements = await this.page.evaluate(INTERACTIVE_ELEMENTS_SCRIPT);
|
|
2555
|
+
return semanticFind(elements, query, options);
|
|
2556
|
+
}
|
|
1467
2557
|
async close() {
|
|
1468
2558
|
await this.page.close();
|
|
1469
2559
|
}
|
|
1470
2560
|
};
|
|
1471
2561
|
|
|
1472
|
-
// src/browser/dom/interactive.ts
|
|
1473
|
-
var INTERACTIVE_ELEMENTS_SCRIPT = `
|
|
1474
|
-
(() => {
|
|
1475
|
-
const results = [];
|
|
1476
|
-
|
|
1477
|
-
function classify(el) {
|
|
1478
|
-
const tag = el.tagName.toLowerCase();
|
|
1479
|
-
const role = el.getAttribute('role');
|
|
1480
|
-
const types = [];
|
|
1481
|
-
|
|
1482
|
-
// Native interactive
|
|
1483
|
-
if (['a', 'button', 'input', 'select', 'textarea', 'details', 'summary'].includes(tag)) {
|
|
1484
|
-
types.push('native');
|
|
1485
|
-
}
|
|
1486
|
-
|
|
1487
|
-
// ARIA role interactive
|
|
1488
|
-
if (role && ['button', 'link', 'textbox', 'checkbox', 'radio', 'combobox', 'tab', 'switch', 'menuitem', 'slider'].includes(role)) {
|
|
1489
|
-
types.push('aria');
|
|
1490
|
-
}
|
|
1491
|
-
|
|
1492
|
-
// Contenteditable
|
|
1493
|
-
if (el.contentEditable === 'true') types.push('contenteditable');
|
|
1494
|
-
|
|
1495
|
-
// Focusable
|
|
1496
|
-
if (el.tabIndex >= 0 && el.getAttribute('tabindex') !== null) types.push('focusable');
|
|
1497
|
-
|
|
1498
|
-
// Has click listener (approximate)
|
|
1499
|
-
if (el.onclick) types.push('listener');
|
|
1500
|
-
|
|
1501
|
-
return types;
|
|
1502
|
-
}
|
|
1503
|
-
|
|
1504
|
-
let idx = 0;
|
|
1505
|
-
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
|
|
1506
|
-
let node;
|
|
1507
|
-
while (node = walker.nextNode()) {
|
|
1508
|
-
const types = classify(node);
|
|
1509
|
-
if (types.length === 0) continue;
|
|
1510
|
-
|
|
1511
|
-
const style = getComputedStyle(node);
|
|
1512
|
-
if (style.display === 'none' || style.visibility === 'hidden') continue;
|
|
1513
|
-
|
|
1514
|
-
const rect = node.getBoundingClientRect();
|
|
1515
|
-
results.push({
|
|
1516
|
-
index: idx++,
|
|
1517
|
-
tag: node.tagName.toLowerCase(),
|
|
1518
|
-
role: node.getAttribute('role') || '',
|
|
1519
|
-
text: (node.textContent || '').trim().slice(0, 100),
|
|
1520
|
-
types,
|
|
1521
|
-
ariaLabel: node.getAttribute('aria-label') || '',
|
|
1522
|
-
rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
|
|
1523
|
-
});
|
|
1524
|
-
}
|
|
1525
|
-
|
|
1526
|
-
return results;
|
|
1527
|
-
})()
|
|
1528
|
-
`;
|
|
1529
|
-
|
|
1530
2562
|
// src/browser/wait.ts
|
|
1531
2563
|
async function waitForCondition(page, condition, timeout = 3e4) {
|
|
1532
2564
|
switch (condition) {
|
|
@@ -1958,6 +2990,43 @@ async function lobsterFetch(url, options) {
|
|
|
1958
2990
|
const timeout = options?.timeout || 3e4;
|
|
1959
2991
|
const dump = options?.dump || "markdown";
|
|
1960
2992
|
const start = Date.now();
|
|
2993
|
+
const { isPdfUrl: isPdfUrl2, isPdfResponse: isPdfResponse2, extractPdf: extractPdf2 } = await Promise.resolve().then(() => (init_pdf(), pdf_exports));
|
|
2994
|
+
if (isPdfUrl2(url)) {
|
|
2995
|
+
const pdfResult = await extractPdf2(url);
|
|
2996
|
+
const duration2 = Date.now() - start;
|
|
2997
|
+
let content2;
|
|
2998
|
+
switch (dump) {
|
|
2999
|
+
case "markdown":
|
|
3000
|
+
content2 = pdfResult.markdown;
|
|
3001
|
+
break;
|
|
3002
|
+
case "text":
|
|
3003
|
+
content2 = pdfResult.text;
|
|
3004
|
+
break;
|
|
3005
|
+
case "html":
|
|
3006
|
+
content2 = `<pre>${pdfResult.text}</pre>`;
|
|
3007
|
+
break;
|
|
3008
|
+
case "snapshot":
|
|
3009
|
+
content2 = `[PDF] ${pdfResult.metadata.title} (${pdfResult.metadata.pages} pages, ${pdfResult.wordCount} words)
|
|
3010
|
+
|
|
3011
|
+
${pdfResult.text.slice(0, 5e3)}`;
|
|
3012
|
+
break;
|
|
3013
|
+
case "links":
|
|
3014
|
+
content2 = "";
|
|
3015
|
+
break;
|
|
3016
|
+
default:
|
|
3017
|
+
content2 = pdfResult.markdown;
|
|
3018
|
+
}
|
|
3019
|
+
return {
|
|
3020
|
+
url,
|
|
3021
|
+
finalUrl: url,
|
|
3022
|
+
title: pdfResult.metadata.title,
|
|
3023
|
+
content: content2,
|
|
3024
|
+
links: [],
|
|
3025
|
+
engine: "pdf",
|
|
3026
|
+
duration: duration2,
|
|
3027
|
+
statusCode: 200
|
|
3028
|
+
};
|
|
3029
|
+
}
|
|
1961
3030
|
const resp = await fetch(url, {
|
|
1962
3031
|
headers: {
|
|
1963
3032
|
"User-Agent": "LobsterCLI/0.1 (+https://github.com/iexcalibur/lobster-cli)",
|
|
@@ -1971,6 +3040,48 @@ async function lobsterFetch(url, options) {
|
|
|
1971
3040
|
if (!resp.ok) {
|
|
1972
3041
|
throw new Error(`HTTP ${resp.status} ${resp.statusText}`);
|
|
1973
3042
|
}
|
|
3043
|
+
const contentType = resp.headers.get("content-type") || "";
|
|
3044
|
+
if (isPdfResponse2(contentType)) {
|
|
3045
|
+
const arrayBuffer = await resp.arrayBuffer();
|
|
3046
|
+
const buffer = Buffer.from(arrayBuffer);
|
|
3047
|
+
const pdfMod = await import("pdf-parse");
|
|
3048
|
+
const pdfParseFn2 = pdfMod.PDFParse || pdfMod.default || pdfMod;
|
|
3049
|
+
const pdfResult = await pdfParseFn2(buffer);
|
|
3050
|
+
const info = pdfResult.info || {};
|
|
3051
|
+
const metadata = {
|
|
3052
|
+
title: info.Title || "untitled",
|
|
3053
|
+
author: info.Author || "",
|
|
3054
|
+
pages: pdfResult.numpages
|
|
3055
|
+
};
|
|
3056
|
+
const duration2 = Date.now() - start;
|
|
3057
|
+
const text = pdfResult.text || "";
|
|
3058
|
+
let content2;
|
|
3059
|
+
switch (dump) {
|
|
3060
|
+
case "text":
|
|
3061
|
+
content2 = text;
|
|
3062
|
+
break;
|
|
3063
|
+
case "html":
|
|
3064
|
+
content2 = `<pre>${text}</pre>`;
|
|
3065
|
+
break;
|
|
3066
|
+
case "snapshot":
|
|
3067
|
+
content2 = `[PDF] ${metadata.title} (${metadata.pages} pages)
|
|
3068
|
+
|
|
3069
|
+
${text.slice(0, 5e3)}`;
|
|
3070
|
+
break;
|
|
3071
|
+
default:
|
|
3072
|
+
content2 = text;
|
|
3073
|
+
}
|
|
3074
|
+
return {
|
|
3075
|
+
url,
|
|
3076
|
+
finalUrl: resp.url || url,
|
|
3077
|
+
title: metadata.title,
|
|
3078
|
+
content: content2,
|
|
3079
|
+
links: [],
|
|
3080
|
+
engine: "pdf",
|
|
3081
|
+
duration: duration2,
|
|
3082
|
+
statusCode: 200
|
|
3083
|
+
};
|
|
3084
|
+
}
|
|
1974
3085
|
const html = await resp.text();
|
|
1975
3086
|
const duration = Date.now() - start;
|
|
1976
3087
|
const finalUrl = resp.url || url;
|
|
@@ -2010,8 +3121,12 @@ async function lobsterFetch(url, options) {
|
|
|
2010
3121
|
}
|
|
2011
3122
|
return { url, finalUrl, status: resp.status, title, content, links, duration };
|
|
2012
3123
|
}
|
|
3124
|
+
|
|
3125
|
+
// src/browser/index.ts
|
|
3126
|
+
init_pdf();
|
|
2013
3127
|
export {
|
|
2014
3128
|
BrowserManager,
|
|
3129
|
+
COMPACT_SNAPSHOT_SCRIPT,
|
|
2015
3130
|
FLAT_TREE_SCRIPT,
|
|
2016
3131
|
FORM_STATE_SCRIPT,
|
|
2017
3132
|
GET_INTERCEPTED_SCRIPT,
|
|
@@ -2020,15 +3135,31 @@ export {
|
|
|
2020
3135
|
PuppeteerPage,
|
|
2021
3136
|
SEMANTIC_TREE_SCRIPT,
|
|
2022
3137
|
SNAPSHOT_SCRIPT,
|
|
3138
|
+
STEALTH_ARGS,
|
|
3139
|
+
STEALTH_SCRIPT,
|
|
3140
|
+
buildCompactSnapshotScript,
|
|
2023
3141
|
buildInterceptorScript,
|
|
2024
3142
|
buildSnapshotScript,
|
|
3143
|
+
createProfile,
|
|
3144
|
+
discoverChrome,
|
|
2025
3145
|
extractLinks,
|
|
2026
3146
|
extractMarkdown,
|
|
3147
|
+
extractPdf,
|
|
2027
3148
|
extractSnapshot,
|
|
2028
3149
|
extractText,
|
|
2029
3150
|
flatTreeToString,
|
|
3151
|
+
getProfileDataDir,
|
|
3152
|
+
injectStealth,
|
|
3153
|
+
isPdfResponse,
|
|
3154
|
+
isPdfUrl,
|
|
3155
|
+
listProfiles,
|
|
2030
3156
|
lobsterFetch,
|
|
2031
3157
|
parseHtml,
|
|
3158
|
+
removeProfile,
|
|
3159
|
+
resetProfileCache,
|
|
3160
|
+
resolveAttachTarget,
|
|
3161
|
+
semanticFind,
|
|
3162
|
+
tryExtractPdf,
|
|
2032
3163
|
waitForCondition
|
|
2033
3164
|
};
|
|
2034
3165
|
//# sourceMappingURL=index.js.map
|