lobster-cli 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/agent/core.js +63 -0
- package/dist/agent/core.js.map +1 -1
- package/dist/agent/index.js +63 -0
- package/dist/agent/index.js.map +1 -1
- package/dist/browser/index.js +294 -0
- package/dist/browser/index.js.map +1 -1
- package/dist/doc/index.js +31715 -0
- package/dist/doc/index.js.map +1 -0
- package/dist/index.js +32070 -221
- package/dist/index.js.map +1 -1
- package/dist/lib.js +352 -2
- package/dist/lib.js.map +1 -1
- package/dist/llm/client.js +63 -0
- package/dist/llm/client.js.map +1 -1
- package/dist/llm/index.js +63 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/openai-client.js +63 -0
- package/dist/llm/openai-client.js.map +1 -1
- package/dist/router/index.js +63 -0
- package/dist/router/index.js.map +1 -1
- package/package.json +15 -2
package/dist/browser/index.js
CHANGED
|
@@ -1,3 +1,211 @@
|
|
|
1
|
+
// Bundler runtime helpers. The first two are short aliases for the Object
// built-ins that the helpers below rely on.
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
3
|
+
// __esm(fn): wraps a lazily-run module initializer. `fn` is an object whose
// first own property is the initializer function. The returned __init() calls
// that function only while `fn` is still truthy; `fn` is overwritten with 0 as
// part of that first call, so later calls skip it and just return the cached
// result `res`.
var __esm = (fn, res) => function __init() {
|
|
4
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
5
|
+
};
|
|
6
|
+
// __export(target, all): for every enumerable key of `all`, defines an
// enumerable accessor on `target` whose getter is all[name]. Values are
// therefore resolved at access time, not at definition time.
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
// src/browser/pdf.ts
|
|
12
|
+
// Namespace object for the PDF module. __export installs one getter per key,
// each returning the same-named function declared further down, so the
// object can be populated before those functions are referenced.
var pdf_exports = {};
|
|
13
|
+
__export(pdf_exports, {
|
|
14
|
+
extractPdf: () => extractPdf,
|
|
15
|
+
isPdfResponse: () => isPdfResponse,
|
|
16
|
+
isPdfUrl: () => isPdfUrl,
|
|
17
|
+
tryExtractPdf: () => tryExtractPdf
|
|
18
|
+
});
|
|
19
|
+
import { readFileSync as readFileSync3 } from "fs";
|
|
20
|
+
/**
 * Resolve (and memoize in `pdfParseFn`) the function used to parse a PDF buffer.
 *
 * The "pdf-parse" module is imported on first use. When it exposes a
 * `PDFParse` function/class, a small adapter is cached that instantiates it
 * with the buffer and awaits `parse()` if the instance has one (otherwise the
 * instance itself is returned). Without `PDFParse`, the module's default
 * export — or the module namespace itself — is cached as the parser.
 *
 * NOTE(review): which of these shapes applies depends on the installed
 * pdf-parse version — confirm against the pinned dependency.
 *
 * @returns {Promise<Function>} the cached parser.
 */
async function getPdfParser() {
  if (pdfParseFn) {
    return pdfParseFn;
  }

  const pdfModule = await import("pdf-parse");
  const ParserCtor = pdfModule.PDFParse;

  if (typeof ParserCtor === "function") {
    pdfParseFn = async (buffer) => {
      const instance = new ParserCtor(buffer);
      if (!instance.parse) {
        return instance;
      }
      return await instance.parse();
    };
  } else {
    pdfParseFn = pdfModule.default || pdfModule;
  }

  return pdfParseFn;
}
|
|
35
|
+
/**
 * Heuristically decide whether a URL or file path refers to a PDF, using only
 * the string itself (no network access).
 *
 * A value is treated as a PDF when, case-insensitively, any of these hold:
 *   - it ends in ".pdf", either as written or once a trailing query string /
 *     fragment is removed (e.g. "paper.pdf?download=1", "paper.pdf#page=2");
 *   - it contains a "/pdf/" path segment (this also covers arxiv.org/pdf/...);
 *   - its query string contains "format=pdf" or "type=pdf".
 *
 * @param {string} urlOrPath - URL or local path to inspect.
 * @returns {boolean} true when the string looks like a PDF location; false
 *   otherwise, including for non-string or empty input.
 */
function isPdfUrl(urlOrPath) {
  if (typeof urlOrPath !== "string" || urlOrPath === "") {
    return false;
  }
  const lower = urlOrPath.toLowerCase();

  // Check the raw string first so local paths containing "#" or "?" still
  // match, then retry with the query/fragment stripped for real URLs.
  const withoutQueryOrHash = lower.split(/[?#]/, 1)[0];
  if (lower.endsWith(".pdf") || withoutQueryOrHash.endsWith(".pdf")) {
    return true;
  }
  if (/\/pdf\//.test(lower)) {
    return true;
  }
  return /[?&](?:format|type)=pdf/.test(lower);
}
|
|
44
|
+
/**
 * Report whether a Content-Type header value denotes a PDF document.
 *
 * The comparison is case-insensitive because media types are, and any
 * parameters after the type (e.g. "; charset=binary") are ignored by virtue of
 * the substring match.
 *
 * @param {string} contentType - raw Content-Type header value.
 * @returns {boolean} true when the value contains "application/pdf"; false
 *   otherwise, including when the header is missing (null/undefined).
 */
function isPdfResponse(contentType) {
  if (typeof contentType !== "string") {
    return false;
  }
  return contentType.toLowerCase().includes("application/pdf");
}
|
|
47
|
+
/**
 * Download a remote document and return its raw bytes.
 *
 * Redirects are followed. The response's Content-Type is intentionally not
 * inspected here: the previous check had an empty body and no effect, so it
 * was removed. extractPdf() validates the returned bytes against the PDF
 * magic number before parsing.
 *
 * @param {string} url - absolute http(s) URL to fetch.
 * @returns {Promise<Buffer>} the full response body.
 * @throws {Error} "Failed to download PDF: <status> <statusText>" when the
 *   server answers with a non-2xx status; network errors from fetch propagate.
 */
async function downloadPdf(url) {
  const response = await fetch(url, {
    headers: {
      // Browser-like User-Agent; presumably some hosts refuse unknown clients.
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "application/pdf,*/*"
    },
    redirect: "follow"
  });

  if (!response.ok) {
    throw new Error(`Failed to download PDF: ${response.status} ${response.statusText}`);
  }

  return Buffer.from(await response.arrayBuffer());
}
|
|
64
|
+
/**
 * Synchronously load a file from disk and hand back its bytes.
 *
 * @param {string} filePath - path of the file to read.
 * @returns {Buffer} the file contents; no encoding is applied.
 */
function readLocalPdf(filePath) {
  const fileBytes = readFileSync3(filePath);
  return fileBytes;
}
|
|
67
|
+
/**
 * Convert plain text extracted from a PDF into lightweight Markdown.
 *
 * Each trimmed input line is classified in this order:
 *   1. numbered heading ("2.1 Related Work")   -> "##" / "###" / "####" by depth
 *   2. short ALL-CAPS line                      -> "## Sentence case"
 *   3. well-known section name (Abstract, ...)  -> "## <line>"; References /
 *      Bibliography additionally switches on citation-list handling
 *   4. bullet glyph (-, •, ∙, ◦, ▪)             -> "- item"
 *   5. "(1)" / "a)" enumerations, and "[1] ..." citations once inside the
 *      references section                       -> "- <line>"
 *   6. anything else is accumulated into the current paragraph.
 * Blank lines end the current paragraph. A line that ends in "-" (and is not
 * the last line) is treated as a word wrapped across lines: the hyphen is
 * dropped and the next paragraph fragment is appended without a space, so
 * "exam-" + "ple" becomes "example" instead of "exam ple".
 *
 * @param {string} text - raw extracted text; null/undefined is treated as "".
 * @param {{title?: string, author?: string}} [metadata] - optional document
 *   metadata. A title other than "untitled" produces a "# title" header block,
 *   followed by an authors line when `author` is set.
 * @returns {string} Markdown with runs of blank lines collapsed, trimmed.
 */
function textToMarkdown(text, metadata) {
  const meta = metadata || {};
  const lines = String(text ?? "").split("\n");
  const mdLines = [];

  if (meta.title && meta.title !== "untitled") {
    mdLines.push(`# ${meta.title}`, "");
    if (meta.author) {
      mdLines.push(`**Authors:** ${meta.author}`, "");
    }
    mdLines.push("---", "");
  }

  // Patterns are built once instead of once per line. None of them is global
  // or sticky, so .test() keeps no state between lines.
  const numberedHeadingRe = /^(\d+\.?\d*\.?\d*)\s+([A-Z][A-Za-z\s:&-]+)$/;
  const allCapsRe = /^[A-Z\s:&-]+$/;
  const knownHeadingRe = /^(Abstract|Introduction|Conclusion|Discussion|Results|Methods|Methodology|Background|Related Work|Acknowledgments|Acknowledgements|References|Bibliography|Appendix)/i;
  const referencesRe = /^(References|Bibliography)/i;
  const bulletRe = /^[-•∙◦▪]\s*/;
  const enumeratedRe = /^\(\d+\)|^[a-z]\)/;
  const citationRe = /^\[?\d+\]?\.?\s/;

  let inReferences = false;
  let paragraphBuffer = [];
  // True when the previous paragraph fragment ended in a wrap hyphen, meaning
  // the next fragment continues the same word.
  let glueNextFragment = false;

  const flushParagraph = () => {
    if (paragraphBuffer.length > 0) {
      mdLines.push(paragraphBuffer.join(" "), "");
      paragraphBuffer = [];
    }
    glueNextFragment = false;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();

    if (!line) {
      flushParagraph();
      continue;
    }

    const numberedHeading = line.match(numberedHeadingRe);
    if (numberedHeading && line.length < 80) {
      flushParagraph();
      const depth = numberedHeading[1].split(".").filter(Boolean).length;
      const prefix = depth <= 1 ? "##" : depth === 2 ? "###" : "####";
      mdLines.push(`${prefix} ${line}`, "");
      continue;
    }

    if (line === line.toUpperCase() && line.length > 3 && line.length < 60 && allCapsRe.test(line)) {
      flushParagraph();
      mdLines.push(`## ${line.charAt(0) + line.slice(1).toLowerCase()}`, "");
      continue;
    }

    if (knownHeadingRe.test(line) && line.length < 40) {
      flushParagraph();
      if (referencesRe.test(line)) {
        inReferences = true;
      }
      mdLines.push(`## ${line}`, "");
      continue;
    }

    if (bulletRe.test(line)) {
      flushParagraph();
      mdLines.push(`- ${line.replace(bulletRe, "")}`);
      continue;
    }

    if (enumeratedRe.test(line) || (inReferences && citationRe.test(line))) {
      flushParagraph();
      mdLines.push(`- ${line}`);
      continue;
    }

    const wrapsMidWord = line.endsWith("-") && i + 1 < lines.length;
    const fragment = wrapsMidWord ? line.slice(0, -1) : line;
    if (glueNextFragment && paragraphBuffer.length > 0) {
      paragraphBuffer[paragraphBuffer.length - 1] += fragment;
    } else {
      paragraphBuffer.push(fragment);
    }
    glueNextFragment = wrapsMidWord;
  }

  flushParagraph();
  return mdLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
149
|
+
/**
 * Parse a PDF held in memory and package the result for callers.
 *
 * The parser obtained from getPdfParser() is run on the buffer; its `info`,
 * `text` and `numpages` fields are then read to build the result.
 * NOTE(review): those field names are what this code reads — confirm the
 * installed pdf-parse version actually returns them.
 *
 * @param {Buffer} buffer - raw PDF bytes.
 * @returns {Promise<{metadata: object, text: string, markdown: string,
 *   pages: string[], wordCount: number, charCount: number}>}
 *   `pages` holds the form-feed-separated chunks of the text when their count
 *   equals `numpages`; otherwise it is a single entry with the whole text.
 */
async function extractFromBuffer(buffer) {
  const parsePdf = await getPdfParser();
  const parsed = await parsePdf(buffer);

  const docInfo = parsed.info || {};
  const metadata = {
    title: docInfo.Title || "untitled",
    author: docInfo.Author || "",
    pages: parsed.numpages,
    creator: docInfo.Creator || "",
    producer: docInfo.Producer || "",
    creationDate: docInfo.CreationDate || ""
  };

  const fullText = parsed.text || "";
  const formFeedChunks = fullText.split(/\f/).filter(Boolean);
  const perPage = formFeedChunks.length === parsed.numpages ? formFeedChunks : [fullText];

  return {
    metadata,
    text: fullText,
    markdown: textToMarkdown(fullText, metadata),
    pages: perPage.map((chunk) => chunk.trim()),
    wordCount: fullText.split(/\s+/).filter(Boolean).length,
    charCount: fullText.length
  };
}
|
|
175
|
+
/**
 * Load a PDF from a URL or a local path and extract its content.
 *
 * Input starting with "http://" or "https://" (scheme matched
 * case-insensitively, so "HTTPS://..." is no longer mistaken for a file path)
 * is downloaded; anything else is read from disk. The bytes must contain the
 * "%PDF" marker within their first 1024 bytes before they are handed to the
 * parser; a few leading bytes such as a BOM or newline are tolerated, which
 * matches the leniency of common PDF readers.
 *
 * @param {string} urlOrPath - http(s) URL or filesystem path.
 * @returns {Promise<object>} whatever extractFromBuffer() resolves to.
 * @throws {Error} "Not a valid PDF file (invalid magic bytes)" when no PDF
 *   header is found; download and file-system errors propagate unchanged.
 */
async function extractPdf(urlOrPath) {
  const isRemote = /^https?:\/\//i.test(urlOrPath);
  const buffer = isRemote ? await downloadPdf(urlOrPath) : readLocalPdf(urlOrPath);

  // Only the head of the file is scanned so large documents stay cheap to check.
  const hasPdfHeader = buffer.subarray(0, 1024).includes("%PDF");
  if (!hasPdfHeader) {
    throw new Error("Not a valid PDF file (invalid magic bytes)");
  }

  return extractFromBuffer(buffer);
}
|
|
187
|
+
/**
 * Extract a PDF from `url` if — and only if — the URL appears to be one.
 *
 * A URL that already looks like a PDF (per isPdfUrl) is extracted directly.
 * Otherwise a HEAD request is issued and the Content-Type header decides.
 * A failing HEAD request is treated as "not a PDF"; errors raised by
 * extractPdf() itself are not caught here and reach the caller.
 *
 * @param {string} url - URL to probe.
 * @returns {Promise<object|null>} the extraction result, or null when the
 *   target is not recognised as a PDF.
 */
async function tryExtractPdf(url) {
  if (isPdfUrl(url)) {
    return extractPdf(url);
  }

  let servesPdf = false;
  try {
    const probe = await fetch(url, { method: "HEAD", redirect: "follow" });
    const declaredType = probe.headers.get("content-type") || "";
    servesPdf = isPdfResponse(declaredType);
  } catch {
    // Best-effort probe: an unreachable host simply means "no PDF here".
  }

  return servesPdf ? extractPdf(url) : null;
}
|
|
201
|
+
// Module-level cache for the parser function; getPdfParser() reads it and
// fills it in on first use.
var pdfParseFn;
|
|
202
|
+
// Initializer for the PDF module, wrapped by __esm. When run it resets the
// parser cache to null so the parser is resolved again on the next
// getPdfParser() call.
var init_pdf = __esm({
|
|
203
|
+
"src/browser/pdf.ts"() {
|
|
204
|
+
"use strict";
|
|
205
|
+
pdfParseFn = null;
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
|
|
1
209
|
// src/browser/manager.ts
|
|
2
210
|
import puppeteer from "puppeteer-core";
|
|
3
211
|
import { existsSync as existsSync3 } from "fs";
|
|
@@ -2782,6 +2990,43 @@ async function lobsterFetch(url, options) {
|
|
|
2782
2990
|
const timeout = options?.timeout || 3e4;
|
|
2783
2991
|
const dump = options?.dump || "markdown";
|
|
2784
2992
|
const start = Date.now();
|
|
2993
|
+
const { isPdfUrl: isPdfUrl2, isPdfResponse: isPdfResponse2, extractPdf: extractPdf2 } = await Promise.resolve().then(() => (init_pdf(), pdf_exports));
|
|
2994
|
+
if (isPdfUrl2(url)) {
|
|
2995
|
+
const pdfResult = await extractPdf2(url);
|
|
2996
|
+
const duration2 = Date.now() - start;
|
|
2997
|
+
let content2;
|
|
2998
|
+
switch (dump) {
|
|
2999
|
+
case "markdown":
|
|
3000
|
+
content2 = pdfResult.markdown;
|
|
3001
|
+
break;
|
|
3002
|
+
case "text":
|
|
3003
|
+
content2 = pdfResult.text;
|
|
3004
|
+
break;
|
|
3005
|
+
case "html":
|
|
3006
|
+
content2 = `<pre>${pdfResult.text}</pre>`;
|
|
3007
|
+
break;
|
|
3008
|
+
case "snapshot":
|
|
3009
|
+
content2 = `[PDF] ${pdfResult.metadata.title} (${pdfResult.metadata.pages} pages, ${pdfResult.wordCount} words)
|
|
3010
|
+
|
|
3011
|
+
${pdfResult.text.slice(0, 5e3)}`;
|
|
3012
|
+
break;
|
|
3013
|
+
case "links":
|
|
3014
|
+
content2 = "";
|
|
3015
|
+
break;
|
|
3016
|
+
default:
|
|
3017
|
+
content2 = pdfResult.markdown;
|
|
3018
|
+
}
|
|
3019
|
+
return {
|
|
3020
|
+
url,
|
|
3021
|
+
finalUrl: url,
|
|
3022
|
+
title: pdfResult.metadata.title,
|
|
3023
|
+
content: content2,
|
|
3024
|
+
links: [],
|
|
3025
|
+
engine: "pdf",
|
|
3026
|
+
duration: duration2,
|
|
3027
|
+
statusCode: 200
|
|
3028
|
+
};
|
|
3029
|
+
}
|
|
2785
3030
|
const resp = await fetch(url, {
|
|
2786
3031
|
headers: {
|
|
2787
3032
|
"User-Agent": "LobsterCLI/0.1 (+https://github.com/iexcalibur/lobster-cli)",
|
|
@@ -2795,6 +3040,48 @@ async function lobsterFetch(url, options) {
|
|
|
2795
3040
|
if (!resp.ok) {
|
|
2796
3041
|
throw new Error(`HTTP ${resp.status} ${resp.statusText}`);
|
|
2797
3042
|
}
|
|
3043
|
+
const contentType = resp.headers.get("content-type") || "";
|
|
3044
|
+
if (isPdfResponse2(contentType)) {
|
|
3045
|
+
const arrayBuffer = await resp.arrayBuffer();
|
|
3046
|
+
const buffer = Buffer.from(arrayBuffer);
|
|
3047
|
+
const pdfMod = await import("pdf-parse");
|
|
3048
|
+
const pdfParseFn2 = pdfMod.PDFParse || pdfMod.default || pdfMod;
|
|
3049
|
+
const pdfResult = await pdfParseFn2(buffer);
|
|
3050
|
+
const info = pdfResult.info || {};
|
|
3051
|
+
const metadata = {
|
|
3052
|
+
title: info.Title || "untitled",
|
|
3053
|
+
author: info.Author || "",
|
|
3054
|
+
pages: pdfResult.numpages
|
|
3055
|
+
};
|
|
3056
|
+
const duration2 = Date.now() - start;
|
|
3057
|
+
const text = pdfResult.text || "";
|
|
3058
|
+
let content2;
|
|
3059
|
+
switch (dump) {
|
|
3060
|
+
case "text":
|
|
3061
|
+
content2 = text;
|
|
3062
|
+
break;
|
|
3063
|
+
case "html":
|
|
3064
|
+
content2 = `<pre>${text}</pre>`;
|
|
3065
|
+
break;
|
|
3066
|
+
case "snapshot":
|
|
3067
|
+
content2 = `[PDF] ${metadata.title} (${metadata.pages} pages)
|
|
3068
|
+
|
|
3069
|
+
${text.slice(0, 5e3)}`;
|
|
3070
|
+
break;
|
|
3071
|
+
default:
|
|
3072
|
+
content2 = text;
|
|
3073
|
+
}
|
|
3074
|
+
return {
|
|
3075
|
+
url,
|
|
3076
|
+
finalUrl: resp.url || url,
|
|
3077
|
+
title: metadata.title,
|
|
3078
|
+
content: content2,
|
|
3079
|
+
links: [],
|
|
3080
|
+
engine: "pdf",
|
|
3081
|
+
duration: duration2,
|
|
3082
|
+
statusCode: 200
|
|
3083
|
+
};
|
|
3084
|
+
}
|
|
2798
3085
|
const html = await resp.text();
|
|
2799
3086
|
const duration = Date.now() - start;
|
|
2800
3087
|
const finalUrl = resp.url || url;
|
|
@@ -2834,6 +3121,9 @@ async function lobsterFetch(url, options) {
|
|
|
2834
3121
|
}
|
|
2835
3122
|
return { url, finalUrl, status: resp.status, title, content, links, duration };
|
|
2836
3123
|
}
|
|
3124
|
+
|
|
3125
|
+
// src/browser/index.ts
|
|
3126
|
+
init_pdf();
|
|
2837
3127
|
export {
|
|
2838
3128
|
BrowserManager,
|
|
2839
3129
|
COMPACT_SNAPSHOT_SCRIPT,
|
|
@@ -2854,11 +3144,14 @@ export {
|
|
|
2854
3144
|
discoverChrome,
|
|
2855
3145
|
extractLinks,
|
|
2856
3146
|
extractMarkdown,
|
|
3147
|
+
extractPdf,
|
|
2857
3148
|
extractSnapshot,
|
|
2858
3149
|
extractText,
|
|
2859
3150
|
flatTreeToString,
|
|
2860
3151
|
getProfileDataDir,
|
|
2861
3152
|
injectStealth,
|
|
3153
|
+
isPdfResponse,
|
|
3154
|
+
isPdfUrl,
|
|
2862
3155
|
listProfiles,
|
|
2863
3156
|
lobsterFetch,
|
|
2864
3157
|
parseHtml,
|
|
@@ -2866,6 +3159,7 @@ export {
|
|
|
2866
3159
|
resetProfileCache,
|
|
2867
3160
|
resolveAttachTarget,
|
|
2868
3161
|
semanticFind,
|
|
3162
|
+
tryExtractPdf,
|
|
2869
3163
|
waitForCondition
|
|
2870
3164
|
};
|
|
2871
3165
|
//# sourceMappingURL=index.js.map
|