webpeel 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +11 -657
- package/README.md +246 -325
- package/dist/cli.js +330 -73
- package/dist/cli.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +12 -0
- package/dist/core/browser-fetch.d.ts.map +1 -1
- package/dist/core/browser-fetch.js +70 -17
- package/dist/core/browser-fetch.js.map +1 -1
- package/dist/core/cf-worker-proxy.d.ts +33 -0
- package/dist/core/cf-worker-proxy.d.ts.map +1 -0
- package/dist/core/cf-worker-proxy.js +88 -0
- package/dist/core/cf-worker-proxy.js.map +1 -0
- package/dist/core/chunker.d.ts +47 -0
- package/dist/core/chunker.d.ts.map +1 -0
- package/dist/core/chunker.js +250 -0
- package/dist/core/chunker.js.map +1 -0
- package/dist/core/cloak-fetch.d.ts +43 -0
- package/dist/core/cloak-fetch.d.ts.map +1 -0
- package/dist/core/cloak-fetch.js +141 -0
- package/dist/core/cloak-fetch.js.map +1 -0
- package/dist/core/crawl-checkpoint.d.ts +55 -0
- package/dist/core/crawl-checkpoint.d.ts.map +1 -0
- package/dist/core/crawl-checkpoint.js +105 -0
- package/dist/core/crawl-checkpoint.js.map +1 -0
- package/dist/core/crawler.d.ts +5 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +60 -5
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/cycle-fetch.d.ts +27 -0
- package/dist/core/cycle-fetch.d.ts.map +1 -0
- package/dist/core/cycle-fetch.js +99 -0
- package/dist/core/cycle-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +754 -14
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/google-cache.d.ts +30 -0
- package/dist/core/google-cache.d.ts.map +1 -0
- package/dist/core/google-cache.js +181 -0
- package/dist/core/google-cache.js.map +1 -0
- package/dist/core/markdown.d.ts +11 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +43 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/peel-tls.d.ts +26 -0
- package/dist/core/peel-tls.d.ts.map +1 -0
- package/dist/core/peel-tls.js +221 -0
- package/dist/core/peel-tls.js.map +1 -0
- package/dist/core/pipeline.d.ts +5 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +269 -21
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/schema-postprocess.d.ts +33 -0
- package/dist/core/schema-postprocess.d.ts.map +1 -0
- package/dist/core/schema-postprocess.js +470 -0
- package/dist/core/schema-postprocess.js.map +1 -0
- package/dist/core/schema-templates.d.ts +20 -0
- package/dist/core/schema-templates.d.ts.map +1 -0
- package/dist/core/schema-templates.js +131 -0
- package/dist/core/schema-templates.js.map +1 -0
- package/dist/core/search-fallback.d.ts +28 -0
- package/dist/core/search-fallback.d.ts.map +1 -0
- package/dist/core/search-fallback.js +185 -0
- package/dist/core/search-fallback.js.map +1 -0
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stealth-patches.d.ts +58 -0
- package/dist/core/stealth-patches.d.ts.map +1 -0
- package/dist/core/stealth-patches.js +340 -0
- package/dist/core/stealth-patches.js.map +1 -0
- package/dist/core/strategies.d.ts +20 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +284 -48
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -15
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +29 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +2 -1
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +24 -8
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.js +5 -5
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/compat.d.ts.map +1 -1
- package/dist/server/routes/compat.js +1 -0
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +60 -6
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +103 -2
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/types.d.ts +55 -4
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +55 -125
- package/package.json +15 -1
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content chunker for RAG pipelines.
|
|
3
|
+
* Splits markdown content into overlapping chunks with rich metadata.
|
|
4
|
+
*/
|
|
5
|
+
/** Fallback chunk budget, in approximate tokens, applied by chunkContent. */
const DEFAULT_MAX_TOKENS = 512;
/** Fallback overlap between consecutive chunks, in approximate tokens, applied by chunkContent. */
const DEFAULT_OVERLAP = 50;
/** Characters assumed per token when token budgets are converted to character budgets. */
const CHARS_PER_TOKEN = 4; // rough approximation
|
|
8
|
+
/**
 * Split content into RAG-ready chunks with metadata.
 *
 * @param content Text (typically markdown) to split.
 * @param options Optional settings:
 *   - `maxTokens`: approximate token budget per chunk (default DEFAULT_MAX_TOKENS).
 *   - `overlap`: approximate tokens shared between consecutive chunks
 *     (default DEFAULT_OVERLAP; `0` disables overlap).
 *   - `strategy`: `'section'` (default), `'paragraph'` or `'fixed'`.
 * @returns An object with the `chunks` array, `totalChunks`, the input's
 *   `originalLength` in characters, the requested `strategy`, and the resolved
 *   `options`. An unrecognised strategy is chunked by section, but the
 *   requested name is still what gets reported.
 */
export function chunkContent(content, options = {}) {
    // A zero token budget is meaningless, so any falsy value falls back to the default.
    const maxTokens = options.maxTokens || DEFAULT_MAX_TOKENS;
    // `??` rather than `||`: an explicit `overlap: 0` must disable overlap
    // instead of being silently replaced by the default.
    const overlap = options.overlap ?? DEFAULT_OVERLAP;
    const strategy = options.strategy || 'section';
    const opts = { maxTokens, overlap, strategy };
    let chunks;
    switch (strategy) {
        case 'paragraph':
            chunks = chunkByParagraph(content, maxTokens, overlap);
            break;
        case 'fixed':
            chunks = chunkByFixed(content, maxTokens, overlap);
            break;
        case 'section':
        default:
            // Section chunking doubles as the fallback for unknown strategies.
            chunks = chunkBySection(content, maxTokens, overlap);
    }
    return {
        chunks,
        totalChunks: chunks.length,
        originalLength: content.length,
        strategy,
        options: opts,
    };
}
|
|
38
|
+
/**
 * Section-based chunking (recommended for RAG).
 * Splits on markdown headings (## / ### etc.), then splits large sections by paragraph.
 * Each chunk includes its section heading for context.
 *
 * A section whose body fits in the character budget becomes a single chunk.
 * Larger sections are packed paragraph by paragraph; when the next paragraph
 * would overflow, the accumulated text is emitted and the new chunk is seeded
 * with the last `overlap` tokens' worth of characters from the previous one.
 * A single paragraph longer than the budget is emitted as one oversized chunk.
 *
 * @param content Text to split.
 * @param maxTokens Approximate token budget per chunk body.
 * @param overlap Approximate number of tokens carried over between chunks.
 * @returns Chunk records with `index`, `text`, `tokenCount`, `wordCount`,
 *   `section`, `sectionDepth`, `startOffset` and `endOffset`. Offsets are
 *   approximate: they are based on the section start reported by
 *   splitByHeadings and on the untrimmed accumulated text (overlap included).
 */
function chunkBySection(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    const chunks = [];
    // Appends one chunk record; `rawText` is the untrimmed text the offsets describe.
    const emit = (heading, depth, rawText, start) => {
        const trimmed = rawText.trim();
        const text = heading ? `${heading}\n\n${trimmed}` : trimmed;
        chunks.push({
            index: chunks.length,
            text,
            tokenCount: Math.ceil(text.length / CHARS_PER_TOKEN),
            wordCount: text.split(/\s+/).filter(Boolean).length,
            section: heading ? heading.replace(/^#+\s*/, '') : null,
            sectionDepth: depth,
            startOffset: start,
            endOffset: start + rawText.length,
        });
    };
    for (const { heading, depth, body, startOffset } of splitByHeadings(content)) {
        if (!body.trim())
            continue;
        // If section fits in one chunk, use it directly
        if (body.length <= maxChars) {
            emit(heading, depth, body, startOffset);
            continue;
        }
        // Large section — split by paragraphs with overlap
        const paragraphs = body.split(/\n\n+/).filter(p => p.trim());
        let currentText = '';
        let currentStart = startOffset;
        // Position in `body` from which the next paragraph is searched. Searching
        // from here (not from 0) keeps offsets correct when a paragraph's text
        // occurs more than once in the section.
        let searchFrom = 0;
        for (const para of paragraphs) {
            const found = body.indexOf(para, searchFrom);
            const paraPos = found === -1 ? searchFrom : found;
            searchFrom = paraPos + para.length;
            const candidate = currentText ? `${currentText}\n\n${para}` : para;
            if (candidate.length > maxChars && currentText) {
                emit(heading, depth, currentText, currentStart);
                // Start new chunk with overlap from end of previous
                currentText = overlapChars > 0 && currentText.length > overlapChars
                    ? currentText.slice(-overlapChars) + '\n\n' + para
                    : para;
                currentStart = startOffset + paraPos;
            }
            else {
                currentText = candidate;
            }
        }
        // Emit remaining
        if (currentText.trim()) {
            emit(heading, depth, currentText, currentStart);
        }
    }
    return chunks;
}
|
|
119
|
+
/**
 * Paragraph-based chunking.
 * Groups paragraphs together up to maxTokens, with overlap.
 *
 * Paragraphs are separated by one or more blank lines. A paragraph that starts
 * with a markdown heading updates the section label for the chunk it lands in
 * and for the chunks after it. A chunk that is flushed because that heading
 * paragraph did not fit keeps the previous label.
 *
 * @param content Text to split.
 * @param maxTokens Approximate token budget per chunk.
 * @param overlap Approximate number of tokens carried over between chunks.
 * @returns Chunk records with `index`, `text`, `tokenCount`, `wordCount`,
 *   `section`, `sectionDepth`, `startOffset` and `endOffset`. `endOffset` is
 *   derived from the untrimmed accumulated text (overlap included), so it is
 *   approximate.
 */
function chunkByParagraph(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    const chunks = [];
    const paragraphs = content.split(/\n\n+/).filter(p => p.trim());
    let currentText = '';
    let currentStart = 0;
    // Position in `content` from which the next paragraph is searched, so a
    // repeated paragraph is never matched at an earlier occurrence.
    let searchFrom = 0;
    // Track current section heading
    let currentHeading = null;
    let currentDepth = null;
    // Appends the accumulated text as one chunk, labelled with the current heading.
    const emit = () => {
        chunks.push({
            index: chunks.length,
            text: currentText.trim(),
            tokenCount: Math.ceil(currentText.length / CHARS_PER_TOKEN),
            wordCount: currentText.split(/\s+/).filter(Boolean).length,
            section: currentHeading,
            sectionDepth: currentDepth,
            startOffset: currentStart,
            endOffset: currentStart + currentText.length,
        });
    };
    for (const para of paragraphs) {
        const found = content.indexOf(para, searchFrom);
        const paraPos = found === -1 ? searchFrom : found;
        searchFrom = paraPos + para.length;
        // Check if paragraph is a heading
        const headingMatch = para.match(/^(#{1,6})\s+(.+)/);
        const candidate = currentText ? `${currentText}\n\n${para}` : para;
        if (candidate.length > maxChars && currentText) {
            // Flush before adopting a new heading: the flushed chunk does not
            // contain `para`, so it must keep the previous section label.
            emit();
            currentText = overlapChars > 0 && currentText.length > overlapChars
                ? currentText.slice(-overlapChars) + '\n\n' + para
                : para;
            currentStart = paraPos;
        }
        else {
            currentText = candidate;
        }
        if (headingMatch) {
            currentHeading = headingMatch[2];
            currentDepth = headingMatch[1].length;
        }
    }
    if (currentText.trim()) {
        emit();
    }
    return chunks;
}
|
|
179
|
+
/**
 * Fixed-size chunking with overlap.
 * Simple character-based splitting for predictable chunk sizes.
 *
 * Windows of `maxTokens * CHARS_PER_TOKEN` characters are taken every `step`
 * characters. The step is the window size minus the overlap, floored at 100
 * characters so a large overlap cannot stall progress, and capped at the
 * window size so a small window never skips content.
 *
 * @param content Text to split.
 * @param maxTokens Approximate token budget per chunk.
 * @param overlap Approximate number of tokens shared by consecutive chunks.
 * @returns Chunk records with `index`, `text` (trimmed window), `tokenCount`,
 *   `wordCount`, `section` / `sectionDepth` (first markdown heading found inside
 *   the window, else null), `startOffset` and `endOffset` (window bounds).
 */
function chunkByFixed(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    // Floor of 100 guards against tiny or negative steps; the cap at maxChars
    // prevents gaps between windows; the final max(1, …) guarantees progress.
    const step = Math.max(1, Math.min(Math.max(maxChars - overlapChars, 100), maxChars));
    const chunks = [];
    for (let i = 0; i < content.length; i += step) {
        const windowEnd = Math.min(i + maxChars, content.length);
        const text = content.slice(i, windowEnd).trim();
        if (text) {
            // Try to find section heading within this chunk
            const headingMatch = text.match(/^(#{1,6})\s+(.+)/m);
            chunks.push({
                index: chunks.length,
                text,
                tokenCount: Math.ceil(text.length / CHARS_PER_TOKEN),
                wordCount: text.split(/\s+/).filter(Boolean).length,
                section: headingMatch ? headingMatch[2] : null,
                sectionDepth: headingMatch ? headingMatch[1].length : null,
                startOffset: i,
                endOffset: windowEnd,
            });
        }
        // This window already reached the end of the content; a further window
        // would only repeat text that is fully contained in this one.
        if (windowEnd >= content.length)
            break;
    }
    return chunks;
}
|
|
208
|
+
/**
 * Split content into sections based on markdown headings.
 *
 * A heading is a line starting with 1-6 `#` characters followed by whitespace
 * and text. Lines inside fenced code blocks (``` or ~~~) are never treated as
 * headings, so shell or Python comments in code samples do not split a section.
 *
 * @param content Text to split.
 * @returns Sections in document order, each with `heading` (the full heading
 *   line, or null for text before the first heading), `depth` (number of `#`,
 *   or null), `body` (the lines after the heading joined with "\n") and
 *   `startOffset` (character offset of the heading line, or 0 for the leading
 *   section).
 */
function splitByHeadings(content) {
    const sections = [];
    let currentHeading = null;
    let currentDepth = null;
    let currentBody = [];
    let currentStart = 0;
    let offset = 0;
    // Marker (e.g. "```") of the fenced code block currently open, or null.
    let openFence = null;
    // Stores the section collected so far, if it has a heading or any body lines.
    const flush = () => {
        if (currentBody.length > 0 || currentHeading) {
            sections.push({
                heading: currentHeading,
                depth: currentDepth,
                body: currentBody.join('\n'),
                startOffset: currentStart,
            });
        }
    };
    for (const line of content.split('\n')) {
        const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (fence) {
            const marker = fence[1];
            if (openFence === null) {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                line.slice(fence[0].length).trim() === '') {
                // A closing fence uses the same character, is at least as long
                // as the opener, and carries no info string.
                openFence = null;
            }
        }
        const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)/) : null;
        if (headingMatch) {
            // Save previous section
            flush();
            currentHeading = line;
            currentDepth = headingMatch[1].length;
            currentBody = [];
            currentStart = offset;
        }
        else {
            currentBody.push(line);
        }
        offset += line.length + 1; // +1 for newline
    }
    // Don't forget last section
    flush();
    return sections;
}
|
|
250
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../src/core/chunker.ts"],"names":[],"mappings":"AAAA;;;GAGG;AA2CH,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAC/B,MAAM,eAAe,GAAG,EAAE,CAAC;AAC3B,MAAM,eAAe,GAAG,CAAC,CAAC,CAAC,sBAAsB;AAEjD;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,OAAe,EAAE,UAAwB,EAAE;IACtE,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,kBAAkB,CAAC;IAC1D,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,eAAe,CAAC;IACnD,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,SAAS,CAAC;IAE/C,MAAM,IAAI,GAA2B,EAAE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;IAEtE,IAAI,MAAsB,CAAC;IAE3B,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,SAAS;YACZ,MAAM,GAAG,cAAc,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;YACrD,MAAM;QACR,KAAK,WAAW;YACd,MAAM,GAAG,gBAAgB,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;YACvD,MAAM;QACR,KAAK,OAAO;YACV,MAAM,GAAG,YAAY,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;YACnD,MAAM;QACR;YACE,MAAM,GAAG,cAAc,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IACzD,CAAC;IAED,OAAO;QACL,MAAM;QACN,WAAW,EAAE,MAAM,CAAC,MAAM;QAC1B,cAAc,EAAE,OAAO,CAAC,MAAM;QAC9B,QAAQ;QACR,OAAO,EAAE,IAAI;KACd,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,CAAC,OAAe,EAAE,SAAiB,EAAE,OAAe;IACzE,MAAM,QAAQ,GAAG,SAAS,GAAG,eAAe,CAAC;IAC7C,MAAM,YAAY,GAAG,OAAO,GAAG,eAAe,CAAC;IAC/C,MAAM,MAAM,GAAmB,EAAE,CAAC;IAElC,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAE1C,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC;QAEtD,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,SAAS;QAE3B,gDAAgD;QAChD,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC5B,MAAM,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,OAAO,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YACpE,MAAM,CAAC,IAAI,CAAC;gBACV,KAAK,EAAE,UAAU,EAAE;gBACnB,IAAI;gBACJ,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC;gBACpD,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;gBACnD,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI;gBACvD,YAAY,EAAE,KAAK;gBACnB,WAAW;gBACX,SAAS,EAAE,WAAW,GAAG,IAAI,CA
AC,MAAM;aACrC,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,mDAAmD;YACnD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC7D,IAAI,WAAW,GAAG,EAAE,CAAC;YACrB,IAAI,YAAY,GAAG,WAAW,CAAC;YAE/B,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;gBAC9B,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;gBAEnE,IAAI,SAAS,CAAC,MAAM,GAAG,QAAQ,IAAI,WAAW,EAAE,CAAC;oBAC/C,qBAAqB;oBACrB,MAAM,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;oBAClF,MAAM,CAAC,IAAI,CAAC;wBACV,KAAK,EAAE,UAAU,EAAE;wBACnB,IAAI;wBACJ,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC;wBACpD,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;wBACnD,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI;wBACvD,YAAY,EAAE,KAAK;wBACnB,WAAW,EAAE,YAAY;wBACzB,SAAS,EAAE,YAAY,GAAG,WAAW,CAAC,MAAM;qBAC7C,CAAC,CAAC;oBAEH,oDAAoD;oBACpD,IAAI,YAAY,GAAG,CAAC,IAAI,WAAW,CAAC,MAAM,GAAG,YAAY,EAAE,CAAC;wBAC1D,WAAW,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,MAAM,GAAG,IAAI,CAAC;oBACjE,CAAC;yBAAM,CAAC;wBACN,WAAW,GAAG,IAAI,CAAC;oBACrB,CAAC;oBACD,YAAY,GAAG,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAClD,CAAC;qBAAM,CAAC;oBACN,WAAW,GAAG,SAAS,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,iBAAiB;YACjB,IAAI,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;gBACvB,MAAM,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;gBAClF,MAAM,CAAC,IAAI,CAAC;oBACV,KAAK,EAAE,UAAU,EAAE;oBACnB,IAAI;oBACJ,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC;oBACpD,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;oBACnD,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI;oBACvD,YAAY,EAAE,KAAK;oBACnB,WAAW,EAAE,YAAY;oBACzB,SAAS,EAAE,YAAY,GAAG,WAAW,CAAC,MAAM;iBAC7C,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB
,CAAC,OAAe,EAAE,SAAiB,EAAE,OAAe;IAC3E,MAAM,QAAQ,GAAG,SAAS,GAAG,eAAe,CAAC;IAC7C,MAAM,YAAY,GAAG,OAAO,GAAG,eAAe,CAAC;IAC/C,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IAEhE,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,gCAAgC;IAChC,IAAI,cAAc,GAAkB,IAAI,CAAC;IACzC,IAAI,YAAY,GAAkB,IAAI,CAAC;IAEvC,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,kCAAkC;QAClC,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACpD,IAAI,YAAY,EAAE,CAAC;YACjB,cAAc,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;YACjC,YAAY,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QACxC,CAAC;QAED,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAEnE,IAAI,SAAS,CAAC,MAAM,GAAG,QAAQ,IAAI,WAAW,EAAE,CAAC;YAC/C,MAAM,CAAC,IAAI,CAAC;gBACV,KAAK,EAAE,UAAU,EAAE;gBACnB,IAAI,EAAE,WAAW,CAAC,IAAI,EAAE;gBACxB,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,eAAe,CAAC;gBAC3D,SAAS,EAAE,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;gBAC1D,OAAO,EAAE,cAAc;gBACvB,YAAY,EAAE,YAAY;gBAC1B,WAAW,EAAE,YAAY;gBACzB,SAAS,EAAE,YAAY,GAAG,WAAW,CAAC,MAAM;aAC7C,CAAC,CAAC;YAEH,IAAI,YAAY,GAAG,CAAC,IAAI,WAAW,CAAC,MAAM,GAAG,YAAY,EAAE,CAAC;gBAC1D,WAAW,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,MAAM,GAAG,IAAI,CAAC;YACjE,CAAC;iBAAM,CAAC;gBACN,WAAW,GAAG,IAAI,CAAC;YACrB,CAAC;YACD,YAAY,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,WAAW,GAAG,SAAS,CAAC;QAC1B,CAAC;IACH,CAAC;IAED,IAAI,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;QACvB,MAAM,CAAC,IAAI,CAAC;YACV,KAAK,EAAE,UAAU,EAAE;YACnB,IAAI,EAAE,WAAW,CAAC,IAAI,EAAE;YACxB,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,eAAe,CAAC;YAC3D,SAAS,EAAE,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;YAC1D,OAAO,EAAE,cAAc;YACvB,YAAY,EAAE,YAAY;YAC1B,WAAW,EAAE,YAAY;YACzB,SAAS,EAAE,YAAY,GAAG,WAAW,CAAC,MAAM;SAC7C,CAAC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,OAAe,EAAE,SAAiB,EAAE,OAA
e;IACvE,MAAM,QAAQ,GAAG,SAAS,GAAG,eAAe,CAAC;IAC7C,MAAM,YAAY,GAAG,OAAO,GAAG,eAAe,CAAC;IAC/C,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,GAAG,YAAY,EAAE,GAAG,CAAC,CAAC;IACpD,MAAM,MAAM,GAAmB,EAAE,CAAC;IAElC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,IAAI,EAAE,CAAC;QAC9C,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC;QACnD,IAAI,CAAC,IAAI;YAAE,SAAS;QAEpB,gDAAgD;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAErD,MAAM,CAAC,IAAI,CAAC;YACV,KAAK,EAAE,UAAU,EAAE;YACnB,IAAI;YACJ,UAAU,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC;YACpD,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM;YACnD,OAAO,EAAE,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI;YAC9C,YAAY,EAAE,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI;YAC1D,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC;SAClD,CAAC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,6DAA6D;AAC7D,SAAS,eAAe,CAAC,OAAe;IACtC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAA+F,EAAE,CAAC;IAEhH,IAAI,cAAc,GAAkB,IAAI,CAAC;IACzC,IAAI,YAAY,GAAkB,IAAI,CAAC;IACvC,IAAI,WAAW,GAAa,EAAE,CAAC;IAC/B,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,MAAM,GAAG,CAAC,CAAC;IAEf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QAEpD,IAAI,YAAY,EAAE,CAAC;YACjB,wBAAwB;YACxB,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,cAAc,EAAE,CAAC;gBAC7C,QAAQ,CAAC,IAAI,CAAC;oBACZ,OAAO,EAAE,cAAc;oBACvB,KAAK,EAAE,YAAY;oBACnB,IAAI,EAAE,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;oBAC5B,WAAW,EAAE,YAAY;iBAC1B,CAAC,CAAC;YACL,CAAC;YAED,cAAc,GAAG,IAAI,CAAC;YACtB,YAAY,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACtC,WAAW,GAAG,EAAE,CAAC;YACjB,YAAY,GAAG,MAAM,CAAC;QACxB,CAAC;aAAM,CAAC;YACN,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;QAED,MAAM,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,iBAAiB;IAC9C,CAAC;IAED,4BAA4B;IAC5B,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,cAAc,EA
AE,CAAC;QAC7C,QAAQ,CAAC,IAAI,CAAC;YACZ,OAAO,EAAE,cAAc;YACvB,KAAK,EAAE,YAAY;YACnB,IAAI,EAAE,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;YAC5B,WAAW,EAAE,YAAY;SAC1B,CAAC,CAAC;IACL,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CloakBrowser fetch — enterprise-grade stealth using patched Chromium.
|
|
3
|
+
*
|
|
4
|
+
* CloakBrowser patches Chromium at the C++ source level (not JS injection).
|
|
5
|
+
* Passes reCAPTCHA v3 (0.9), Cloudflare Turnstile, DataDome, 14/14 tests.
|
|
6
|
+
*
|
|
7
|
+
* Requires: npm install cloakbrowser playwright-core
|
|
8
|
+
* Usage: peel(url, { cloaked: true })
|
|
9
|
+
*/
|
|
10
|
+
/** Options accepted by cloakFetch. */
export interface CloakFetchOptions {
    /** URL to navigate to. */
    url: string;
    /** Proxy setting forwarded unchanged to CloakBrowser's launch options. */
    proxy?: string;
    /** User-Agent applied to the browser context. */
    userAgent?: string;
    /** Viewport width; only applied when viewportHeight is also set. */
    viewportWidth?: number;
    /** Viewport height; only applied when viewportWidth is also set. */
    viewportHeight?: number;
    /** Extra time to wait after navigation, in milliseconds (ignored unless > 0). */
    waitMs?: number;
    /** Selector to wait for after navigation; a timeout here is not treated as an error. */
    waitSelector?: string;
    /** Navigation wait condition passed to page.goto; defaults to 'domcontentloaded'. */
    waitUntil?: string;
    /** Timeout in milliseconds for navigation and the selector wait; defaults to 30000. */
    timeoutMs?: number;
    /** When true, a PNG screenshot is captured and returned. */
    screenshot?: boolean;
    /** When true, the screenshot covers the full page. */
    screenshotFullPage?: boolean;
    /** Page actions handed to executeActions after the waits. */
    actions?: any[];
    /** Extra HTTP headers set on the page before navigation. */
    headers?: Record<string, string>;
    /** When true, the browser is launched with a visible window (headless off). */
    headed?: boolean;
}
|
|
26
|
+
/** Result returned by cloakFetch. */
export interface CloakFetchResult {
    /** Page HTML as returned by page.content() after waits and actions. */
    html: string;
    /** URL of the page after navigation (page.url()). */
    url: string;
    /** HTTP status of the navigation response, when a response is available. */
    statusCode?: number;
    /** Value of the response's content-type header, when available. */
    contentType?: string;
    /** Fetch method marker; always 'cloaked' for this fetcher. */
    method: 'cloaked';
    /** PNG screenshot, present only when a screenshot was requested. */
    screenshot?: Buffer;
    /** True when challenge detection flags the page with confidence >= 0.7. */
    challengeDetected?: boolean;
}
|
|
35
|
+
/**
 * Check if CloakBrowser is installed.
 *
 * @returns true when the 'cloakbrowser' package can be resolved, false otherwise.
 */
export declare function isCloakBrowserAvailable(): boolean;
|
|
39
|
+
/**
 * Fetch a URL using CloakBrowser's patched Chromium.
 *
 * @param options Target URL plus browser, wait, screenshot and action settings.
 * @returns The page HTML, final URL, response metadata, optional screenshot
 *   and a challenge-detection flag.
 * @throws Error when the optional 'cloakbrowser' package cannot be loaded.
 */
export declare function cloakFetch(options: CloakFetchOptions): Promise<CloakFetchResult>;
|
|
43
|
+
//# sourceMappingURL=cloak-fetch.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cloak-fetch.d.ts","sourceRoot":"","sources":["../../src/core/cloak-fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,WAAW,iBAAiB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,OAAO,CAAC,EAAE,GAAG,EAAE,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,SAAS,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAID;;GAEG;AACH,wBAAgB,uBAAuB,IAAI,OAAO,CAOjD;AAqBD;;GAEG;AACH,wBAAsB,UAAU,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAyGtF"}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CloakBrowser fetch — enterprise-grade stealth using patched Chromium.
|
|
3
|
+
*
|
|
4
|
+
* CloakBrowser patches Chromium at the C++ source level (not JS injection).
|
|
5
|
+
* Passes reCAPTCHA v3 (0.9), Cloudflare Turnstile, DataDome, 14/14 tests.
|
|
6
|
+
*
|
|
7
|
+
* Requires: npm install cloakbrowser playwright-core
|
|
8
|
+
* Usage: peel(url, { cloaked: true })
|
|
9
|
+
*/
|
|
10
|
+
// Cached 'cloakbrowser' module namespace; null until getCloakBrowser() loads it.
let cloakModule = null;
|
|
11
|
+
// Imported next to its only user: this module has no other static imports.
import { createRequire } from 'node:module';
/**
 * Check if CloakBrowser is installed.
 *
 * This file is an ES module, where a bare `require` is not defined. Calling
 * `require.resolve` directly would throw a ReferenceError that the catch below
 * swallows, making the function report "not installed" unconditionally. A
 * `require` function is therefore created for this module's location first.
 *
 * @returns true when the 'cloakbrowser' package can be resolved from this
 *   module, false otherwise.
 */
export function isCloakBrowserAvailable() {
    try {
        createRequire(import.meta.url).resolve('cloakbrowser');
        return true;
    }
    catch {
        // Resolution failed: treat the optional dependency as absent.
        return false;
    }
}
|
|
23
|
+
/**
 * Lazy-load CloakBrowser module.
 *
 * The module is imported on first use and cached in `cloakModule`, so later
 * calls return the cached namespace without importing again.
 *
 * @returns The 'cloakbrowser' module namespace.
 * @throws Error with installation instructions when the import fails. The
 *   underlying import error is attached as `cause`, so a package that is
 *   installed but fails to load can still be diagnosed.
 */
async function getCloakBrowser() {
    if (cloakModule)
        return cloakModule;
    try {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore — cloakbrowser is an optional peer dependency
        cloakModule = await import('cloakbrowser');
        return cloakModule;
    }
    catch (e) {
        // `cause` is attached via Object.assign so it is preserved even on
        // runtimes that ignore the ES2022 `new Error(msg, { cause })` option.
        throw Object.assign(new Error('CloakBrowser not installed. Run: npm install cloakbrowser playwright-core\n' +
            'CloakBrowser provides enterprise-grade stealth using a patched Chromium binary.\n' +
            'Learn more: https://github.com/CloakHQ/cloakbrowser'), { cause: e });
    }
}
|
|
41
|
+
/**
 * Fetch a URL using CloakBrowser's patched Chromium.
 *
 * Launches a browser (through `launchContext` when the module provides it,
 * otherwise `launch` + `newContext`), navigates to `options.url`, applies the
 * optional selector wait, fixed wait and actions, then collects the HTML,
 * response metadata, an optional PNG screenshot and a challenge-detection flag.
 * The page, context and browser are always closed before returning or throwing.
 *
 * @param options See CloakFetchOptions. `timeoutMs` defaults to 30000 and
 *   `waitUntil` to 'domcontentloaded'; the viewport is only applied when both
 *   width and height are given.
 * @returns html, final url, statusCode, contentType, method 'cloaked', the
 *   screenshot (if requested) and `challengeDetected`.
 * @throws Error when CloakBrowser cannot be loaded; navigation and action
 *   errors propagate. A `waitSelector` timeout is only logged (when DEBUG is set).
 */
export async function cloakFetch(options) {
    const cloak = await getCloakBrowser();
    const launchOptions = {
        headless: !options.headed,
    };
    if (options.proxy) {
        launchOptions.proxy = options.proxy;
    }
    const viewport = options.viewportWidth && options.viewportHeight
        ? { width: options.viewportWidth, height: options.viewportHeight }
        : undefined;
    let browser = null;
    // Tracked separately from `browser` so cleanup still works when the context
    // exposes no browser object or when page creation fails.
    let context = null;
    let page = null;
    try {
        // Use launchContext if available for richer options, otherwise launch
        if (cloak.launchContext) {
            const contextOptions = {};
            if (options.userAgent)
                contextOptions.userAgent = options.userAgent;
            if (viewport)
                contextOptions.viewport = viewport;
            context = await cloak.launchContext({
                ...launchOptions,
                ...contextOptions,
            });
            // Captured before newPage() so the browser is closed even if page
            // creation throws.
            browser = context.browser();
            page = await context.newPage();
        }
        else {
            browser = await cloak.launch(launchOptions);
            context = await browser.newContext({
                userAgent: options.userAgent,
                viewport,
            });
            page = await context.newPage();
        }
        // Set custom headers if provided
        if (options.headers && Object.keys(options.headers).length > 0) {
            await page.setExtraHTTPHeaders(options.headers);
        }
        // Navigate
        const waitUntil = options.waitUntil || 'domcontentloaded';
        const timeout = options.timeoutMs || 30000;
        const response = await page.goto(options.url, {
            waitUntil,
            timeout,
        });
        // Wait for selector if specified; a timeout is deliberately non-fatal
        if (options.waitSelector) {
            await page.waitForSelector(options.waitSelector, { timeout }).catch(() => {
                if (process.env.DEBUG)
                    console.debug('[webpeel]', `waitSelector "${options.waitSelector}" timed out`);
            });
        }
        // Additional wait
        if (options.waitMs && options.waitMs > 0) {
            await page.waitForTimeout(options.waitMs);
        }
        // Execute actions if provided
        if (options.actions && options.actions.length > 0) {
            const { executeActions } = await import('./actions.js');
            await executeActions(page, options.actions);
        }
        // Get content
        const html = await page.content();
        const statusCode = response?.status();
        const contentType = response?.headers()['content-type'];
        const finalUrl = page.url();
        // Screenshot if requested
        let screenshotBuffer;
        if (options.screenshot) {
            screenshotBuffer = await page.screenshot({
                fullPage: options.screenshotFullPage || false,
                type: 'png',
            });
        }
        // Check for challenge pages (a missing status is treated as 200)
        const { detectChallenge } = await import('./challenge-detection.js');
        const challengeCheck = detectChallenge(html, statusCode ?? 200);
        return {
            html,
            url: finalUrl,
            statusCode,
            contentType,
            method: 'cloaked',
            screenshot: screenshotBuffer,
            challengeDetected: challengeCheck.isChallenge && challengeCheck.confidence >= 0.7,
        };
    }
    finally {
        // Cleanup, innermost first; close errors are ignored so they cannot
        // mask the original result or error.
        if (page)
            await page.close().catch(() => { });
        if (context)
            await context.close().catch(() => { });
        if (browser)
            await browser.close().catch(() => { });
    }
}
|
|
141
|
+
//# sourceMappingURL=cloak-fetch.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cloak-fetch.js","sourceRoot":"","sources":["../../src/core/cloak-fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AA+BH,IAAI,WAAW,GAAQ,IAAI,CAAC;AAE5B;;GAEG;AACH,MAAM,UAAU,uBAAuB;IACrC,IAAI,CAAC;QACH,OAAO,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAChC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,eAAe;IAC5B,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC;IACpC,IAAI,CAAC;QACH,6DAA6D;QAC7D,2DAA2D;QAC3D,WAAW,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QAC3C,OAAO,WAAW,CAAC;IACrB,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CACb,6EAA6E;YAC7E,mFAAmF;YACnF,qDAAqD,CACtD,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,OAA0B;IACzD,MAAM,KAAK,GAAG,MAAM,eAAe,EAAE,CAAC;IAEtC,MAAM,aAAa,GAAQ;QACzB,QAAQ,EAAE,CAAC,OAAO,CAAC,MAAM;KAC1B,CAAC;IAEF,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAClB,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;IACtC,CAAC;IAED,IAAI,OAAO,GAAmB,IAAI,CAAC;IACnC,IAAI,IAAI,GAAgB,IAAI,CAAC;IAE7B,IAAI,CAAC;QACH,sEAAsE;QACtE,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;YACxB,MAAM,cAAc,GAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,CAAC,SAAS;gBAAE,cAAc,CAAC,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;YACpE,IAAI,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;gBACpD,cAAc,CAAC,QAAQ,GAAG,EAAE,KAAK,EAAE,OAAO,CAAC,aAAa,EAAE,MAAM,EAAE,OAAO,CAAC,cAAc,EAAE,CAAC;YAC7F,CAAC;YAED,MAAM,OAAO,GAAmB,MAAM,KAAK,CAAC,aAAa,CAAC;gBACxD,GAAG,aAAa;gBAChB,GAAG,cAAc;aAClB,CAAC,CAAC;YACH,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;YAC5C,MAAM,OAAO,GAAG,MAAM,OAAQ,CAAC,UAAU,CAAC;gBACxC,SAAS,EAAE,OAAO,CAAC,SAAS;gBAC5B,QAAQ,EAAE,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,cAAc;oBACvD,CAAC,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,aAAa,EAAE,MAAM,EAAE,OAAO,CAAC,cAAc,EAAE;oBAClE,CAAC,CAAC,SAAS;aACd,CAAC,CAAC;YACH,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACjC,CAAC;QAED,iCAAiC;QACjC,IAAI,OAAO,CAAC,OAAO,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/D,MAAM,IAAI,CAAC,mBAAmB,CAAC,
OAAO,CAAC,OAAO,CAAC,CAAC;QAClD,CAAC;QAED,WAAW;QACX,MAAM,SAAS,GAAI,OAAO,CAAC,SAAiB,IAAI,kBAAkB,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC;QAE3C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YAC5C,SAAS;YACT,OAAO;SACR,CAAC,CAAC;QAEH,iCAAiC;QACjC,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;YACzB,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE;gBACvE,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK;oBAAE,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,iBAAiB,OAAO,CAAC,YAAY,aAAa,CAAC,CAAC;YACxG,CAAC,CAAC,CAAC;QACL,CAAC;QAED,kBAAkB;QAClB,IAAI,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAC5C,CAAC;QAED,8BAA8B;QAC9B,IAAI,OAAO,CAAC,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClD,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;YACxD,MAAM,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAC9C,CAAC;QAED,cAAc;QACd,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QAClC,MAAM,UAAU,GAAG,QAAQ,EAAE,MAAM,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,QAAQ,EAAE,OAAO,EAAE,CAAC,cAAc,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE5B,0BAA0B;QAC1B,IAAI,gBAAoC,CAAC;QACzC,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACvB,gBAAgB,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC;gBACvC,QAAQ,EAAE,OAAO,CAAC,kBAAkB,IAAI,KAAK;gBAC7C,IAAI,EAAE,KAAK;aACZ,CAAC,CAAC;QACL,CAAC;QAED,4BAA4B;QAC5B,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,0BAA0B,CAAC,CAAC;QACrE,MAAM,cAAc,GAAG,eAAe,CAAC,IAAI,EAAE,UAAU,IAAI,GAAG,CAAC,CAAC;QAEhE,OAAO;YACL,IAAI;YACJ,GAAG,EAAE,QAAQ;YACb,UAAU;YACV,WAAW;YACX,MAAM,EAAE,SAAS;YACjB,UAAU,EAAE,gBAAgB;YAC5B,iBAAiB,EAAE,cAAc,CAAC,WAAW,IAAI,cAAc,CAAC,UAAU,IAAI,GAAG;SAClF,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,UAAU;QACV,IAAI,IAAI;YAAE,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAC7C,IAAI,OAAO;YAAE,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IACrD,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawl checkpoint system for resume capability.
|
|
3
|
+
* Saves progress to a JSON file so interrupted crawls can continue.
|
|
4
|
+
*/
|
|
5
|
+
export interface CrawlCheckpoint {
|
|
6
|
+
/** Crawl job ID (a hash of the start URL and crawl options); also used as the checkpoint file name, `<jobId>.json` */
|
|
7
|
+
jobId: string;
|
|
8
|
+
/** URL the crawl started from */
|
|
9
|
+
startUrl: string;
|
|
10
|
+
/** URLs already crawled, mapped to a summary of their result (a Map in memory; stored as a plain object in the checkpoint JSON) */
|
|
11
|
+
completed: Map<string, {
|
|
12
|
+
status: number;
|
|
13
|
+
contentLength: number;
|
|
14
|
+
timestamp: number;
|
|
15
|
+
}>;
|
|
16
|
+
/** URLs queued but not yet crawled */
|
|
17
|
+
pending: string[];
|
|
18
|
+
/** URLs discovered but not yet queued */
|
|
19
|
+
discovered: string[];
|
|
20
|
+
/** Crawl options, stored in JSON-serializable form */
|
|
21
|
+
options: Record<string, any>;
|
|
22
|
+
/** When the crawl started (presumably epoch milliseconds, like lastCheckpoint; confirm against the crawler) */
|
|
23
|
+
startedAt: number;
|
|
24
|
+
/** Time of the last save in epoch milliseconds; saveCheckpoint overwrites it with Date.now() */
|
|
25
|
+
lastCheckpoint: number;
|
|
26
|
+
/** Total number of pages the crawl is aiming for */
|
|
27
|
+
maxPages: number;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Generate a deterministic job ID: the first 16 hex characters of a SHA-256 hash over the URL and the maxPages, maxDepth, includes and excludes options.
|
|
31
|
+
*/
|
|
32
|
+
export declare function generateJobId(url: string, options?: Record<string, any>): string;
|
|
33
|
+
/**
|
|
34
|
+
* Save a checkpoint to disk as JSON, stamping lastCheckpoint with the current time. Never throws; failures are only logged when DEBUG is set.
|
|
35
|
+
*/
|
|
36
|
+
export declare function saveCheckpoint(checkpoint: CrawlCheckpoint): void;
|
|
37
|
+
/**
|
|
38
|
+
* Load a checkpoint from disk. Returns null when no checkpoint file exists for the job or the file cannot be read or parsed.
|
|
39
|
+
*/
|
|
40
|
+
export declare function loadCheckpoint(jobId: string): CrawlCheckpoint | null;
|
|
41
|
+
/**
|
|
42
|
+
* Delete a checkpoint (crawl completed or abandoned). Does nothing when the file is absent; deletion errors are ignored.
|
|
43
|
+
*/
|
|
44
|
+
export declare function deleteCheckpoint(jobId: string): void;
|
|
45
|
+
/**
|
|
46
|
+
* List all checkpoints on disk as summaries, where completed and pending are counts rather than the URL collections. Unreadable files are skipped.
|
|
47
|
+
*/
|
|
48
|
+
export declare function listCheckpoints(): Array<{
|
|
49
|
+
jobId: string;
|
|
50
|
+
startUrl: string;
|
|
51
|
+
completed: number;
|
|
52
|
+
pending: number;
|
|
53
|
+
lastCheckpoint: number;
|
|
54
|
+
}>;
|
|
55
|
+
//# sourceMappingURL=crawl-checkpoint.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl-checkpoint.d.ts","sourceRoot":"","sources":["../../src/core/crawl-checkpoint.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAMH,MAAM,WAAW,eAAe;IAC9B,wDAAwD;IACxD,KAAK,EAAE,MAAM,CAAC;IACd,mBAAmB;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,gDAAgD;IAChD,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACrF,sCAAsC;IACtC,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,yCAAyC;IACzC,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,iCAAiC;IACjC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC7B,yBAAyB;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,2BAA2B;IAC3B,cAAc,EAAE,MAAM,CAAC;IACvB,yBAAyB;IACzB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAID;;GAEG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAM,GAAG,MAAM,CASpF;AASD;;GAEG;AACH,wBAAgB,cAAc,CAAC,UAAU,EAAE,eAAe,GAAG,IAAI,CAchE;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAapE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAOpD;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,KAAK,CAAC;IACvC,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA4BD"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawl checkpoint system for resume capability.
|
|
3
|
+
* Saves progress to a JSON file so interrupted crawls can continue.
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync, renameSync } from 'fs';
|
|
6
|
+
import { join } from 'path';
|
|
7
|
+
import { createHash } from 'crypto';
|
|
8
|
+
// Directory that holds one `<jobId>.json` checkpoint file per crawl job.
// Rooted at $HOME, falling back to '/tmp' when HOME is unset or empty.
// NOTE(review): HOME is usually not set on Windows, so the '/tmp' fallback would apply there; confirm that is intended.
const CHECKPOINT_DIR = join(process.env.HOME || '/tmp', '.webpeel', 'checkpoints');
|
|
9
|
+
/**
|
|
10
|
+
* Generate a deterministic job ID from URL + options.
|
|
11
|
+
*/
|
|
12
|
+
export function generateJobId(url, options = {}) {
|
|
13
|
+
const key = JSON.stringify({
|
|
14
|
+
url,
|
|
15
|
+
maxPages: options.maxPages,
|
|
16
|
+
maxDepth: options.maxDepth,
|
|
17
|
+
includes: options.includes,
|
|
18
|
+
excludes: options.excludes,
|
|
19
|
+
});
|
|
20
|
+
return createHash('sha256').update(key).digest('hex').slice(0, 16);
|
|
21
|
+
}
|
|
22
|
+
/**
 * Resolve where a job's checkpoint file lives on disk.
 *
 * @param jobId - Job identifier; it is interpolated into the file name as-is.
 * @returns The path `<CHECKPOINT_DIR>/<jobId>.json`.
 */
function getCheckpointPath(jobId) {
    const fileName = `${jobId}.json`;
    return join(CHECKPOINT_DIR, fileName);
}
|
|
28
|
+
/**
 * Save a checkpoint to disk.
 *
 * The checkpoint is serialized as pretty-printed JSON to
 * `<CHECKPOINT_DIR>/<jobId>.json`. The `completed` Map is converted to a plain
 * object and `lastCheckpoint` is overwritten with the current time.
 *
 * The JSON is written to a sibling staging file first and then renamed over
 * the real path. A process that is interrupted mid-write therefore leaves the
 * previous checkpoint in place instead of a truncated file that
 * loadCheckpoint would reject.
 *
 * Saving is best-effort: failures are never thrown, only logged when the
 * DEBUG environment variable is set.
 *
 * @param checkpoint - Crawl state to persist; `completed` must be iterable as
 *   [url, result] entries (e.g. a Map).
 */
export function saveCheckpoint(checkpoint) {
    try {
        mkdirSync(CHECKPOINT_DIR, { recursive: true });
        const data = {
            ...checkpoint,
            completed: Object.fromEntries(checkpoint.completed),
            lastCheckpoint: Date.now(),
        };
        const target = getCheckpointPath(checkpoint.jobId);
        // The staging name does not end in ".json", so listCheckpoints skips it.
        const staging = `${target}.${process.pid}.tmp`;
        try {
            writeFileSync(staging, JSON.stringify(data, null, 2));
            renameSync(staging, target);
        }
        catch (writeError) {
            // Do not leave a partial staging file behind; then report as usual.
            try {
                unlinkSync(staging);
            }
            catch { /* nothing to clean up */ }
            throw writeError;
        }
    }
    catch (e) {
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Failed to save checkpoint:', e instanceof Error ? e.message : e);
        }
    }
}
|
|
47
|
+
/**
 * Read a job's checkpoint back from disk.
 *
 * The `completed` field, stored as a plain object in the JSON file, is turned
 * back into a Map; a missing `completed` field yields an empty Map.
 *
 * @param jobId - Job identifier whose checkpoint should be loaded.
 * @returns The stored checkpoint, or null when the file does not exist or
 *   cannot be read or parsed.
 */
export function loadCheckpoint(jobId) {
    const file = getCheckpointPath(jobId);
    if (existsSync(file)) {
        try {
            const parsed = JSON.parse(readFileSync(file, 'utf-8'));
            const entries = Object.entries(parsed.completed || {});
            return { ...parsed, completed: new Map(entries) };
        }
        catch {
            return null;
        }
    }
    return null;
}
|
|
65
|
+
/**
 * Remove a job's checkpoint file (the crawl finished or was abandoned).
 *
 * Best-effort: a missing file is a no-op and any filesystem error is ignored.
 *
 * @param jobId - Job identifier whose checkpoint should be removed.
 */
export function deleteCheckpoint(jobId) {
    const target = getCheckpointPath(jobId);
    try {
        if (!existsSync(target)) {
            return;
        }
        unlinkSync(target);
    }
    catch {
        // Deletion is best-effort; errors are deliberately ignored.
    }
}
|
|
77
|
+
/**
 * Summarize every checkpoint stored in the checkpoint directory.
 *
 * Only files ending in `.json` are considered. A file that cannot be read or
 * parsed is skipped; if the directory is missing or cannot be listed, the
 * result is an empty array.
 *
 * @returns One summary per readable checkpoint: `jobId`, `startUrl`, the
 *   number of completed URLs, the number of pending URLs and `lastCheckpoint`.
 */
export function listCheckpoints() {
    try {
        if (!existsSync(CHECKPOINT_DIR)) {
            return [];
        }
        const summaries = [];
        for (const name of readdirSync(CHECKPOINT_DIR)) {
            if (!name.endsWith('.json')) {
                continue;
            }
            try {
                const parsed = JSON.parse(readFileSync(join(CHECKPOINT_DIR, name), 'utf-8'));
                summaries.push({
                    jobId: parsed.jobId,
                    startUrl: parsed.startUrl,
                    completed: Object.keys(parsed.completed || {}).length,
                    pending: (parsed.pending || []).length,
                    lastCheckpoint: parsed.lastCheckpoint,
                });
            }
            catch {
                // Unreadable or malformed checkpoint: leave it out of the listing.
            }
        }
        return summaries;
    }
    catch {
        return [];
    }
}
|
|
105
|
+
//# sourceMappingURL=crawl-checkpoint.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl-checkpoint.js","sourceRoot":"","sources":["../../src/core/crawl-checkpoint.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,SAAS,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,IAAI,CAAC;AACjG,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAuBpC,MAAM,cAAc,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,MAAM,EAAE,UAAU,EAAE,aAAa,CAAC,CAAC;AAEnF;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW,EAAE,UAA+B,EAAE;IAC1E,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;QACzB,GAAG;QACH,QAAQ,EAAE,OAAO,CAAC,QAAQ;QAC1B,QAAQ,EAAE,OAAO,CAAC,QAAQ;QAC1B,QAAQ,EAAE,OAAO,CAAC,QAAQ;QAC1B,QAAQ,EAAE,OAAO,CAAC,QAAQ;KAC3B,CAAC,CAAC;IACH,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AACrE,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAa;IACtC,OAAO,IAAI,CAAC,cAAc,EAAE,GAAG,KAAK,OAAO,CAAC,CAAC;AAC/C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,UAA2B;IACxD,IAAI,CAAC;QACH,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,IAAI,GAAG;YACX,GAAG,UAAU;YACb,SAAS,EAAE,MAAM,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC;YACnD,cAAc,EAAE,IAAI,CAAC,GAAG,EAAE;SAC3B,CAAC;QACF,aAAa,CAAC,iBAAiB,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACpF,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,4BAA4B,EAAE,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/F,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,MAAM,IAAI,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;IACtC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAEnC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC;QACpD,OAAO;YACL,GAAG,GAAG;YACN,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;SACxD,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,KAAa;IAC5C,MAAM,IAAI,GAAG,iBAAiB,CA
AC,KAAK,CAAC,CAAC;IACtC,IAAI,CAAC;QACH,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YACrB,UAAU,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAO7B,IAAI,CAAC;QACH,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC;YAAE,OAAO,EAAE,CAAC;QAC3C,MAAM,KAAK,GAAa,WAAW,CAAC,cAAc,CAAC,CAAC,MAAM,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QAE/F,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE;YACnB,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,cAAc,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;gBACvE,OAAO;oBACL,KAAK,EAAE,GAAG,CAAC,KAAK;oBAChB,QAAQ,EAAE,GAAG,CAAC,QAAQ;oBACtB,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC,MAAM;oBAClD,OAAO,EAAE,CAAC,GAAG,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,MAAM;oBACnC,cAAc,EAAE,GAAG,CAAC,cAAc;iBACnC,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAMf,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC"}
|
package/dist/core/crawler.d.ts
CHANGED
|
@@ -4,8 +4,10 @@
|
|
|
4
4
|
*/
|
|
5
5
|
import type { PeelOptions } from '../types.js';
|
|
6
6
|
export interface CrawlOptions extends Omit<PeelOptions, 'format'> {
|
|
7
|
-
/** Maximum number of pages to crawl (default: 10, max:
|
|
7
|
+
/** Maximum number of pages to crawl (default: 10, max: tier-dependent) */
|
|
8
8
|
maxPages?: number;
|
|
9
|
+
/** Tier for determining the max pages cap (default: 'free') */
|
|
10
|
+
tier?: string;
|
|
9
11
|
/** Maximum depth to crawl (default: 2, max: 5) */
|
|
10
12
|
maxDepth?: number;
|
|
11
13
|
/** Only crawl URLs from these domains (default: same domain as starting URL) */
|
|
@@ -28,6 +30,8 @@ export interface CrawlOptions extends Omit<PeelOptions, 'format'> {
|
|
|
28
30
|
onProgress?: (status: CrawlProgress) => void;
|
|
29
31
|
/** Per-page callback — receives the full result as soon as a page completes */
|
|
30
32
|
onPage?: (result: CrawlResult) => void;
|
|
33
|
+
/** Resume an interrupted crawl from its last checkpoint */
|
|
34
|
+
resume?: boolean;
|
|
31
35
|
}
|
|
32
36
|
export interface CrawlProgress {
|
|
33
37
|
crawled: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"crawler.d.ts","sourceRoot":"","sources":["../../src/core/crawler.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"crawler.d.ts","sourceRoot":"","sources":["../../src/core/crawler.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAkC/C,MAAM,WAAW,YAAa,SAAQ,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC;IAC/D,0EAA0E;IAC1E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+DAA+D;IAC/D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,gFAAgF;IAChF,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,2DAA2D;IAC3D,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,yCAAyC;IACzC,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,gFAAgF;IAChF,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,8DAA8D;IAC9D,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,oEAAoE;IACpE,QAAQ,CAAC,EAAE,KAAK,GAAG,KAAK,CAAC;IACzB,kEAAkE;IAClE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,oDAAoD;IACpD,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,+CAA+C;IAC/C,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,aAAa,KAAK,IAAI,CAAC;IAC7C,+EAA+E;IAC/E,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,WAAW,KAAK,IAAI,CAAC;IACvC,2DAA2D;IAC3D,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,WAAW;IAC1B,8BAA8B;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,iBAAiB;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,qCAAqC;IACrC,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,2CAA2C;IAC3C,OAAO,EAAE,MAAM,CAAC;IAChB,4CAA4C;IAC5C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,4CAA4C;IAC5C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAsFD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,KAAK,CACzB,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,WAAW,EAAE,CAAC,CAsRxB"}
|