@robot-resources/scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +228 -0
- package/bin/setup.js +135 -0
- package/dist/index.cjs +1002 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +373 -0
- package/dist/index.d.ts +373 -0
- package/dist/index.js +976 -0
- package/dist/index.js.map +1 -0
- package/package.json +93 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,373 @@
/**
 * Types for scraper
 * Context compression for AI agents
 */
/**
 * Fetch tier selection for scrape()
 *
 * - 'fast': Tier 1 — plain HTTP fetch (default Node.js fetch)
 * - 'stealth': Tier 2 — TLS fingerprint impersonation (bypasses anti-bot)
 * - 'render': Tier 3 — Playwright headless browser (JS-rendered pages)
 * - 'auto': Try fast first, fall back to stealth on 403/challenge detection
 */
type FetchMode = 'fast' | 'stealth' | 'render' | 'auto';
/**
 * Options for the scrape() function
 */
interface ScrapeOptions {
  /** Fetch tier to use (default: 'auto') */
  mode?: FetchMode;
  /** Request timeout in milliseconds (default: 10000) */
  timeout?: number;
  /** Maximum retry attempts for failed requests (default: 3) */
  maxRetries?: number;
  /** Custom user agent string */
  userAgent?: string;
  /** Check robots.txt before fetching (default: false) */
  respectRobots?: boolean;
}
/**
 * Result from the scrape() function
 */
interface ScrapeResult {
  /** Compressed markdown content */
  markdown: string;
  /** Estimated token count */
  tokenCount: number;
  /** Page title if extracted */
  title?: string;
  /** Author if extracted */
  author?: string;
  /** Site name if extracted */
  siteName?: string;
  /** Published date if extracted */
  publishedAt?: string;
  /** Final URL after redirects */
  url: string;
}
/**
 * Result from fetch layer
 */
interface FetchResult {
  /** Raw HTML content */
  html: string;
  /** Final URL after redirects */
  url: string;
  /** HTTP status code */
  statusCode: number;
  /** Response headers */
  headers: Record<string, string>;
}
/**
 * Result from extract layer
 */
interface ExtractResult {
  /** Extracted HTML content (article body) */
  content: string;
  /** Page title */
  title?: string;
  /** Author name */
  author?: string;
  /** Published timestamp */
  publishedAt?: string;
  /** Site name */
  siteName?: string;
}
/**
 * Result from convert layer
 */
interface ConvertResult {
  /** Markdown content */
  markdown: string;
  /** Estimated token count */
  tokenCount: number;
}
/**
 * Options for the crawl() function
 */
interface CrawlOptions {
  /** Starting URL to crawl */
  url: string;
  /** Max crawl depth — 0 means only starting URL (default: 2) */
  depth?: number;
  /** Max pages to crawl (default: 50) */
  limit?: number;
  /** Fetch mode per page (default: 'auto') */
  mode?: FetchMode;
  /** URL patterns to include — glob-style (if set, URL must match at least one) */
  include?: string[];
  /** URL patterns to exclude — glob-style (URL must not match any) */
  exclude?: string[];
  /** Per-page timeout in milliseconds (default: 10000) */
  timeout?: number;
  /** Parallel page fetches (default: 3) */
  concurrency?: number;
  /** Check robots.txt before crawling (default: true) */
  respectRobots?: boolean;
}
/**
 * Result from a single crawled page — extends ScrapeResult with crawl depth
 */
interface CrawlPageResult extends ScrapeResult {
  /** Crawl depth where this page was discovered */
  depth: number;
}
/**
 * Error for a single URL during crawl
 */
interface CrawlError {
  /** URL that failed */
  url: string;
  /** Error message */
  error: string;
  /** Crawl depth where error occurred */
  depth: number;
}
/**
 * Aggregate result from crawl()
 */
interface CrawlResult {
  /** Successfully scraped pages */
  pages: CrawlPageResult[];
  /** Total URLs discovered (including skipped/errored) */
  totalDiscovered: number;
  /** Total URLs successfully scraped */
  totalCrawled: number;
  /** Total URLs skipped (robots, filter, limit) */
  totalSkipped: number;
  /** Per-URL errors */
  errors: CrawlError[];
  /** Total crawl duration in milliseconds */
  duration: number;
}
/**
 * Layer 1: Fetch
 * HTTP fetching with smart headers and retries
 */

/** Options accepted by the tier-1/2/3 fetch functions. */
interface FetchOptions {
  /** Request timeout in milliseconds */
  timeout?: number;
  /** Maximum retry attempts for failed requests */
  maxRetries?: number;
  /** Custom user agent string */
  userAgent?: string;
}
/**
 * Error class for fetch-related errors
 */
declare class FetchError extends Error {
  /** HTTP status code of the failed response, when one was received */
  readonly statusCode?: number | undefined;
  /** Whether the failure is transient and the request may be retried */
  readonly retryable: boolean;
  constructor(message: string, statusCode?: number | undefined, retryable?: boolean);
}
/**
 * Fetch URL content with smart headers and retry logic
 */
declare function fetchUrl(url: string, options?: FetchOptions): Promise<FetchResult>;
/**
 * Layer 1b: Stealth Fetch
 * TLS fingerprint impersonation via impit (optional peer dependency)
 *
 * Uses Rust-based browser TLS fingerprinting to bypass anti-bot systems
 * (Cloudflare, Akamai, PerimeterX) without a full browser.
 *
 * Requires: npm install impit (Node >= 20)
 */

/**
 * Fetch URL with browser TLS fingerprint impersonation.
 *
 * Uses impit (Apify) to produce Chrome-like JA3/JA4 fingerprints at the
 * TLS handshake level. This bypasses anti-bot systems that reject default
 * Node.js TLS signatures.
 *
 * impit is an optional peer dependency — if not installed, throws a clear
 * error message with install instructions.
 */
declare function fetchStealth(url: string, options?: FetchOptions): Promise<FetchResult>;
/**
 * Layer 1c: Render Fetch
 * Playwright headless browser for JS-rendered pages (optional peer dependency)
 *
 * Uses Chromium to fully render SPAs (React, Next.js, Vue) that return
 * empty/partial HTML to tiers 1 and 2. Extracts the fully rendered DOM.
 *
 * Requires: npm install playwright
 */

/**
 * Fetch URL using a headless Chromium browser to render JavaScript.
 *
 * Launches a fresh browser per call (no shared state), navigates to the URL,
 * waits for network idle, then extracts the fully rendered HTML.
 *
 * Playwright is an optional peer dependency — if not installed, throws a clear
 * error message with install instructions.
 */
declare function fetchRender(url: string, options?: FetchOptions): Promise<FetchResult>;
/**
 * Layer 2: Extract
 * Content extraction using Readability
 */

/**
 * Error class for extraction-related errors
 */
declare class ExtractionError extends Error {
  /** Machine-readable error code identifying the extraction failure */
  readonly code: string;
  constructor(message: string, code: string);
}
/**
 * Extract main content from HTML using Readability
 */
declare function extractContent(fetchResult: FetchResult): Promise<ExtractResult>;
/**
 * Layer 3: Convert
 * HTML to Markdown conversion
 */

/**
 * Convert extracted HTML to clean Markdown
 */
declare function convertToMarkdown(extractResult: ExtractResult): Promise<ConvertResult>;
/**
 * Content-aware token estimator.
 *
 * Segments text by content type and applies calibrated character-per-token
 * ratios derived from cl100k_base (GPT-4) empirical measurements.
 *
 * Ratios:
 *   Code blocks — 3.2 chars/token (operators, camelCase split into subwords)
 *   Inline code — 3.5 chars/token (variable names, short expressions)
 *   URLs — 5.0 chars/token (path segments tokenize efficiently)
 *   Prose — 4.3 chars/token (words, punctuation, markdown formatting)
 *
 * Accuracy: within ±15% of actual BPE tokenization for English content.
 */
declare function estimateTokens(text: string): number;
/**
 * robots.txt compliance layer
 *
 * Opt-in for single-page scraping (respectRobots option),
 * foundation for FTR-ORG-019 crawl mode where it becomes default.
 */
/**
 * Check if a URL is allowed by robots.txt.
 * Returns true if allowed or if robots.txt cannot be fetched (fail-open).
 */
declare function isAllowedByRobots(url: string, timeout?: number): Promise<boolean>;
/**
 * Extract Sitemap: URLs from robots.txt.
 * Returns empty array if no Sitemap directives or robots.txt unreachable (fail-open).
 * Reuses cached robots.txt parser.
 */
declare function getSitemapUrls(url: string, timeout?: number): Promise<string[]>;
/**
 * Extract Crawl-delay value from robots.txt for ScraperBot user agent.
 * Returns delay in seconds, or null if not specified or robots.txt unreachable (fail-open).
 * Reuses cached robots.txt parser.
 */
declare function getCrawlDelay(url: string, timeout?: number): Promise<number | null>;
/**
 * Clear the robots.txt cache. Exported for testing.
 */
declare function clearRobotsCache(): void;
/**
 * Sitemap parser
 *
 * Fetches and parses sitemap.xml for crawl mode seed URLs.
 * Regex-based XML parsing — no XML parser dependency.
 * Handles sitemap index files with recursion limit.
 * Mirrors robots.ts fetch+cache+fail-open pattern.
 */
/**
 * A single entry from a sitemap
 */
interface SitemapEntry {
  /** URL from <loc> tag */
  loc: string;
  /** Last modification date from <lastmod> tag */
  lastmod?: string;
  /** Priority from <priority> tag (0.0 to 1.0) */
  priority?: number;
}
/**
 * Parse a sitemap.xml and return URL entries.
 *
 * - Fetches the sitemap from the given URL
 * - Extracts <loc> URLs via regex (no XML parser dependency)
 * - Handles sitemap index files (recursive, max depth 2)
 * - Caches results per URL with 1-hour TTL
 * - Fail-open: returns empty array if sitemap is unreachable or invalid
 * - Filters to same-origin URLs only
 *
 * @param url - Full URL of the sitemap.xml
 * @param timeout - Fetch timeout in ms (default: 10000)
 * @returns Array of SitemapEntry objects
 */
declare function parseSitemap(url: string, timeout?: number): Promise<SitemapEntry[]>;
/**
 * Clear the sitemap cache. Exported for testing.
 */
declare function clearSitemapCache(): void;
/**
 * Crawl: BFS multi-page orchestrator
 * TKT-SCRAPER-079: Crawl multiple pages from a starting URL
 *
 * Composes: sitemap seeding, robots.txt, link extraction, URL normalization,
 * depth/limit/filter constraints, concurrency, and scrape pipeline per page.
 */

/**
 * Normalize a URL to a canonical string form.
 * NOTE(review): presumably used for crawl-frontier deduplication (see
 * "URL normalization" in the header above) — confirm against implementation.
 */
declare function normalizeUrl(url: string): string;
/**
 * Extract link URLs from an HTML document.
 * NOTE(review): baseUrl suggests relative links are resolved against it —
 * confirm against implementation.
 */
declare function extractLinks(html: string, baseUrl: string): string[];
/**
 * Crawl multiple pages starting from options.url (BFS), applying the
 * depth/limit/include/exclude/concurrency constraints in CrawlOptions.
 */
declare function crawl(options: CrawlOptions): Promise<CrawlResult>;
/**
 * Mode-aware fetch routing with challenge detection.
 *
 * Shared between scrape() (index.ts) and crawl() (crawl.ts)
 * to avoid duplicating mode routing and challenge detection logic.
 */

/**
 * Detect anti-bot challenge pages that return HTTP 200 but contain
 * challenge/verification HTML instead of real content.
 */
declare function isChallengeResponse(fetchResult: FetchResult): boolean;
/**
 * Fetch a URL using the requested FetchMode ('fast' | 'stealth' | 'render' |
 * 'auto'), routing to the matching fetch tier. See the FetchMode type for
 * per-mode semantics.
 */
declare function fetchWithMode(url: string, mode: FetchMode, options: FetchOptions): Promise<FetchResult>;
/**
 * scraper
 * Context compression for AI agents
 *
 * @packageDocumentation
 */

/**
 * Compress web content for AI agents
 *
 * Pipeline: Fetch -> Extract -> Convert
 * No LLM dependency. Reduces tokens by 70-80%.
 *
 * @example
 * ```typescript
 * import { scrape } from '@robot-resources/scraper';
 *
 * const result = await scrape('https://example.com/article');
 * console.log(result.markdown);
 * console.log(result.tokenCount);
 * ```
 *
 * @param url - URL to fetch and compress
 * @param options - Optional configuration
 * @returns Compressed content with metadata
 */
declare function scrape(url: string, options?: ScrapeOptions): Promise<ScrapeResult>;

export { type ConvertResult, type CrawlError, type CrawlOptions, type CrawlPageResult, type CrawlResult, type ExtractResult, ExtractionError, FetchError, type FetchMode, type FetchResult, type ScrapeOptions, type ScrapeResult, type SitemapEntry, clearRobotsCache, clearSitemapCache, convertToMarkdown, crawl, estimateTokens, extractContent, extractLinks, fetchRender, fetchStealth, fetchUrl, fetchWithMode, getCrawlDelay, getSitemapUrls, isAllowedByRobots, isChallengeResponse, normalizeUrl, parseSitemap, scrape };