@robot-resources/scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,373 @@
1
+ /**
2
+ * Types for scraper
3
+ * Context compression for AI agents
4
+ */
5
+ /**
6
+ * Fetch tier selection for scrape()
7
+ *
8
+ * - 'fast': Tier 1 — plain HTTP fetch (default Node.js fetch)
9
+ * - 'stealth': Tier 2 — TLS fingerprint impersonation (bypasses anti-bot)
10
+ * - 'render': Tier 3 — Playwright headless browser (JS-rendered pages)
11
+ * - 'auto': Try fast first; fall back to stealth on HTTP 403 or a detected anti-bot challenge page
12
+ */
13
+ type FetchMode = 'fast' | 'stealth' | 'render' | 'auto';
14
+ /**
15
+ * Options for the scrape() function
16
+ */
17
+ interface ScrapeOptions {
18
+ /** Fetch tier to use (default: 'auto') */
19
+ mode?: FetchMode;
20
+ /** Request timeout in milliseconds (default: 10000) */
21
+ timeout?: number;
22
+ /** Maximum retry attempts for failed requests (default: 3) */
23
+ maxRetries?: number;
24
+ /** Custom user agent string */
25
+ userAgent?: string;
26
+ /** Check robots.txt before fetching (default: false) */
27
+ respectRobots?: boolean;
28
+ }
29
+ /**
30
+ * Result from the scrape() function
31
+ */
32
+ interface ScrapeResult {
33
+ /** Compressed markdown content */
34
+ markdown: string;
35
+ /** Estimated token count */
36
+ tokenCount: number;
37
+ /** Page title if extracted */
38
+ title?: string;
39
+ /** Author if extracted */
40
+ author?: string;
41
+ /** Site name if extracted */
42
+ siteName?: string;
43
+ /** Published date if extracted */
44
+ publishedAt?: string;
45
+ /** Final URL after redirects */
46
+ url: string;
47
+ }
48
+ /**
49
+ * Result from fetch layer
50
+ */
51
+ interface FetchResult {
52
+ /** Raw HTML content */
53
+ html: string;
54
+ /** Final URL after redirects */
55
+ url: string;
56
+ /** HTTP status code */
57
+ statusCode: number;
58
+ /** Response headers */
59
+ headers: Record<string, string>;
60
+ }
61
+ /**
62
+ * Result from extract layer
63
+ */
64
+ interface ExtractResult {
65
+ /** Extracted HTML content (article body) */
66
+ content: string;
67
+ /** Page title */
68
+ title?: string;
69
+ /** Author name */
70
+ author?: string;
71
+ /** Published timestamp */
72
+ publishedAt?: string;
73
+ /** Site name */
74
+ siteName?: string;
75
+ }
76
+ /**
77
+ * Result from convert layer
78
+ */
79
+ interface ConvertResult {
80
+ /** Markdown content */
81
+ markdown: string;
82
+ /** Estimated token count */
83
+ tokenCount: number;
84
+ }
85
+ /**
86
+ * Options for the crawl() function
87
+ */
88
+ interface CrawlOptions {
89
+ /** Starting URL to crawl */
90
+ url: string;
91
+ /** Max crawl depth — 0 means only starting URL (default: 2) */
92
+ depth?: number;
93
+ /** Max pages to crawl (default: 50) */
94
+ limit?: number;
95
+ /** Fetch mode per page (default: 'auto') */
96
+ mode?: FetchMode;
97
+ /** URL patterns to include — glob-style (if set, URL must match at least one) */
98
+ include?: string[];
99
+ /** URL patterns to exclude — glob-style (URL must not match any) */
100
+ exclude?: string[];
101
+ /** Per-page timeout in milliseconds (default: 10000) */
102
+ timeout?: number;
103
+ /** Parallel page fetches (default: 3) */
104
+ concurrency?: number;
105
+ /** Check robots.txt before crawling (default: true) */
106
+ respectRobots?: boolean;
107
+ }
108
+ /**
109
+ * Result from a single crawled page — extends ScrapeResult with crawl depth
110
+ */
111
+ interface CrawlPageResult extends ScrapeResult {
112
+ /** Crawl depth where this page was discovered */
113
+ depth: number;
114
+ }
115
+ /**
116
+ * Error for a single URL during crawl
117
+ */
118
+ interface CrawlError {
119
+ /** URL that failed */
120
+ url: string;
121
+ /** Error message */
122
+ error: string;
123
+ /** Crawl depth where error occurred */
124
+ depth: number;
125
+ }
126
+ /**
127
+ * Aggregate result from crawl()
128
+ */
129
+ interface CrawlResult {
130
+ /** Successfully scraped pages */
131
+ pages: CrawlPageResult[];
132
+ /** Total URLs discovered (including skipped/errored) */
133
+ totalDiscovered: number;
134
+ /** Total URLs successfully scraped */
135
+ totalCrawled: number;
136
+ /** Total URLs skipped (robots, filter, limit) */
137
+ totalSkipped: number;
138
+ /** Per-URL errors */
139
+ errors: CrawlError[];
140
+ /** Total crawl duration in milliseconds */
141
+ duration: number;
142
+ }
143
+
144
+ /**
145
+ * Layer 1: Fetch
146
+ * HTTP fetching with smart headers and retries
147
+ */
148
+
149
+ interface FetchOptions {
150
+ timeout?: number;
151
+ maxRetries?: number;
152
+ userAgent?: string;
153
+ }
154
+ /**
155
+ * Error class for fetch-related errors
156
+ */
157
+ declare class FetchError extends Error {
158
+ readonly statusCode?: number | undefined;
159
+ readonly retryable: boolean;
160
+ constructor(message: string, statusCode?: number | undefined, retryable?: boolean);
161
+ }
162
+ /**
163
+ * Fetch URL content with smart headers and retry logic
164
+ */
165
+ declare function fetchUrl(url: string, options?: FetchOptions): Promise<FetchResult>;
166
+
167
+ /**
168
+ * Layer 1b: Stealth Fetch
169
+ * TLS fingerprint impersonation via impit (optional peer dependency)
170
+ *
171
+ * Uses Rust-based browser TLS fingerprinting to bypass anti-bot systems
172
+ * (Cloudflare, Akamai, PerimeterX) without a full browser.
173
+ *
174
+ * Requires: npm install impit (Node >= 20)
175
+ */
176
+
177
+ /**
178
+ * Fetch URL with browser TLS fingerprint impersonation.
179
+ *
180
+ * Uses impit (Apify) to produce Chrome-like JA3/JA4 fingerprints at the
181
+ * TLS handshake level. This bypasses anti-bot systems that reject default
182
+ * Node.js TLS signatures.
183
+ *
184
+ * impit is an optional peer dependency — if not installed, throws a clear
185
+ * error message with install instructions.
186
+ */
187
+ declare function fetchStealth(url: string, options?: FetchOptions): Promise<FetchResult>;
188
+
189
+ /**
190
+ * Layer 1c: Render Fetch
191
+ * Playwright headless browser for JS-rendered pages (optional peer dependency)
192
+ *
193
+ * Uses Chromium to fully render SPAs (React, Next.js, Vue) that return
194
+ * empty/partial HTML to tiers 1 and 2. Extracts the fully rendered DOM.
195
+ *
196
+ * Requires: npm install playwright
197
+ */
198
+
199
+ /**
200
+ * Fetch URL using a headless Chromium browser to render JavaScript.
201
+ *
202
+ * Launches a fresh browser per call (no shared state), navigates to the URL,
203
+ * waits for network idle, then extracts the fully rendered HTML.
204
+ *
205
+ * Playwright is an optional peer dependency — if not installed, throws a clear
206
+ * error message with install instructions.
207
+ */
208
+ declare function fetchRender(url: string, options?: FetchOptions): Promise<FetchResult>;
209
+
210
+ /**
211
+ * Layer 2: Extract
212
+ * Content extraction using Readability
213
+ */
214
+
215
+ /**
216
+ * Error class for extraction-related errors
217
+ */
218
+ declare class ExtractionError extends Error {
219
+ readonly code: string;
220
+ constructor(message: string, code: string);
221
+ }
222
+ /**
223
+ * Extract main content from HTML using Readability
224
+ */
225
+ declare function extractContent(fetchResult: FetchResult): Promise<ExtractResult>;
226
+
227
+ /**
228
+ * Layer 3: Convert
229
+ * HTML to Markdown conversion
230
+ */
231
+
232
+ /**
233
+ * Convert extracted HTML to clean Markdown
234
+ */
235
+ declare function convertToMarkdown(extractResult: ExtractResult): Promise<ConvertResult>;
236
+ /**
237
+ * Content-aware token estimator.
238
+ *
239
+ * Segments text by content type and applies calibrated character-per-token
240
+ * ratios derived from empirical measurements of the cl100k_base (GPT-4) tokenizer.
241
+ *
242
+ * Ratios:
243
+ * Code blocks — 3.2 chars/token (operators, camelCase split into subwords)
244
+ * Inline code — 3.5 chars/token (variable names, short expressions)
245
+ * URLs — 5.0 chars/token (path segments tokenize efficiently)
246
+ * Prose — 4.3 chars/token (words, punctuation, markdown formatting)
247
+ *
248
+ * Accuracy: within ±15% of actual BPE tokenization for English content.
249
+ */
250
+ declare function estimateTokens(text: string): number;
251
+
252
+ /**
253
+ * robots.txt compliance layer
254
+ *
255
+ * Opt-in for single-page scraping (ScrapeOptions.respectRobots, default: false).
255
+ * Also the foundation for crawl mode (FTR-ORG-019), where the check is on by default (CrawlOptions.respectRobots, default: true).
257
+ */
258
+ /**
259
+ * Check if a URL is allowed by robots.txt.
260
+ * Returns true if allowed or if robots.txt cannot be fetched (fail-open).
261
+ */
262
+ declare function isAllowedByRobots(url: string, timeout?: number): Promise<boolean>;
263
+ /**
264
+ * Extract Sitemap: URLs from robots.txt.
265
+ * Returns empty array if no Sitemap directives or robots.txt unreachable (fail-open).
266
+ * Reuses cached robots.txt parser.
267
+ */
268
+ declare function getSitemapUrls(url: string, timeout?: number): Promise<string[]>;
269
+ /**
270
+ * Extract the Crawl-delay value from robots.txt for the ScraperBot user agent.
271
+ * Returns delay in seconds, or null if not specified or robots.txt unreachable (fail-open).
272
+ * Reuses cached robots.txt parser.
273
+ */
274
+ declare function getCrawlDelay(url: string, timeout?: number): Promise<number | null>;
275
+ /**
276
+ * Clear the robots.txt cache. Exported for testing.
277
+ */
278
+ declare function clearRobotsCache(): void;
279
+
280
+ /**
281
+ * Sitemap parser
282
+ *
283
+ * Fetches and parses sitemap.xml for crawl mode seed URLs.
284
+ * Regex-based XML parsing — no XML parser dependency.
285
+ * Handles sitemap index files with a recursion limit (max depth 2).
286
+ * Mirrors robots.ts fetch+cache+fail-open pattern.
287
+ */
288
+ /**
289
+ * A single entry from a sitemap
290
+ */
291
+ interface SitemapEntry {
292
+ /** URL from <loc> tag */
293
+ loc: string;
294
+ /** Last modification date from <lastmod> tag */
295
+ lastmod?: string;
296
+ /** Priority from <priority> tag (0.0 to 1.0) */
297
+ priority?: number;
298
+ }
299
+ /**
300
+ * Parse a sitemap.xml and return URL entries.
301
+ *
302
+ * - Fetches the sitemap from the given URL
303
+ * - Extracts <loc> URLs via regex (no XML parser dependency)
304
+ * - Handles sitemap index files (recursive, max depth 2)
305
+ * - Caches results per URL with 1-hour TTL
306
+ * - Fail-open: returns empty array if sitemap is unreachable or invalid
307
+ * - Filters to same-origin URLs only
308
+ *
309
+ * @param url - Full URL of the sitemap.xml
310
+ * @param timeout - Fetch timeout in ms (default: 10000)
311
+ * @returns Array of SitemapEntry objects
312
+ */
313
+ declare function parseSitemap(url: string, timeout?: number): Promise<SitemapEntry[]>;
314
+ /**
315
+ * Clear the sitemap cache. Exported for testing.
316
+ */
317
+ declare function clearSitemapCache(): void;
318
+
319
+ /**
320
+ * Crawl: BFS multi-page orchestrator
321
+ * TKT-SCRAPER-079: Crawl multiple pages from a starting URL
322
+ *
323
+ * Composes: sitemap seeding, robots.txt, link extraction, URL normalization,
324
+ * depth/limit/filter constraints, concurrency, and scrape pipeline per page.
325
+ */
326
+
327
+ declare function normalizeUrl(url: string): string;
328
+ declare function extractLinks(html: string, baseUrl: string): string[];
329
+ declare function crawl(options: CrawlOptions): Promise<CrawlResult>;
330
+
331
+ /**
332
+ * Mode-aware fetch routing with challenge detection.
333
+ *
334
+ * Shared between scrape() (index.ts) and crawl() (crawl.ts)
335
+ * to avoid duplicating mode routing and challenge detection logic.
336
+ */
337
+
338
+ /**
339
+ * Detect anti-bot challenge pages that return HTTP 200 but contain
340
+ * challenge/verification HTML instead of real content.
341
+ */
342
+ declare function isChallengeResponse(fetchResult: FetchResult): boolean;
343
+ declare function fetchWithMode(url: string, mode: FetchMode, options: FetchOptions): Promise<FetchResult>;
344
+
345
+ /**
346
+ * scraper
347
+ * Context compression for AI agents
348
+ *
349
+ * @packageDocumentation
350
+ */
351
+
352
+ /**
353
+ * Compress web content for AI agents
354
+ *
355
+ * Pipeline: Fetch -> Extract -> Convert
356
+ * No LLM dependency. Reduces tokens by 70-80%.
357
+ *
358
+ * @example
359
+ * ```typescript
360
+ * import { scrape } from '@robot-resources/scraper';
361
+ *
362
+ * const result = await scrape('https://example.com/article');
363
+ * console.log(result.markdown);
364
+ * console.log(result.tokenCount);
365
+ * ```
366
+ *
367
+ * @param url - URL to fetch and compress
368
+ * @param options - Optional configuration
369
+ * @returns Compressed content with metadata
370
+ */
371
+ declare function scrape(url: string, options?: ScrapeOptions): Promise<ScrapeResult>;
372
+
373
+ export { type ConvertResult, type CrawlError, type CrawlOptions, type CrawlPageResult, type CrawlResult, type ExtractResult, ExtractionError, FetchError, type FetchMode, type FetchResult, type ScrapeOptions, type ScrapeResult, type SitemapEntry, clearRobotsCache, clearSitemapCache, convertToMarkdown, crawl, estimateTokens, extractContent, extractLinks, fetchRender, fetchStealth, fetchUrl, fetchWithMode, getCrawlDelay, getSitemapUrls, isAllowedByRobots, isChallengeResponse, normalizeUrl, parseSitemap, scrape };