scraply 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.d.ts ADDED
@@ -0,0 +1,285 @@
1
+ import type { CheerioAPI } from 'cheerio';
2
+
3
+ export type LogLevel = 'silent' | 'error' | 'warn' | 'info' | 'debug';
4
+ export type OutputFormat = 'json' | 'jsonl' | 'lines';
5
+ export type ContentKind = 'html' | 'json' | 'text';
6
+ export type UrlPattern = string | RegExp;
7
+ export type WaitUntil = 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
8
+ export type BlockableResource = 'image' | 'stylesheet' | 'font' | 'media';
9
+
10
+ /**
11
+ * List fields accept a plain array (replaces the default) or a directive object
12
+ * that combines with Scraply's defaults.
13
+ */
14
+ export type ListInput<T> = T[] | { extend?: T[]; append?: T[]; prepend?: T[]; replace?: T[] };
15
+
16
+ export interface RequestConfig {
17
+ timeout: number;
18
+ maxRedirects: number;
19
+ /** Hard cap on the response body in bytes; 0 disables it. */
20
+ maxContentLength: number;
21
+ userAgent: string;
22
+ headers: Record<string, string>;
23
+ }
24
+
25
+ export interface RetryConfig {
26
+ max: number;
27
+ statusCodes: number[];
28
+ delay: number;
29
+ }
30
+
31
+ export interface RateLimitConfig {
32
+ fallbackDelay: number;
33
+ /** false: wait & retry. true: abort the crawl with a RateLimitError. */
34
+ exitOnLimit: boolean;
35
+ exitCode: number;
36
+ }
37
+
38
+ export interface CrawlConfig {
39
+ concurrency: number;
40
+ /** Minimum spacing (ms) between requests to the same host. */
41
+ delay: number;
42
+ maxDepth: number;
43
+ maxPages: number;
44
+ resetOnComplete: boolean;
45
+ retryErrors: boolean;
46
+ retrySkipped: boolean;
47
+ /** true seeds <origin>/sitemap.xml per start URL, or pass explicit sitemap URLs. */
48
+ sitemap: boolean | string[];
49
+ }
50
+
51
+ export interface BrowserConfig {
52
+ waitUntil: WaitUntil;
53
+ blockResources: BlockableResource[];
54
+ }
55
+
56
+ export interface ExtractConfig {
57
+ /** Allow-list container(s) to read text from; null = whole <body>. */
58
+ root?: string | string[] | null;
59
+ /** Selector used when `root` matches nothing (default 'body'). */
60
+ rootFallback?: string;
61
+ /** Parse JSON bodies into pretty content + record.data (default true). */
62
+ json?: boolean;
63
+ /** Elements stripped before text extraction. */
64
+ removeSelectors?: string[];
65
+ }
66
+
67
+ export interface OutputConfig {
68
+ format: OutputFormat;
69
+ exclude: UrlPattern[];
70
+ routes: Record<string, Record<string, string>>;
71
+ }
72
+
73
+ /** Per-origin/route override applied to URLs matching `match`. */
74
+ export interface SiteConfig {
75
+ match: UrlPattern | UrlPattern[];
76
+ allowedContentTypes?: string[];
77
+ extract?: ExtractConfig;
78
+ }
79
+
80
+ export interface FetchResult {
81
+ data: string | ArrayBuffer;
82
+ status: number;
83
+ /** Header keys are lowercased. */
84
+ headers: Record<string, string>;
85
+ }
86
+
87
+ export interface Fetcher {
88
+ name: string;
89
+ fetch(url: string): Promise<FetchResult>;
90
+ init?(): Promise<void>;
91
+ close?(): Promise<void>;
92
+ }
93
+
94
+ export interface Logger {
95
+ level: LogLevel;
96
+ error(...args: unknown[]): void;
97
+ warn(...args: unknown[]): void;
98
+ info(...args: unknown[]): void;
99
+ debug(...args: unknown[]): void;
100
+ }
101
+
102
+ export interface FetcherDeps {
103
+ config: ResolvedConfig;
104
+ logger: Logger;
105
+ }
106
+
107
+ export interface QueueEntry {
108
+ url: string;
109
+ /** Filename of the saved crawled record (relative to crawledDir), or null. */
110
+ file: string | null;
111
+ status: number | null;
112
+ error: string | null;
113
+ skipped: string | null;
114
+ referrer: string | null;
115
+ depth: number;
116
+ }
117
+
118
+ /** A crawled record. `data` is present for JSON sources; transform hooks may add fields. */
119
+ export interface CrawlRecord {
120
+ url: string;
121
+ content: string;
122
+ crawledAt: string;
123
+ hash: string;
124
+ data?: unknown;
125
+ [key: string]: unknown;
126
+ }
127
+
128
+ export interface QueueManager {
129
+ entries: QueueEntry[];
130
+ load(): QueueEntry[];
131
+ seed(urls: string[]): void;
132
+ add(url: string, opts?: { depth?: number; referrer?: string | null }): boolean;
133
+ claimNext(): QueueEntry | null;
134
+ requeueErrors(): number;
135
+ requeueSkipped(): number;
136
+ isAllProcessed(): boolean;
137
+ pendingCount(): number;
138
+ crawledCount(): number;
139
+ errorCount(): number;
140
+ skippedCount(): number;
141
+ flush(): void;
142
+ reset(): void;
143
+ }
144
+
145
+ export interface ScraplyConfig {
146
+ startUrls?: string[];
147
+ include?: ListInput<UrlPattern>;
148
+ exclude?: ListInput<UrlPattern>;
149
+ allowedContentTypes?: ListInput<string>;
150
+ sites?: SiteConfig[];
151
+ fetcher?: 'http' | 'browser' | Fetcher;
152
+ browser?: Partial<BrowserConfig>;
153
+ logLevel?: LogLevel;
154
+ /** Install SIGINT/SIGTERM handlers for a graceful stop (default true). */
155
+ signals?: boolean;
156
+ storage?: { dir?: string };
157
+ request?: Partial<RequestConfig>;
158
+ retry?: Partial<RetryConfig>;
159
+ rateLimit?: Partial<RateLimitConfig>;
160
+ crawl?: Partial<CrawlConfig>;
161
+ extract?: Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> };
162
+ output?: Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<UrlPattern> };
163
+ }
164
+
165
+ export interface ResolvedConfig {
166
+ startUrls: string[];
167
+ include: UrlPattern[];
168
+ exclude: UrlPattern[];
169
+ allowedContentTypes: string[];
170
+ sites: Array<{ match: UrlPattern[]; allowedContentTypes?: string[]; extract?: ExtractConfig }>;
171
+ fetcher: 'http' | 'browser' | Fetcher;
172
+ browser: BrowserConfig;
173
+ logLevel: LogLevel;
174
+ signals: boolean;
175
+ storage: { dir: string; queuePath: string; crawledDir: string; formattedDir: string };
176
+ request: RequestConfig;
177
+ retry: RetryConfig;
178
+ rateLimit: RateLimitConfig;
179
+ crawl: CrawlConfig;
180
+ extract: ExtractConfig & { removeSelectors: string[] };
181
+ output: OutputConfig;
182
+ }
183
+
184
+ /** Lifecycle hooks. Reduce hooks may return a replacement value; emit hooks are side-effect only. */
185
+ export interface HookMap {
186
+ /** Fires right after a successful fetch, before the content-type gate. */
187
+ response: (result: FetchResult, entry: QueueEntry) => void | Promise<void>;
188
+ /** Fires when a response is skipped (e.g. disallowed content-type). */
189
+ skip: (
190
+ entry: QueueEntry,
191
+ info: { reason: string; status: number | null; result: FetchResult }
192
+ ) => void | Promise<void>;
193
+ /** Return false to veto enqueuing a URL. */
194
+ shouldEnqueue: (
195
+ allow: boolean,
196
+ url: string,
197
+ referrer: string | null
198
+ ) => boolean | void | Promise<boolean | void>;
199
+ /** Reduce/replace the list of discovered links before they are enqueued. `$` is null for non-HTML. */
200
+ links: (
201
+ links: string[],
202
+ $: CheerioAPI | null,
203
+ entry: QueueEntry,
204
+ result: FetchResult
205
+ ) => string[] | void | Promise<string[] | void>;
206
+ /** Reduce/replace the extracted content. `$` is null for non-HTML bodies. */
207
+ extract: (
208
+ content: string,
209
+ $: CheerioAPI | null,
210
+ entry: QueueEntry,
211
+ result: FetchResult
212
+ ) => string | void | Promise<string | void>;
213
+ /** Reduce/replace the record before it is persisted and formatted. */
214
+ transform: (
215
+ record: CrawlRecord,
216
+ entry: QueueEntry,
217
+ result: FetchResult
218
+ ) => CrawlRecord | void | Promise<CrawlRecord | void>;
219
+ /** Fires after a record is persisted. */
220
+ page: (record: CrawlRecord, entry: QueueEntry, result: FetchResult) => void | Promise<void>;
221
+ /** Fires when a fetch/process fails (non-429). */
222
+ error: (error: Error, entry: QueueEntry) => void | Promise<void>;
223
+ }
224
+
225
+ export interface Crawler {
226
+ config: ResolvedConfig;
227
+ logger: Logger;
228
+ queue: QueueManager;
229
+ on<K extends keyof HookMap>(event: K, fn: HookMap[K]): () => void;
230
+ fetch(url: string): Promise<FetchResult>;
231
+ extract(html: string | CheerioAPI, url?: string | null): { url: string | null; content: string };
232
+ enqueue(
233
+ urls: string | string[],
234
+ opts?: { depth?: number; referrer?: string | null }
235
+ ): Promise<number>;
236
+ crawl(): Promise<void>;
237
+ format(records?: CrawlRecord[] | null): Promise<Map<string, CrawlRecord[]>>;
238
+ run(): Promise<QueueEntry[]>;
239
+ requeueErrors(): number;
240
+ requeueSkipped(): number;
241
+ stop(): void;
242
+ }
243
+
244
+ export function createCrawler(config?: ScraplyConfig): Crawler;
245
+ export function scraply(config?: ScraplyConfig): Promise<QueueEntry[]>;
246
+ export function runCrawlers(
247
+ items: Array<ScraplyConfig | Crawler>,
248
+ options?: { concurrency?: number }
249
+ ): Promise<QueueEntry[][]>;
250
+
251
+ export function loadConfig(config?: ScraplyConfig): ResolvedConfig;
252
+ export const DEFAULT_CONFIG: ScraplyConfig;
253
+ export function assertBrowserConfig(browser: BrowserConfig): void;
254
+ export const BROWSER_WAIT_UNTIL: readonly WaitUntil[];
255
+ export const BROWSER_BLOCKABLE_RESOURCES: readonly BlockableResource[];
256
+
257
+ export class RateLimitError extends Error {
258
+ name: 'RateLimitError';
259
+ code: number;
260
+ headers: Record<string, string>;
261
+ response: { status: 429; headers: Record<string, string> };
262
+ constructor(
263
+ message?: string,
264
+ options?: { code?: number; headers?: Record<string, string>; cause?: unknown }
265
+ );
266
+ }
267
+
268
+ export function normalizeUrl(url: string): string;
269
+ export function matchesPattern(value: string, pattern: UrlPattern): boolean;
270
+ export function matchesAnyPattern(value: string, patterns?: UrlPattern[]): boolean;
271
+ export function extractText(input: string | CheerioAPI, options?: ExtractConfig): string;
272
+ export function discoverLinks($: CheerioAPI, baseUrl: string): string[];
273
+ export function classifyContentType(contentType?: string): ContentKind;
274
+ export function parseJson(data: string | ArrayBuffer): { data: unknown; content: string };
275
+ export function parseSitemap(xml: string): { sitemaps: string[]; urls: string[] };
276
+ export function routeRecord(url: string, output: OutputConfig, formattedDir: string): string | null;
277
+ export function writeRecords(filePath: string, records: CrawlRecord[], format?: OutputFormat): void;
278
+ export function formatRecords(
279
+ records: CrawlRecord[],
280
+ options: { output: OutputConfig; formattedDir: string }
281
+ ): Map<string, CrawlRecord[]>;
282
+
283
+ export function resolveFetcher(deps: FetcherDeps): Fetcher;
284
+ export function createHttpFetcher(deps: { config: ResolvedConfig }): Fetcher;
285
+ export function createBrowserFetcher(deps: { config: ResolvedConfig; logger: Logger }): Fetcher;
package/src/index.js CHANGED
@@ -25,51 +25,82 @@
25
25
  * @property {number} maxPages - hard cap on successfully crawled pages (counts across resumes)
26
26
  * @property {boolean} resetOnComplete
27
27
  * @property {boolean} retryErrors - re-queue previously errored URLs on resume
28
+ * @property {boolean} retrySkipped - re-queue previously skipped URLs on resume
29
+ * @property {boolean|string[]} sitemap - seed from sitemap(s): true uses <origin>/sitemap.xml, or pass explicit URLs
28
30
  *
29
31
  * @typedef {Object} BrowserConfig
30
32
  * @property {'load'|'domcontentloaded'|'networkidle0'|'networkidle2'} waitUntil
31
33
  * @property {Array<'image'|'stylesheet'|'font'|'media'>} blockResources
32
34
  *
35
+ * @typedef {Object} ExtractConfig
36
+ * @property {string|string[]|null} [root] - allow-list container(s) to read text from; null = whole <body>
37
+ * @property {string} [rootFallback] - selector used when `root` matches nothing (default 'body')
38
+ * @property {boolean} [json] - parse JSON bodies into pretty content + record.data (default true)
39
+ * @property {string[]} [removeSelectors] - elements stripped before text extraction
40
+ *
41
+ * @typedef {Object} SiteConfig
42
+ * @property {string|RegExp|Array<string|RegExp>} match - URL prefix(es)/RegExp(s) this override applies to
43
+ * @property {string[]} [allowedContentTypes]
44
+ * @property {ExtractConfig} [extract]
45
+ *
33
46
  * @typedef {Object} OutputConfig
34
47
  * @property {'json'|'jsonl'|'lines'} format
35
48
  * @property {Array<string|RegExp>} exclude
36
49
  * @property {Record<string, Record<string, string>>} routes
37
50
  *
51
+ * List fields (`include`, `exclude`, `allowedContentTypes`, `extract.removeSelectors`,
52
+ * `output.exclude`) accept either an array (replaces the default) or a directive
53
+ * object `{ extend?, prepend?, append?, replace? }` to combine with Scraply's defaults.
54
+ * @template T
55
+ * @typedef {T[] | { extend?: T[], prepend?: T[], append?: T[], replace?: T[] }} ListInput
56
+ *
38
57
  * @typedef {Object} ScraplyConfig
39
58
  * @property {string[]} [startUrls]
40
- * @property {Array<string|RegExp>} [include]
41
- * @property {Array<string|RegExp>} [exclude]
42
- * @property {string[]} [allowedContentTypes]
59
+ * @property {ListInput<string|RegExp>} [include]
60
+ * @property {ListInput<string|RegExp>} [exclude]
61
+ * @property {ListInput<string>} [allowedContentTypes]
62
+ * @property {SiteConfig[]} [sites] - per-origin/route overrides for allowedContentTypes + extract
43
63
  * @property {'http'|'browser'|import('./fetchers/types.js').Fetcher} [fetcher]
44
64
  * @property {Partial<BrowserConfig>} [browser]
45
65
  * @property {'silent'|'error'|'warn'|'info'|'debug'} [logLevel]
66
+ * @property {boolean} [signals] - install SIGINT/SIGTERM handlers (default true)
46
67
  * @property {{ dir?: string }} [storage]
47
68
  * @property {Partial<RequestConfig>} [request]
48
69
  * @property {Partial<RetryConfig>} [retry]
49
70
  * @property {Partial<RateLimitConfig>} [rateLimit]
50
71
  * @property {Partial<CrawlConfig>} [crawl]
51
- * @property {{ removeSelectors?: string[] }} [extract]
52
- * @property {Partial<OutputConfig>} [output]
72
+ * @property {Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> }} [extract]
73
+ * @property {Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<string|RegExp> }} [output]
53
74
  *
54
75
  * @typedef {Required<ScraplyConfig> & {
76
+ * include: Array<string|RegExp>,
77
+ * exclude: Array<string|RegExp>,
78
+ * allowedContentTypes: string[],
79
+ * sites: Array<{ match: Array<string|RegExp>, allowedContentTypes?: string[], extract?: ExtractConfig }>,
55
80
  * browser: BrowserConfig,
81
+ * extract: ExtractConfig & { removeSelectors: string[] },
56
82
  * storage: { dir: string, queuePath: string, crawledDir: string, formattedDir: string }
57
83
  * }} ResolvedConfig
58
84
  */
59
85
 
60
86
  // Main entry points
61
- export { createCrawler, scraply } from './crawler.js';
87
+ export { createCrawler, scraply, runCrawlers } from './crawler.js';
62
88
 
63
89
  // Config
64
90
  export { loadConfig } from './config/load.js';
65
91
  export { DEFAULT_CONFIG } from './config/defaults.js';
66
92
  export { assertBrowserConfig, BROWSER_WAIT_UNTIL, BROWSER_BLOCKABLE_RESOURCES } from './config/browser.js';
67
93
 
94
+ // Errors
95
+ export { RateLimitError } from './core/errors.js';
96
+
68
97
  // Standalone building blocks (usable without a crawler instance)
69
98
  export { normalizeUrl } from './url/normalize.js';
70
99
  export { matchesPattern, matchesAnyPattern } from './url/patterns.js';
71
100
  export { extractText } from './extract/extract.js';
72
101
  export { discoverLinks } from './extract/links.js';
102
+ export { classifyContentType, parseJson } from './extract/parse.js';
103
+ export { parseSitemap } from './extract/sitemap.js';
73
104
  export { routeRecord } from './output/router.js';
74
105
  export { writeRecords, formatRecords } from './output/writers.js';
75
106
 
@@ -3,19 +3,28 @@ import { routeRecord } from './router.js';
3
3
 
4
4
  const sortByUrl = (records) => [...records].sort((a, b) => a.url.localeCompare(b.url));
5
5
 
6
+ // Internal bookkeeping fields kept on disk but omitted from formatted output.
7
+ const OUTPUT_OMIT = new Set(['hash', 'crawledAt', 'file']);
8
+
9
+ /** Projects a stored record to its public output shape (url, content, data, any transform fields). */
10
+ const toOutputRecord = (record) =>
11
+ Object.fromEntries(Object.entries(record).filter(([key]) => !OUTPUT_OMIT.has(key)));
12
+
6
13
  /**
7
14
  * Serializes records. Sorting by URL keeps output stable across runs so version
8
15
  * control does not show spurious diffs for unchanged data.
9
16
  *
10
- * @param {{ url: string, content: string }[]} records
17
+ * @param {Array<{ url: string, content: string }>} records
11
18
  * @param {'json'|'jsonl'|'lines'} format
12
19
  */
13
20
  const serialize = (records, format) => {
14
21
  const sorted = sortByUrl(records);
15
22
 
16
- if (format === 'jsonl') return `${sorted.map((record) => JSON.stringify(record)).join('\n')}\n`;
17
23
  if (format === 'lines') return sorted.map((record) => `${record.url} ${record.content}`).join('\n');
18
- return JSON.stringify(sorted, null, 2);
24
+
25
+ const projected = sorted.map(toOutputRecord);
26
+ if (format === 'jsonl') return `${projected.map((record) => JSON.stringify(record)).join('\n')}\n`;
27
+ return JSON.stringify(projected, null, 2);
19
28
  };
20
29
 
21
30
  /** Writes a single group of records to `filePath`. */
@@ -26,9 +35,9 @@ export const writeRecords = (filePath, records, format = 'json') => {
26
35
  /**
27
36
  * Groups records by routed output file and writes each group.
28
37
  *
29
- * @param {{ url: string, content: string }[]} records
38
+ * @param {Array<{ url: string, content: string }>} records
30
39
  * @param {{ output: import('../index.js').OutputConfig, formattedDir: string }} options
31
- * @returns {Map<string, { url: string, content: string }[]>} written file -> records
40
+ * @returns {Map<string, Array<{ url: string, content: string }>>} written file -> records
32
41
  */
33
42
  export const formatRecords = (records, { output, formattedDir }) => {
34
43
  const groups = new Map();