npm - scraply - Versions diffs - 2.0.0 → 2.0.2 - Mend

scraply 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/package.json +7 -3
package/readme.md +149 -55
package/src/config/browser.js +37 -0
package/src/config/defaults.js +47 -11
package/src/config/load.js +57 -1
package/src/core/errors.js +23 -0
package/src/core/queue.js +83 -11
package/src/core/retry.js +34 -26
package/src/crawler.js +265 -76
package/src/extract/extract.js +17 -3
package/src/extract/links.js +4 -4
package/src/extract/parse.js +35 -0
package/src/extract/sitemap.js +35 -0
package/src/fetchers/browserFetcher.js +18 -12
package/src/fetchers/httpFetcher.js +40 -3
package/src/index.d.ts +285 -0
package/src/index.js +48 -7
package/src/output/writers.js +14 -5

package/src/fetchers/browserFetcher.js CHANGED Viewed

@@ -1,18 +1,18 @@
 import { Cluster } from 'puppeteer-cluster';
-const BLOCKED_RESOURCES = new Set(['image', 'stylesheet', 'font', 'media']);
 /**
- * Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already
- * follows redirects and returns the final response, so no manual redirect
- * handling is needed.
+ * Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already follows redirects and returns the final response, so no manual redirect handling is needed. The `browser` config is validated once in `loadConfig`, so no re-validation is needed here.
  *
  * @param {import('./types.js').FetcherDeps} deps
  * @returns {import('./types.js').Fetcher}
  */
 export const createBrowserFetcher = ({ config, logger }) => {
-  const { request, crawl } = config;
+  const { request, crawl, browser } = config;
   const timeout = Math.max(request.timeout, 5000);
+  const { waitUntil, blockResources } = browser;
+  const blockedResources = new Set(blockResources);
   let cluster = null;
   const init = async () => {
@@ -31,13 +31,19 @@ export const createBrowserFetcher = ({ config, logger }) => {
     await cluster.task(async ({ page, data: url }) => {
       await page.setUserAgent(request.userAgent);
-      await page.setRequestInterception(true);
-      page.on('request', (req) => {
-        if (BLOCKED_RESOURCES.has(req.resourceType())) req.abort();
-        else req.continue();
-      });
+      if (Object.keys(request.headers).length > 0) {
+        await page.setExtraHTTPHeaders(request.headers);
+      }
+      if (blockedResources.size > 0) {
+        await page.setRequestInterception(true);
+        page.on('request', (req) => {
+          if (blockedResources.has(req.resourceType())) req.abort();
+          else req.continue();
+        });
+      }
-      const response = await page.goto(url, { timeout, waitUntil: 'domcontentloaded' });
+      const response = await page.goto(url, { timeout, waitUntil });
       const data = await page.content();
       return {

package/src/fetchers/httpFetcher.js CHANGED Viewed

@@ -3,9 +3,45 @@ const lowercaseHeaders = (headers) => Object.fromEntries(headers.entries());
 const httpError = (message, status, headers = {}) =>
   Object.assign(new Error(message), { response: { status, headers } });
+/**
+ * Reads a response body as text while enforcing a byte cap (`maxBytes <= 0`
+ * disables it). Rejects early on a declared `Content-Length`, and otherwise
+ * streams the body so an oversized chunked response is aborted instead of being
+ * buffered whole.
+ */
+const readBodyWithLimit = async (response, maxBytes, headers) => {
+  if (maxBytes > 0) {
+    const declared = Number(response.headers.get('content-length'));
+    if (Number.isFinite(declared) && declared > maxBytes) {
+      throw httpError(`Response too large: ${declared} bytes (max ${maxBytes})`, 413, headers);
+    }
+  }
+  if (maxBytes <= 0 || !response.body) return response.text();
+  const reader = response.body.getReader();
+  const chunks = [];
+  let total = 0;
+  for (;;) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    total += value.byteLength;
+    if (total > maxBytes) {
+      await reader.cancel();
+      throw httpError(`Response exceeded max size of ${maxBytes} bytes`, 413, headers);
+    }
+    chunks.push(Buffer.from(value));
+  }
+  return Buffer.concat(chunks).toString('utf8');
+};
 /**
  * Native-fetch based backend. Follows redirects manually so the redirect budget
- * is enforced, and times out via AbortController.
+ * is enforced, times out via AbortController, and caps the body at
+ * `request.maxContentLength`.
  *
  * @param {import('./types.js').FetcherDeps} deps
  * @returns {import('./types.js').Fetcher}
@@ -21,12 +57,13 @@ export const createHttpFetcher = ({ config }) => {
       const response = await fetch(url, {
         signal: controller.signal,
         redirect: 'manual',
-        headers: { 'User-Agent': request.userAgent }
+        headers: { 'User-Agent': request.userAgent, ...request.headers }
       });
       const headers = lowercaseHeaders(response.headers);
       if (response.status >= 300 && response.status < 400) {
+        await response.body?.cancel();
         const location = response.headers.get('location');
         if (!location) throw httpError('Redirect without location header', response.status, headers);
         if (redirectsLeft <= 0) throw httpError('Max redirects reached', response.status, headers);
@@ -35,7 +72,7 @@ export const createHttpFetcher = ({ config }) => {
       if (!response.ok) throw httpError(`Invalid status code: ${response.status}`, response.status, headers);
-      const data = await response.text();
+      const data = await readBodyWithLimit(response, request.maxContentLength, headers);
       return { data, status: response.status, headers };
     } catch (error) {
       if (error.name === 'AbortError') {

package/src/index.d.ts ADDED Viewed

@@ -0,0 +1,285 @@
+import type { CheerioAPI } from 'cheerio';
+export type LogLevel = 'silent' | 'error' | 'warn' | 'info' | 'debug';
+export type OutputFormat = 'json' | 'jsonl' | 'lines';
+export type ContentKind = 'html' | 'json' | 'text';
+export type UrlPattern = string | RegExp;
+export type WaitUntil = 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
+export type BlockableResource = 'image' | 'stylesheet' | 'font' | 'media';
+/**
+ * List fields accept a plain array (replaces the default) or a directive object
+ * that combines with Scraply's defaults.
+ */
+export type ListInput<T> = T[] | { extend?: T[]; append?: T[]; prepend?: T[]; replace?: T[] };
+export interface RequestConfig {
+  timeout: number;
+  maxRedirects: number;
+  /** Hard cap on the response body in bytes; 0 disables it. */
+  maxContentLength: number;
+  userAgent: string;
+  headers: Record<string, string>;
+}
+export interface RetryConfig {
+  max: number;
+  statusCodes: number[];
+  delay: number;
+}
+export interface RateLimitConfig {
+  fallbackDelay: number;
+  /** false: wait & retry. true: abort the crawl with a RateLimitError. */
+  exitOnLimit: boolean;
+  exitCode: number;
+}
+export interface CrawlConfig {
+  concurrency: number;
+  /** Minimum spacing (ms) between requests to the same host. */
+  delay: number;
+  maxDepth: number;
+  maxPages: number;
+  resetOnComplete: boolean;
+  retryErrors: boolean;
+  retrySkipped: boolean;
+  /** true seeds <origin>/sitemap.xml per start URL, or pass explicit sitemap URLs. */
+  sitemap: boolean | string[];
+}
+export interface BrowserConfig {
+  waitUntil: WaitUntil;
+  blockResources: BlockableResource[];
+}
+export interface ExtractConfig {
+  /** Allow-list container(s) to read text from; null = whole <body>. */
+  root?: string | string[] | null;
+  /** Selector used when `root` matches nothing (default 'body'). */
+  rootFallback?: string;
+  /** Parse JSON bodies into pretty content + record.data (default true). */
+  json?: boolean;
+  /** Elements stripped before text extraction. */
+  removeSelectors?: string[];
+}
+export interface OutputConfig {
+  format: OutputFormat;
+  exclude: UrlPattern[];
+  routes: Record<string, Record<string, string>>;
+}
+/** Per-origin/route override applied to URLs matching `match`. */
+export interface SiteConfig {
+  match: UrlPattern | UrlPattern[];
+  allowedContentTypes?: string[];
+  extract?: ExtractConfig;
+}
+export interface FetchResult {
+  data: string | ArrayBuffer;
+  status: number;
+  /** Header keys are lowercased. */
+  headers: Record<string, string>;
+}
+export interface Fetcher {
+  name: string;
+  fetch(url: string): Promise<FetchResult>;
+  init?(): Promise<void>;
+  close?(): Promise<void>;
+}
+export interface Logger {
+  level: LogLevel;
+  error(...args: unknown[]): void;
+  warn(...args: unknown[]): void;
+  info(...args: unknown[]): void;
+  debug(...args: unknown[]): void;
+}
+export interface FetcherDeps {
+  config: ResolvedConfig;
+  logger: Logger;
+}
+export interface QueueEntry {
+  url: string;
+  /** Filename of the saved crawled record (relative to crawledDir), or null. */
+  file: string | null;
+  status: number | null;
+  error: string | null;
+  skipped: string | null;
+  referrer: string | null;
+  depth: number;
+}
+/** A crawled record. `data` is present for JSON sources; transform hooks may add fields. */
+export interface CrawlRecord {
+  url: string;
+  content: string;
+  crawledAt: string;
+  hash: string;
+  data?: unknown;
+  [key: string]: unknown;
+}
+export interface QueueManager {
+  entries: QueueEntry[];
+  load(): QueueEntry[];
+  seed(urls: string[]): void;
+  add(url: string, opts?: { depth?: number; referrer?: string | null }): boolean;
+  claimNext(): QueueEntry | null;
+  requeueErrors(): number;
+  requeueSkipped(): number;
+  isAllProcessed(): boolean;
+  pendingCount(): number;
+  crawledCount(): number;
+  errorCount(): number;
+  skippedCount(): number;
+  flush(): void;
+  reset(): void;
+}
+export interface ScraplyConfig {
+  startUrls?: string[];
+  include?: ListInput<UrlPattern>;
+  exclude?: ListInput<UrlPattern>;
+  allowedContentTypes?: ListInput<string>;
+  sites?: SiteConfig[];
+  fetcher?: 'http' | 'browser' | Fetcher;
+  browser?: Partial<BrowserConfig>;
+  logLevel?: LogLevel;
+  /** Install SIGINT/SIGTERM handlers for a graceful stop (default true). */
+  signals?: boolean;
+  storage?: { dir?: string };
+  request?: Partial<RequestConfig>;
+  retry?: Partial<RetryConfig>;
+  rateLimit?: Partial<RateLimitConfig>;
+  crawl?: Partial<CrawlConfig>;
+  extract?: Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> };
+  output?: Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<UrlPattern> };
+}
+export interface ResolvedConfig {
+  startUrls: string[];
+  include: UrlPattern[];
+  exclude: UrlPattern[];
+  allowedContentTypes: string[];
+  sites: Array<{ match: UrlPattern[]; allowedContentTypes?: string[]; extract?: ExtractConfig }>;
+  fetcher: 'http' | 'browser' | Fetcher;
+  browser: BrowserConfig;
+  logLevel: LogLevel;
+  signals: boolean;
+  storage: { dir: string; queuePath: string; crawledDir: string; formattedDir: string };
+  request: RequestConfig;
+  retry: RetryConfig;
+  rateLimit: RateLimitConfig;
+  crawl: CrawlConfig;
+  extract: ExtractConfig & { removeSelectors: string[] };
+  output: OutputConfig;
+}
+/** Lifecycle hooks. Reduce hooks may return a replacement value; emit hooks are side-effect only. */
+export interface HookMap {
+  /** Fires right after a successful fetch, before the content-type gate. */
+  response: (result: FetchResult, entry: QueueEntry) => void | Promise<void>;
+  /** Fires when a response is skipped (e.g. disallowed content-type). */
+  skip: (
+    entry: QueueEntry,
+    info: { reason: string; status: number | null; result: FetchResult }
+  ) => void | Promise<void>;
+  /** Return false to veto enqueuing a URL. */
+  shouldEnqueue: (
+    allow: boolean,
+    url: string,
+    referrer: string | null
+  ) => boolean | void | Promise<boolean | void>;
+  /** Reduce/replace the list of discovered links before they are enqueued. `$` is null for non-HTML. */
+  links: (
+    links: string[],
+    $: CheerioAPI | null,
+    entry: QueueEntry,
+    result: FetchResult
+  ) => string[] | void | Promise<string[] | void>;
+  /** Reduce/replace the extracted content. `$` is null for non-HTML bodies. */
+  extract: (
+    content: string,
+    $: CheerioAPI | null,
+    entry: QueueEntry,
+    result: FetchResult
+  ) => string | void | Promise<string | void>;
+  /** Reduce/replace the record before it is persisted and formatted. */
+  transform: (
+    record: CrawlRecord,
+    entry: QueueEntry,
+    result: FetchResult
+  ) => CrawlRecord | void | Promise<CrawlRecord | void>;
+  /** Fires after a record is persisted. */
+  page: (record: CrawlRecord, entry: QueueEntry, result: FetchResult) => void | Promise<void>;
+  /** Fires when a fetch/process fails (non-429). */
+  error: (error: Error, entry: QueueEntry) => void | Promise<void>;
+}
+export interface Crawler {
+  config: ResolvedConfig;
+  logger: Logger;
+  queue: QueueManager;
+  on<K extends keyof HookMap>(event: K, fn: HookMap[K]): () => void;
+  fetch(url: string): Promise<FetchResult>;
+  extract(html: string | CheerioAPI, url?: string | null): { url: string | null; content: string };
+  enqueue(
+    urls: string | string[],
+    opts?: { depth?: number; referrer?: string | null }
+  ): Promise<number>;
+  crawl(): Promise<void>;
+  format(records?: CrawlRecord[] | null): Promise<Map<string, CrawlRecord[]>>;
+  run(): Promise<QueueEntry[]>;
+  requeueErrors(): number;
+  requeueSkipped(): number;
+  stop(): void;
+}
+export function createCrawler(config?: ScraplyConfig): Crawler;
+export function scraply(config?: ScraplyConfig): Promise<QueueEntry[]>;
+export function runCrawlers(
+  items: Array<ScraplyConfig | Crawler>,
+  options?: { concurrency?: number }
+): Promise<QueueEntry[][]>;
+export function loadConfig(config?: ScraplyConfig): ResolvedConfig;
+export const DEFAULT_CONFIG: ScraplyConfig;
+export function assertBrowserConfig(browser: BrowserConfig): void;
+export const BROWSER_WAIT_UNTIL: readonly WaitUntil[];
+export const BROWSER_BLOCKABLE_RESOURCES: readonly BlockableResource[];
+export class RateLimitError extends Error {
+  name: 'RateLimitError';
+  code: number;
+  headers: Record<string, string>;
+  response: { status: 429; headers: Record<string, string> };
+  constructor(
+    message?: string,
+    options?: { code?: number; headers?: Record<string, string>; cause?: unknown }
+  );
+}
+export function normalizeUrl(url: string): string;
+export function matchesPattern(value: string, pattern: UrlPattern): boolean;
+export function matchesAnyPattern(value: string, patterns?: UrlPattern[]): boolean;
+export function extractText(input: string | CheerioAPI, options?: ExtractConfig): string;
+export function discoverLinks($: CheerioAPI, baseUrl: string): string[];
+export function classifyContentType(contentType?: string): ContentKind;
+export function parseJson(data: string | ArrayBuffer): { data: unknown; content: string };
+export function parseSitemap(xml: string): { sitemaps: string[]; urls: string[] };
+export function routeRecord(url: string, output: OutputConfig, formattedDir: string): string | null;
+export function writeRecords(filePath: string, records: CrawlRecord[], format?: OutputFormat): void;
+export function formatRecords(
+  records: CrawlRecord[],
+  options: { output: OutputConfig; formattedDir: string }
+): Map<string, CrawlRecord[]>;
+export function resolveFetcher(deps: FetcherDeps): Fetcher;
+export function createHttpFetcher(deps: { config: ResolvedConfig }): Fetcher;
+export function createBrowserFetcher(deps: { config: ResolvedConfig; logger: Logger }): Fetcher;

package/src/index.js CHANGED Viewed

@@ -4,8 +4,9 @@
  * @typedef {Object} RequestConfig
  * @property {number} timeout
  * @property {number} maxRedirects
- * @property {number} maxContentLength
+ * @property {number} maxContentLength - hard cap on the response body in bytes; 0 disables it
  * @property {string} userAgent
+ * @property {Record<string, string>} headers - extra request headers sent by every fetcher
  *
  * @typedef {Object} RetryConfig
  * @property {number} max
@@ -21,45 +22,85 @@
  * @property {number} concurrency
  * @property {number} delay - minimum spacing (ms) between requests to the same host
  * @property {number} maxDepth
+ * @property {number} maxPages - hard cap on successfully crawled pages (counts across resumes)
  * @property {boolean} resetOnComplete
+ * @property {boolean} retryErrors - re-queue previously errored URLs on resume
+ * @property {boolean} retrySkipped - re-queue previously skipped URLs on resume
+ * @property {boolean|string[]} sitemap - seed from sitemap(s): true uses <origin>/sitemap.xml, or pass explicit URLs
+ *
+ * @typedef {Object} BrowserConfig
+ * @property {'load'|'domcontentloaded'|'networkidle0'|'networkidle2'} waitUntil
+ * @property {Array<'image'|'stylesheet'|'font'|'media'>} blockResources
+ *
+ * @typedef {Object} ExtractConfig
+ * @property {string|string[]|null} [root] - allow-list container(s) to read text from; null = whole <body>
+ * @property {string} [rootFallback] - selector used when `root` matches nothing (default 'body')
+ * @property {boolean} [json] - parse JSON bodies into pretty content + record.data (default true)
+ * @property {string[]} [removeSelectors] - elements stripped before text extraction
+ *
+ * @typedef {Object} SiteConfig
+ * @property {string|RegExp|Array<string|RegExp>} match - URL prefix(es)/RegExp(s) this override applies to
+ * @property {string[]} [allowedContentTypes]
+ * @property {ExtractConfig} [extract]
  *
  * @typedef {Object} OutputConfig
  * @property {'json'|'jsonl'|'lines'} format
  * @property {Array<string|RegExp>} exclude
  * @property {Record<string, Record<string, string>>} routes
  *
+ * List fields (`include`, `exclude`, `allowedContentTypes`, `extract.removeSelectors`,
+ * `output.exclude`) accept either an array (replaces the default) or a directive
+ * object `{ extend?, prepend?, append?, replace? }` to combine with Scraply's defaults.
+ * @template T
+ * @typedef {T[] | { extend?: T[], prepend?: T[], append?: T[], replace?: T[] }} ListInput
+ *
  * @typedef {Object} ScraplyConfig
  * @property {string[]} [startUrls]
- * @property {Array<string|RegExp>} [include]
- * @property {Array<string|RegExp>} [exclude]
- * @property {string[]} [allowedContentTypes]
+ * @property {ListInput<string|RegExp>} [include]
+ * @property {ListInput<string|RegExp>} [exclude]
+ * @property {ListInput<string>} [allowedContentTypes]
+ * @property {SiteConfig[]} [sites] - per-origin/route overrides for allowedContentTypes + extract
  * @property {'http'|'browser'|import('./fetchers/types.js').Fetcher} [fetcher]
+ * @property {Partial<BrowserConfig>} [browser]
  * @property {'silent'|'error'|'warn'|'info'|'debug'} [logLevel]
+ * @property {boolean} [signals] - install SIGINT/SIGTERM handlers (default true)
  * @property {{ dir?: string }} [storage]
  * @property {Partial<RequestConfig>} [request]
  * @property {Partial<RetryConfig>} [retry]
  * @property {Partial<RateLimitConfig>} [rateLimit]
  * @property {Partial<CrawlConfig>} [crawl]
- * @property {{ removeSelectors?: string[] }} [extract]
- * @property {Partial<OutputConfig>} [output]
+ * @property {Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> }} [extract]
+ * @property {Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<string|RegExp> }} [output]
  *
  * @typedef {Required<ScraplyConfig> & {
+ *   include: Array<string|RegExp>,
+ *   exclude: Array<string|RegExp>,
+ *   allowedContentTypes: string[],
+ *   sites: Array<{ match: Array<string|RegExp>, allowedContentTypes?: string[], extract?: ExtractConfig }>,
+ *   browser: BrowserConfig,
+ *   extract: ExtractConfig & { removeSelectors: string[] },
  *   storage: { dir: string, queuePath: string, crawledDir: string, formattedDir: string }
  * }} ResolvedConfig
  */
 // Main entry points
-export { createCrawler, scraply } from './crawler.js';
+export { createCrawler, scraply, runCrawlers } from './crawler.js';
 // Config
 export { loadConfig } from './config/load.js';
 export { DEFAULT_CONFIG } from './config/defaults.js';
+export { assertBrowserConfig, BROWSER_WAIT_UNTIL, BROWSER_BLOCKABLE_RESOURCES } from './config/browser.js';
+// Errors
+export { RateLimitError } from './core/errors.js';
 // Standalone building blocks (usable without a crawler instance)
 export { normalizeUrl } from './url/normalize.js';
 export { matchesPattern, matchesAnyPattern } from './url/patterns.js';
 export { extractText } from './extract/extract.js';
 export { discoverLinks } from './extract/links.js';
+export { classifyContentType, parseJson } from './extract/parse.js';
+export { parseSitemap } from './extract/sitemap.js';
 export { routeRecord } from './output/router.js';
 export { writeRecords, formatRecords } from './output/writers.js';

package/src/output/writers.js CHANGED Viewed

@@ -3,19 +3,28 @@ import { routeRecord } from './router.js';
 const sortByUrl = (records) => [...records].sort((a, b) => a.url.localeCompare(b.url));
+// Internal bookkeeping fields kept on disk but omitted from formatted output.
+const OUTPUT_OMIT = new Set(['hash', 'crawledAt', 'file']);
+/** Projects a stored record to its public output shape (url, content, data, any transform fields). */
+const toOutputRecord = (record) =>
+  Object.fromEntries(Object.entries(record).filter(([key]) => !OUTPUT_OMIT.has(key)));
 /**
  * Serializes records. Sorting by URL keeps output stable across runs so version
  * control does not show spurious diffs for unchanged data.
  *
- * @param {{ url: string, content: string }[]} records
+ * @param {Array<{ url: string, content: string }>} records
  * @param {'json'|'jsonl'|'lines'} format
  */
 const serialize = (records, format) => {
   const sorted = sortByUrl(records);
-  if (format === 'jsonl') return `${sorted.map((record) => JSON.stringify(record)).join('\n')}\n`;
   if (format === 'lines') return sorted.map((record) => `${record.url} ${record.content}`).join('\n');
-  return JSON.stringify(sorted, null, 2);
+  const projected = sorted.map(toOutputRecord);
+  if (format === 'jsonl') return `${projected.map((record) => JSON.stringify(record)).join('\n')}\n`;
+  return JSON.stringify(projected, null, 2);
 };
 /** Writes a single group of records to `filePath`. */
@@ -26,9 +35,9 @@ export const writeRecords = (filePath, records, format = 'json') => {
 /**
  * Groups records by routed output file and writes each group.
  *
- * @param {{ url: string, content: string }[]} records
+ * @param {Array<{ url: string, content: string }>} records
  * @param {{ output: import('../index.js').OutputConfig, formattedDir: string }} options
- * @returns {Map<string, { url: string, content: string }[]>} written file -> records
+ * @returns {Map<string, Array<{ url: string, content: string }>>} written file -> records
  */
 export const formatRecords = (records, { output, formattedDir }) => {
   const groups = new Map();