scraply 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,18 @@
1
1
  import { Cluster } from 'puppeteer-cluster';
2
2
 
3
- const BLOCKED_RESOURCES = new Set(['image', 'stylesheet', 'font', 'media']);
4
-
5
3
  /**
6
- * Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already
7
- * follows redirects and returns the final response, so no manual redirect
8
- * handling is needed.
4
+ * Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already follows redirects and returns the final response, so no manual redirect handling is needed. The `browser` config is validated once in `loadConfig`, so no re-validation is needed here.
9
5
  *
10
6
  * @param {import('./types.js').FetcherDeps} deps
11
7
  * @returns {import('./types.js').Fetcher}
12
8
  */
13
9
  export const createBrowserFetcher = ({ config, logger }) => {
14
- const { request, crawl } = config;
10
+ const { request, crawl, browser } = config;
15
11
  const timeout = Math.max(request.timeout, 5000);
12
+
13
+ const { waitUntil, blockResources } = browser;
14
+ const blockedResources = new Set(blockResources);
15
+
16
16
  let cluster = null;
17
17
 
18
18
  const init = async () => {
@@ -31,13 +31,19 @@ export const createBrowserFetcher = ({ config, logger }) => {
31
31
 
32
32
  await cluster.task(async ({ page, data: url }) => {
33
33
  await page.setUserAgent(request.userAgent);
34
- await page.setRequestInterception(true);
35
- page.on('request', (req) => {
36
- if (BLOCKED_RESOURCES.has(req.resourceType())) req.abort();
37
- else req.continue();
38
- });
34
+ if (Object.keys(request.headers).length > 0) {
35
+ await page.setExtraHTTPHeaders(request.headers);
36
+ }
37
+
38
+ if (blockedResources.size > 0) {
39
+ await page.setRequestInterception(true);
40
+ page.on('request', (req) => {
41
+ if (blockedResources.has(req.resourceType())) req.abort();
42
+ else req.continue();
43
+ });
44
+ }
39
45
 
40
- const response = await page.goto(url, { timeout, waitUntil: 'domcontentloaded' });
46
+ const response = await page.goto(url, { timeout, waitUntil });
41
47
  const data = await page.content();
42
48
 
43
49
  return {
@@ -3,9 +3,45 @@ const lowercaseHeaders = (headers) => Object.fromEntries(headers.entries());
3
3
  const httpError = (message, status, headers = {}) =>
4
4
  Object.assign(new Error(message), { response: { status, headers } });
5
5
 
6
+ /**
7
+ * Reads a response body as text while enforcing a byte cap (`maxBytes <= 0`
8
+ * disables it). Rejects early when the declared `Content-Length` exceeds the cap, and otherwise
9
+ * streams the body so an oversized chunked response is aborted instead of being
10
+ * buffered whole.
11
+ */
12
+ const readBodyWithLimit = async (response, maxBytes, headers) => {
13
+ if (maxBytes > 0) {
14
+ const declared = Number(response.headers.get('content-length'));
15
+ if (Number.isFinite(declared) && declared > maxBytes) {
16
+ throw httpError(`Response too large: ${declared} bytes (max ${maxBytes})`, 413, headers);
17
+ }
18
+ }
19
+
20
+ if (maxBytes <= 0 || !response.body) return response.text();
21
+
22
+ const reader = response.body.getReader();
23
+ const chunks = [];
24
+ let total = 0;
25
+
26
+ for (;;) {
27
+ const { done, value } = await reader.read();
28
+ if (done) break;
29
+
30
+ total += value.byteLength;
31
+ if (total > maxBytes) {
32
+ await reader.cancel();
33
+ throw httpError(`Response exceeded max size of ${maxBytes} bytes`, 413, headers);
34
+ }
35
+ chunks.push(Buffer.from(value));
36
+ }
37
+
38
+ return Buffer.concat(chunks).toString('utf8');
39
+ };
40
+
6
41
  /**
7
42
  * Native-fetch based backend. Follows redirects manually so the redirect budget
8
- * is enforced, and times out via AbortController.
43
+ * is enforced, times out via AbortController, and caps the body at
44
+ * `request.maxContentLength`.
9
45
  *
10
46
  * @param {import('./types.js').FetcherDeps} deps
11
47
  * @returns {import('./types.js').Fetcher}
@@ -21,12 +57,13 @@ export const createHttpFetcher = ({ config }) => {
21
57
  const response = await fetch(url, {
22
58
  signal: controller.signal,
23
59
  redirect: 'manual',
24
- headers: { 'User-Agent': request.userAgent }
60
+ headers: { 'User-Agent': request.userAgent, ...request.headers }
25
61
  });
26
62
 
27
63
  const headers = lowercaseHeaders(response.headers);
28
64
 
29
65
  if (response.status >= 300 && response.status < 400) {
66
+ await response.body?.cancel();
30
67
  const location = response.headers.get('location');
31
68
  if (!location) throw httpError('Redirect without location header', response.status, headers);
32
69
  if (redirectsLeft <= 0) throw httpError('Max redirects reached', response.status, headers);
@@ -35,7 +72,7 @@ export const createHttpFetcher = ({ config }) => {
35
72
 
36
73
  if (!response.ok) throw httpError(`Invalid status code: ${response.status}`, response.status, headers);
37
74
 
38
- const data = await response.text();
75
+ const data = await readBodyWithLimit(response, request.maxContentLength, headers);
39
76
  return { data, status: response.status, headers };
40
77
  } catch (error) {
41
78
  if (error.name === 'AbortError') {
package/src/index.d.ts ADDED
@@ -0,0 +1,285 @@
1
+ import type { CheerioAPI } from 'cheerio';
2
+
3
+ export type LogLevel = 'silent' | 'error' | 'warn' | 'info' | 'debug';
4
+ export type OutputFormat = 'json' | 'jsonl' | 'lines';
5
+ export type ContentKind = 'html' | 'json' | 'text';
6
+ export type UrlPattern = string | RegExp;
7
+ export type WaitUntil = 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
8
+ export type BlockableResource = 'image' | 'stylesheet' | 'font' | 'media';
9
+
10
+ /**
11
+ * List fields accept a plain array (replaces the default) or a directive object
12
+ * that combines with Scraply's defaults.
13
+ */
14
+ export type ListInput<T> = T[] | { extend?: T[]; append?: T[]; prepend?: T[]; replace?: T[] };
15
+
16
+ export interface RequestConfig {
17
+ timeout: number;
18
+ maxRedirects: number;
19
+ /** Hard cap on the response body in bytes; 0 disables it. */
20
+ maxContentLength: number;
21
+ userAgent: string;
22
+ headers: Record<string, string>;
23
+ }
24
+
25
+ export interface RetryConfig {
26
+ max: number;
27
+ statusCodes: number[];
28
+ delay: number;
29
+ }
30
+
31
+ export interface RateLimitConfig {
32
+ fallbackDelay: number;
33
+ /** false: wait & retry. true: abort the crawl with a RateLimitError. */
34
+ exitOnLimit: boolean;
35
+ exitCode: number;
36
+ }
37
+
38
+ export interface CrawlConfig {
39
+ concurrency: number;
40
+ /** Minimum spacing (ms) between requests to the same host. */
41
+ delay: number;
42
+ maxDepth: number;
43
+ maxPages: number;
44
+ resetOnComplete: boolean;
45
+ retryErrors: boolean;
46
+ retrySkipped: boolean;
47
+ /** true seeds <origin>/sitemap.xml per start URL, or pass explicit sitemap URLs. */
48
+ sitemap: boolean | string[];
49
+ }
50
+
51
+ export interface BrowserConfig {
52
+ waitUntil: WaitUntil;
53
+ blockResources: BlockableResource[];
54
+ }
55
+
56
+ export interface ExtractConfig {
57
+ /** Allow-list container(s) to read text from; null = whole <body>. */
58
+ root?: string | string[] | null;
59
+ /** Selector used when `root` matches nothing (default 'body'). */
60
+ rootFallback?: string;
61
+ /** Parse JSON bodies into pretty content + record.data (default true). */
62
+ json?: boolean;
63
+ /** Elements stripped before text extraction. */
64
+ removeSelectors?: string[];
65
+ }
66
+
67
+ export interface OutputConfig {
68
+ format: OutputFormat;
69
+ exclude: UrlPattern[];
70
+ routes: Record<string, Record<string, string>>;
71
+ }
72
+
73
+ /** Per-origin/route override applied to URLs matching `match`. */
74
+ export interface SiteConfig {
75
+ match: UrlPattern | UrlPattern[];
76
+ allowedContentTypes?: string[];
77
+ extract?: ExtractConfig;
78
+ }
79
+
80
+ export interface FetchResult {
81
+ data: string | ArrayBuffer;
82
+ status: number;
83
+ /** Header keys are lowercased. */
84
+ headers: Record<string, string>;
85
+ }
86
+
87
+ export interface Fetcher {
88
+ name: string;
89
+ fetch(url: string): Promise<FetchResult>;
90
+ init?(): Promise<void>;
91
+ close?(): Promise<void>;
92
+ }
93
+
94
+ export interface Logger {
95
+ level: LogLevel;
96
+ error(...args: unknown[]): void;
97
+ warn(...args: unknown[]): void;
98
+ info(...args: unknown[]): void;
99
+ debug(...args: unknown[]): void;
100
+ }
101
+
102
+ export interface FetcherDeps {
103
+ config: ResolvedConfig;
104
+ logger: Logger;
105
+ }
106
+
107
+ export interface QueueEntry {
108
+ url: string;
109
+ /** Filename of the saved crawled record (relative to crawledDir), or null. */
110
+ file: string | null;
111
+ status: number | null;
112
+ error: string | null;
113
+ skipped: string | null;
114
+ referrer: string | null;
115
+ depth: number;
116
+ }
117
+
118
+ /** A crawled record. `data` is present for JSON sources; transform hooks may add fields. */
119
+ export interface CrawlRecord {
120
+ url: string;
121
+ content: string;
122
+ crawledAt: string;
123
+ hash: string;
124
+ data?: unknown;
125
+ [key: string]: unknown;
126
+ }
127
+
128
+ export interface QueueManager {
129
+ entries: QueueEntry[];
130
+ load(): QueueEntry[];
131
+ seed(urls: string[]): void;
132
+ add(url: string, opts?: { depth?: number; referrer?: string | null }): boolean;
133
+ claimNext(): QueueEntry | null;
134
+ requeueErrors(): number;
135
+ requeueSkipped(): number;
136
+ isAllProcessed(): boolean;
137
+ pendingCount(): number;
138
+ crawledCount(): number;
139
+ errorCount(): number;
140
+ skippedCount(): number;
141
+ flush(): void;
142
+ reset(): void;
143
+ }
144
+
145
+ export interface ScraplyConfig {
146
+ startUrls?: string[];
147
+ include?: ListInput<UrlPattern>;
148
+ exclude?: ListInput<UrlPattern>;
149
+ allowedContentTypes?: ListInput<string>;
150
+ sites?: SiteConfig[];
151
+ fetcher?: 'http' | 'browser' | Fetcher;
152
+ browser?: Partial<BrowserConfig>;
153
+ logLevel?: LogLevel;
154
+ /** Install SIGINT/SIGTERM handlers for a graceful stop (default true). */
155
+ signals?: boolean;
156
+ storage?: { dir?: string };
157
+ request?: Partial<RequestConfig>;
158
+ retry?: Partial<RetryConfig>;
159
+ rateLimit?: Partial<RateLimitConfig>;
160
+ crawl?: Partial<CrawlConfig>;
161
+ extract?: Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> };
162
+ output?: Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<UrlPattern> };
163
+ }
164
+
165
+ export interface ResolvedConfig {
166
+ startUrls: string[];
167
+ include: UrlPattern[];
168
+ exclude: UrlPattern[];
169
+ allowedContentTypes: string[];
170
+ sites: Array<{ match: UrlPattern[]; allowedContentTypes?: string[]; extract?: ExtractConfig }>;
171
+ fetcher: 'http' | 'browser' | Fetcher;
172
+ browser: BrowserConfig;
173
+ logLevel: LogLevel;
174
+ signals: boolean;
175
+ storage: { dir: string; queuePath: string; crawledDir: string; formattedDir: string };
176
+ request: RequestConfig;
177
+ retry: RetryConfig;
178
+ rateLimit: RateLimitConfig;
179
+ crawl: CrawlConfig;
180
+ extract: ExtractConfig & { removeSelectors: string[] };
181
+ output: OutputConfig;
182
+ }
183
+
184
+ /** Lifecycle hooks. Reduce hooks may return a replacement value; emit hooks are side-effect only. */
185
+ export interface HookMap {
186
+ /** Fires right after a successful fetch, before the content-type gate. */
187
+ response: (result: FetchResult, entry: QueueEntry) => void | Promise<void>;
188
+ /** Fires when a response is skipped (e.g. disallowed content-type). */
189
+ skip: (
190
+ entry: QueueEntry,
191
+ info: { reason: string; status: number | null; result: FetchResult }
192
+ ) => void | Promise<void>;
193
+ /** Return false to veto enqueuing a URL. */
194
+ shouldEnqueue: (
195
+ allow: boolean,
196
+ url: string,
197
+ referrer: string | null
198
+ ) => boolean | void | Promise<boolean | void>;
199
+ /** Reduce/replace the list of discovered links before they are enqueued. `$` is null for non-HTML. */
200
+ links: (
201
+ links: string[],
202
+ $: CheerioAPI | null,
203
+ entry: QueueEntry,
204
+ result: FetchResult
205
+ ) => string[] | void | Promise<string[] | void>;
206
+ /** Reduce/replace the extracted content. `$` is null for non-HTML bodies. */
207
+ extract: (
208
+ content: string,
209
+ $: CheerioAPI | null,
210
+ entry: QueueEntry,
211
+ result: FetchResult
212
+ ) => string | void | Promise<string | void>;
213
+ /** Reduce/replace the record before it is persisted and formatted. */
214
+ transform: (
215
+ record: CrawlRecord,
216
+ entry: QueueEntry,
217
+ result: FetchResult
218
+ ) => CrawlRecord | void | Promise<CrawlRecord | void>;
219
+ /** Fires after a record is persisted. */
220
+ page: (record: CrawlRecord, entry: QueueEntry, result: FetchResult) => void | Promise<void>;
221
+ /** Fires when a fetch/process fails (non-429). */
222
+ error: (error: Error, entry: QueueEntry) => void | Promise<void>;
223
+ }
224
+
225
+ export interface Crawler {
226
+ config: ResolvedConfig;
227
+ logger: Logger;
228
+ queue: QueueManager;
229
+ on<K extends keyof HookMap>(event: K, fn: HookMap[K]): () => void;
230
+ fetch(url: string): Promise<FetchResult>;
231
+ extract(html: string | CheerioAPI, url?: string | null): { url: string | null; content: string };
232
+ enqueue(
233
+ urls: string | string[],
234
+ opts?: { depth?: number; referrer?: string | null }
235
+ ): Promise<number>;
236
+ crawl(): Promise<void>;
237
+ format(records?: CrawlRecord[] | null): Promise<Map<string, CrawlRecord[]>>;
238
+ run(): Promise<QueueEntry[]>;
239
+ requeueErrors(): number;
240
+ requeueSkipped(): number;
241
+ stop(): void;
242
+ }
243
+
244
+ export function createCrawler(config?: ScraplyConfig): Crawler;
245
+ export function scraply(config?: ScraplyConfig): Promise<QueueEntry[]>;
246
+ export function runCrawlers(
247
+ items: Array<ScraplyConfig | Crawler>,
248
+ options?: { concurrency?: number }
249
+ ): Promise<QueueEntry[][]>;
250
+
251
+ export function loadConfig(config?: ScraplyConfig): ResolvedConfig;
252
+ export const DEFAULT_CONFIG: ScraplyConfig;
253
+ export function assertBrowserConfig(browser: BrowserConfig): void;
254
+ export const BROWSER_WAIT_UNTIL: readonly WaitUntil[];
255
+ export const BROWSER_BLOCKABLE_RESOURCES: readonly BlockableResource[];
256
+
257
+ export class RateLimitError extends Error {
258
+ name: 'RateLimitError';
259
+ code: number;
260
+ headers: Record<string, string>;
261
+ response: { status: 429; headers: Record<string, string> };
262
+ constructor(
263
+ message?: string,
264
+ options?: { code?: number; headers?: Record<string, string>; cause?: unknown }
265
+ );
266
+ }
267
+
268
+ export function normalizeUrl(url: string): string;
269
+ export function matchesPattern(value: string, pattern: UrlPattern): boolean;
270
+ export function matchesAnyPattern(value: string, patterns?: UrlPattern[]): boolean;
271
+ export function extractText(input: string | CheerioAPI, options?: ExtractConfig): string;
272
+ export function discoverLinks($: CheerioAPI, baseUrl: string): string[];
273
+ export function classifyContentType(contentType?: string): ContentKind;
274
+ export function parseJson(data: string | ArrayBuffer): { data: unknown; content: string };
275
+ export function parseSitemap(xml: string): { sitemaps: string[]; urls: string[] };
276
+ export function routeRecord(url: string, output: OutputConfig, formattedDir: string): string | null;
277
+ export function writeRecords(filePath: string, records: CrawlRecord[], format?: OutputFormat): void;
278
+ export function formatRecords(
279
+ records: CrawlRecord[],
280
+ options: { output: OutputConfig; formattedDir: string }
281
+ ): Map<string, CrawlRecord[]>;
282
+
283
+ export function resolveFetcher(deps: FetcherDeps): Fetcher;
284
+ export function createHttpFetcher(deps: { config: ResolvedConfig }): Fetcher;
285
+ export function createBrowserFetcher(deps: { config: ResolvedConfig; logger: Logger }): Fetcher;
package/src/index.js CHANGED
@@ -4,8 +4,9 @@
4
4
  * @typedef {Object} RequestConfig
5
5
  * @property {number} timeout
6
6
  * @property {number} maxRedirects
7
- * @property {number} maxContentLength
7
+ * @property {number} maxContentLength - hard cap on the response body in bytes; 0 disables it
8
8
  * @property {string} userAgent
9
+ * @property {Record<string, string>} headers - extra request headers sent by every fetcher
9
10
  *
10
11
  * @typedef {Object} RetryConfig
11
12
  * @property {number} max
@@ -21,45 +22,85 @@
21
22
  * @property {number} concurrency
22
23
  * @property {number} delay - minimum spacing (ms) between requests to the same host
23
24
  * @property {number} maxDepth
25
+ * @property {number} maxPages - hard cap on successfully crawled pages (counts across resumes)
24
26
  * @property {boolean} resetOnComplete
27
+ * @property {boolean} retryErrors - re-queue previously errored URLs on resume
28
+ * @property {boolean} retrySkipped - re-queue previously skipped URLs on resume
29
+ * @property {boolean|string[]} sitemap - seed from sitemap(s): true uses <origin>/sitemap.xml, or pass explicit URLs
30
+ *
31
+ * @typedef {Object} BrowserConfig
32
+ * @property {'load'|'domcontentloaded'|'networkidle0'|'networkidle2'} waitUntil
33
+ * @property {Array<'image'|'stylesheet'|'font'|'media'>} blockResources
34
+ *
35
+ * @typedef {Object} ExtractConfig
36
+ * @property {string|string[]|null} [root] - allow-list container(s) to read text from; null = whole <body>
37
+ * @property {string} [rootFallback] - selector used when `root` matches nothing (default 'body')
38
+ * @property {boolean} [json] - parse JSON bodies into pretty content + record.data (default true)
39
+ * @property {string[]} [removeSelectors] - elements stripped before text extraction
40
+ *
41
+ * @typedef {Object} SiteConfig
42
+ * @property {string|RegExp|Array<string|RegExp>} match - URL prefix(es)/RegExp(s) this override applies to
43
+ * @property {string[]} [allowedContentTypes]
44
+ * @property {ExtractConfig} [extract]
25
45
  *
26
46
  * @typedef {Object} OutputConfig
27
47
  * @property {'json'|'jsonl'|'lines'} format
28
48
  * @property {Array<string|RegExp>} exclude
29
49
  * @property {Record<string, Record<string, string>>} routes
30
50
  *
51
+ * List fields (`include`, `exclude`, `allowedContentTypes`, `extract.removeSelectors`,
52
+ * `output.exclude`) accept either an array (replaces the default) or a directive
53
+ * object `{ extend?, prepend?, append?, replace? }` to combine with Scraply's defaults.
54
+ * @template T
55
+ * @typedef {T[] | { extend?: T[], prepend?: T[], append?: T[], replace?: T[] }} ListInput
56
+ *
31
57
  * @typedef {Object} ScraplyConfig
32
58
  * @property {string[]} [startUrls]
33
- * @property {Array<string|RegExp>} [include]
34
- * @property {Array<string|RegExp>} [exclude]
35
- * @property {string[]} [allowedContentTypes]
59
+ * @property {ListInput<string|RegExp>} [include]
60
+ * @property {ListInput<string|RegExp>} [exclude]
61
+ * @property {ListInput<string>} [allowedContentTypes]
62
+ * @property {SiteConfig[]} [sites] - per-origin/route overrides for allowedContentTypes + extract
36
63
  * @property {'http'|'browser'|import('./fetchers/types.js').Fetcher} [fetcher]
64
+ * @property {Partial<BrowserConfig>} [browser]
37
65
  * @property {'silent'|'error'|'warn'|'info'|'debug'} [logLevel]
66
+ * @property {boolean} [signals] - install SIGINT/SIGTERM handlers (default true)
38
67
  * @property {{ dir?: string }} [storage]
39
68
  * @property {Partial<RequestConfig>} [request]
40
69
  * @property {Partial<RetryConfig>} [retry]
41
70
  * @property {Partial<RateLimitConfig>} [rateLimit]
42
71
  * @property {Partial<CrawlConfig>} [crawl]
43
- * @property {{ removeSelectors?: string[] }} [extract]
44
- * @property {Partial<OutputConfig>} [output]
72
+ * @property {Omit<ExtractConfig, 'removeSelectors'> & { removeSelectors?: ListInput<string> }} [extract]
73
+ * @property {Partial<Omit<OutputConfig, 'exclude'>> & { exclude?: ListInput<string|RegExp> }} [output]
45
74
  *
46
75
  * @typedef {Required<ScraplyConfig> & {
76
+ * include: Array<string|RegExp>,
77
+ * exclude: Array<string|RegExp>,
78
+ * allowedContentTypes: string[],
79
+ * sites: Array<{ match: Array<string|RegExp>, allowedContentTypes?: string[], extract?: ExtractConfig }>,
80
+ * browser: BrowserConfig,
81
+ * extract: ExtractConfig & { removeSelectors: string[] },
47
82
  * storage: { dir: string, queuePath: string, crawledDir: string, formattedDir: string }
48
83
  * }} ResolvedConfig
49
84
  */
50
85
 
51
86
  // Main entry points
52
- export { createCrawler, scraply } from './crawler.js';
87
+ export { createCrawler, scraply, runCrawlers } from './crawler.js';
53
88
 
54
89
  // Config
55
90
  export { loadConfig } from './config/load.js';
56
91
  export { DEFAULT_CONFIG } from './config/defaults.js';
92
+ export { assertBrowserConfig, BROWSER_WAIT_UNTIL, BROWSER_BLOCKABLE_RESOURCES } from './config/browser.js';
93
+
94
+ // Errors
95
+ export { RateLimitError } from './core/errors.js';
57
96
 
58
97
  // Standalone building blocks (usable without a crawler instance)
59
98
  export { normalizeUrl } from './url/normalize.js';
60
99
  export { matchesPattern, matchesAnyPattern } from './url/patterns.js';
61
100
  export { extractText } from './extract/extract.js';
62
101
  export { discoverLinks } from './extract/links.js';
102
+ export { classifyContentType, parseJson } from './extract/parse.js';
103
+ export { parseSitemap } from './extract/sitemap.js';
63
104
  export { routeRecord } from './output/router.js';
64
105
  export { writeRecords, formatRecords } from './output/writers.js';
65
106
 
@@ -3,19 +3,28 @@ import { routeRecord } from './router.js';
3
3
 
4
4
  const sortByUrl = (records) => [...records].sort((a, b) => a.url.localeCompare(b.url));
5
5
 
6
+ // Internal bookkeeping fields kept on disk but omitted from formatted output.
7
+ const OUTPUT_OMIT = new Set(['hash', 'crawledAt', 'file']);
8
+
9
+ /** Projects a stored record to its public output shape (url, content, data, any transform fields). */
10
+ const toOutputRecord = (record) =>
11
+ Object.fromEntries(Object.entries(record).filter(([key]) => !OUTPUT_OMIT.has(key)));
12
+
6
13
  /**
7
14
  * Serializes records. Sorting by URL keeps output stable across runs so version
8
15
  * control does not show spurious diffs for unchanged data.
9
16
  *
10
- * @param {{ url: string, content: string }[]} records
17
+ * @param {Array<{ url: string, content: string }>} records
11
18
  * @param {'json'|'jsonl'|'lines'} format
12
19
  */
13
20
  const serialize = (records, format) => {
14
21
  const sorted = sortByUrl(records);
15
22
 
16
- if (format === 'jsonl') return `${sorted.map((record) => JSON.stringify(record)).join('\n')}\n`;
17
23
  if (format === 'lines') return sorted.map((record) => `${record.url} ${record.content}`).join('\n');
18
- return JSON.stringify(sorted, null, 2);
24
+
25
+ const projected = sorted.map(toOutputRecord);
26
+ if (format === 'jsonl') return `${projected.map((record) => JSON.stringify(record)).join('\n')}\n`;
27
+ return JSON.stringify(projected, null, 2);
19
28
  };
20
29
 
21
30
  /** Writes a single group of records to `filePath`. */
@@ -26,9 +35,9 @@ export const writeRecords = (filePath, records, format = 'json') => {
26
35
  /**
27
36
  * Groups records by routed output file and writes each group.
28
37
  *
29
- * @param {{ url: string, content: string }[]} records
38
+ * @param {Array<{ url: string, content: string }>} records
30
39
  * @param {{ output: import('../index.js').OutputConfig, formattedDir: string }} options
31
- * @returns {Map<string, { url: string, content: string }[]>} written file -> records
40
+ * @returns {Map<string, Array<{ url: string, content: string }>>} written file -> records
32
41
  */
33
42
  export const formatRecords = (records, { output, formattedDir }) => {
34
43
  const groups = new Map();