@ebowwa/markdown-docs-scraper 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -85,9 +85,16 @@ export declare class MarkdownDocsScraper {
     */
    scrape(): Promise<ScraperResult>;
    /**
-     * Save scraped pages to disk
+     * Extract body content from a file (strips header comment)
     */
-    savePages(pages: DocPage[]): Promise<void>;
+    private extractBody;
+    /**
+     * Save scraped pages to disk (only writes if content changed)
+     */
+    savePages(pages: DocPage[]): Promise<{
+        updated: number;
+        skipped: number;
+    }>;
    /**
     * Get list of pages to scrape based on categories
     */
@@ -98,7 +105,12 @@ export declare class MarkdownDocsScraper {
  */
 export declare function scrapeMarkdownDocs(options: ScraperOptions & {
     useLlms?: boolean;
-}): Promise<ScraperResult>;
+}): Promise<ScraperResult & {
+    saveStats?: {
+        updated: number;
+        skipped: number;
+    };
+}>;
 /** Pattern for Claude Code docs: /docs/en/page.md */
 export declare const CLAUDE_CODE_PATTERN: RegExp;
 /** Pattern for generic docs: any domain/path.md */
package/dist/index.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBhD;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;IAAE,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,CAAC,CAWxB;AAMD,qDAAqD;AACrD,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACH,OAAO,CAAC,WAAW;IAMnB;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC;IAyChF;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;IAAE,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,GAAG;IAAE,SAAS,CAAC,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC,CAe/E;AAMD,qDAAqD;AACrD,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}
package/dist/index.js CHANGED
@@ -382,14 +382,28 @@ class MarkdownDocsScraper {
     console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
     return { downloaded, failed, duration };
   }
+  extractBody(content) {
+    const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
+    return content.replace(headerRegex, "");
+  }
   async savePages(pages) {
     const fs = await import("fs/promises");
     const path = await import("path");
+    let updated = 0;
+    let skipped = 0;
     for (const page of pages) {
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
       const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
       await fs.mkdir(dir, { recursive: true });
       const filepath = path.join(dir, `${nameToUse}.md`);
+      try {
+        const existingContent = await fs.readFile(filepath, "utf-8");
+        const existingBody = this.extractBody(existingContent);
+        if (existingBody === page.content) {
+          skipped++;
+          continue;
+        }
+      } catch {}
       const header = `<!--
 Source: ${page.url}
 Downloaded: ${new Date().toISOString()}
@@ -397,7 +411,9 @@ Downloaded: ${new Date().toISOString()}
 
 `;
       await fs.writeFile(filepath, header + page.content, "utf-8");
+      updated++;
     }
+    return { updated, skipped };
   }
   getPagesToScrape() {
     const pages = [];
@@ -412,10 +428,14 @@ Downloaded: ${new Date().toISOString()}
 async function scrapeMarkdownDocs(options) {
   const scraper = new MarkdownDocsScraper(options);
   const result = options.useLlms ? await scraper.scrapeFromLlms() : await scraper.scrape();
+  let saveStats;
   if (options.outputDir) {
-    await scraper.savePages(result.downloaded);
+    saveStats = await scraper.savePages(result.downloaded);
+    if (saveStats.updated > 0 || saveStats.skipped > 0) {
+      console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
+    }
   }
-  return result;
+  return { ...result, saveStats };
 }
 var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
 var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
@@ -226,14 +226,28 @@ class MarkdownDocsScraper {
     console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
     return { downloaded, failed, duration };
   }
+  extractBody(content) {
+    const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
+    return content.replace(headerRegex, "");
+  }
   async savePages(pages) {
     const fs = await import("fs/promises");
     const path = await import("path");
+    let updated = 0;
+    let skipped = 0;
     for (const page of pages) {
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
       const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
       await fs.mkdir(dir, { recursive: true });
       const filepath = path.join(dir, `${nameToUse}.md`);
+      try {
+        const existingContent = await fs.readFile(filepath, "utf-8");
+        const existingBody = this.extractBody(existingContent);
+        if (existingBody === page.content) {
+          skipped++;
+          continue;
+        }
+      } catch {}
       const header = `<!--
 Source: ${page.url}
 Downloaded: ${new Date().toISOString()}
@@ -241,7 +255,9 @@ Downloaded: ${new Date().toISOString()}
 
 `;
       await fs.writeFile(filepath, header + page.content, "utf-8");
+      updated++;
     }
+    return { updated, skipped };
   }
   getPagesToScrape() {
     const pages = [];
@@ -256,10 +272,14 @@ Downloaded: ${new Date().toISOString()}
 async function scrapeMarkdownDocs(options) {
   const scraper = new MarkdownDocsScraper(options);
   const result = options.useLlms ? await scraper.scrapeFromLlms() : await scraper.scrape();
+  let saveStats;
   if (options.outputDir) {
-    await scraper.savePages(result.downloaded);
+    saveStats = await scraper.savePages(result.downloaded);
+    if (saveStats.updated > 0 || saveStats.skipped > 0) {
+      console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
+    }
   }
-  return result;
+  return { ...result, saveStats };
 }
 
 // src/scrapers/llms-txt.ts
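The skip logic in both bundles hinges on `extractBody` stripping exactly the header that `savePages` writes: the `Downloaded:` timestamp changes on every run, so comparing raw files would never match, but the stripped bodies do. A standalone sketch of that round-trip (the URL and body are hypothetical):

```ts
// Same header shape and regex as the bundled savePages/extractBody pair.
const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;

const body = "# Page\n\nHello.";
const url = "https://example.com/docs/page.md"; // hypothetical
const onDisk = `<!--\nSource: ${url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n${body}`;

// Stripping the header recovers the body exactly, so an unchanged
// re-scrape compares equal and the write is skipped.
console.log(onDisk.replace(headerRegex, "") === body); // true
```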
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@ebowwa/markdown-docs-scraper",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "files": [
     "dist",
     "src"
package/src/index.ts CHANGED
@@ -367,12 +367,24 @@ export class MarkdownDocsScraper {
   }
 
   /**
-   * Save scraped pages to disk
+   * Extract body content from a file (strips header comment)
    */
-  async savePages(pages: DocPage[]): Promise<void> {
+  private extractBody(content: string): string {
+    // Match header comment and remove it
+    const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
+    return content.replace(headerRegex, "");
+  }
+
+  /**
+   * Save scraped pages to disk (only writes if content changed)
+   */
+  async savePages(pages: DocPage[]): Promise<{ updated: number; skipped: number }> {
     const fs = await import("fs/promises");
     const path = await import("path");
 
+    let updated = 0;
+    let skipped = 0;
+
     for (const page of pages) {
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
 
@@ -384,9 +396,27 @@ export class MarkdownDocsScraper {
 
       const filepath = path.join(dir, `${nameToUse}.md`);
 
+      // Check if file exists and compare content
+      try {
+        const existingContent = await fs.readFile(filepath, "utf-8");
+        const existingBody = this.extractBody(existingContent);
+
+        // Skip if content unchanged
+        if (existingBody === page.content) {
+          skipped++;
+          continue;
+        }
+      } catch {
+        // File doesn't exist, will create it
+      }
+
+      // Content changed or new file - write it
       const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
       await fs.writeFile(filepath, header + page.content, "utf-8");
+      updated++;
     }
+
+    return { updated, skipped };
   }
 
   /**
@@ -414,17 +444,21 @@ export class MarkdownDocsScraper {
  */
 export async function scrapeMarkdownDocs(
   options: ScraperOptions & { useLlms?: boolean }
-): Promise<ScraperResult> {
+): Promise<ScraperResult & { saveStats?: { updated: number; skipped: number } }> {
   const scraper = new MarkdownDocsScraper(options);
   const result = options.useLlms
     ? await scraper.scrapeFromLlms()
     : await scraper.scrape();
 
+  let saveStats;
   if (options.outputDir) {
-    await scraper.savePages(result.downloaded);
+    saveStats = await scraper.savePages(result.downloaded);
+    if (saveStats.updated > 0 || saveStats.skipped > 0) {
+      console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
+    }
   }
 
-  return result;
+  return { ...result, saveStats };
 }
 
 // ============================================================================
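For callers using the class directly rather than the `scrapeMarkdownDocs` helper, the new counters are available from `savePages` itself. A sketch under the same assumptions as above (options beyond `outputDir` elided as hypothetical):

```ts
import { MarkdownDocsScraper, type ScraperOptions } from "@ebowwa/markdown-docs-scraper";

// Only outputDir is confirmed by this diff; remaining fields elided.
const scraper = new MarkdownDocsScraper({ outputDir: "./docs" } as ScraperOptions);

const { downloaded } = await scraper.scrape();

// savePages now reports what it actually wrote instead of resolving void.
const { updated, skipped } = await scraper.savePages(downloaded);
console.log(`updated=${updated} skipped=${skipped}`);
```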