@ebowwa/markdown-docs-scraper 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +15 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +22 -2
- package/dist/scrapers/index.js +22 -2
- package/package.json +1 -1
- package/src/index.ts +39 -5
package/dist/index.d.ts
CHANGED
|
@@ -85,9 +85,16 @@ export declare class MarkdownDocsScraper {
|
|
|
85
85
|
*/
|
|
86
86
|
scrape(): Promise<ScraperResult>;
|
|
87
87
|
/**
|
|
88
|
-
*
|
|
88
|
+
* Extract body content from a file (strips header comment)
|
|
89
89
|
*/
|
|
90
|
-
|
|
90
|
+
private extractBody;
|
|
91
|
+
/**
|
|
92
|
+
* Save scraped pages to disk (only writes if content changed)
|
|
93
|
+
*/
|
|
94
|
+
savePages(pages: DocPage[]): Promise<{
|
|
95
|
+
updated: number;
|
|
96
|
+
skipped: number;
|
|
97
|
+
}>;
|
|
91
98
|
/**
|
|
92
99
|
* Get list of pages to scrape based on categories
|
|
93
100
|
*/
|
|
@@ -98,7 +105,12 @@ export declare class MarkdownDocsScraper {
|
|
|
98
105
|
*/
|
|
99
106
|
export declare function scrapeMarkdownDocs(options: ScraperOptions & {
|
|
100
107
|
useLlms?: boolean;
|
|
101
|
-
}): Promise<ScraperResult
|
|
108
|
+
}): Promise<ScraperResult & {
|
|
109
|
+
saveStats?: {
|
|
110
|
+
updated: number;
|
|
111
|
+
skipped: number;
|
|
112
|
+
};
|
|
113
|
+
}>;
|
|
102
114
|
/** Pattern for Claude Code docs: /docs/en/page.md */
|
|
103
115
|
export declare const CLAUDE_CODE_PATTERN: RegExp;
|
|
104
116
|
/** Pattern for generic docs: any domain/path.md */
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACH,OAAO,CAAC,WAAW;IAMnB;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC;IAyChF;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;IAAE,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,GAAG;IAAE,SAAS,CAAC,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC,CAe/E;AAMD,qDAAqD;AACrD,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -382,14 +382,28 @@ class MarkdownDocsScraper {
|
|
|
382
382
|
console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
|
|
383
383
|
return { downloaded, failed, duration };
|
|
384
384
|
}
|
|
385
|
+
extractBody(content) {
|
|
386
|
+
const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
|
|
387
|
+
return content.replace(headerRegex, "");
|
|
388
|
+
}
|
|
385
389
|
async savePages(pages) {
|
|
386
390
|
const fs = await import("fs/promises");
|
|
387
391
|
const path = await import("path");
|
|
392
|
+
let updated = 0;
|
|
393
|
+
let skipped = 0;
|
|
388
394
|
for (const page of pages) {
|
|
389
395
|
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
390
396
|
const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
|
|
391
397
|
await fs.mkdir(dir, { recursive: true });
|
|
392
398
|
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
399
|
+
try {
|
|
400
|
+
const existingContent = await fs.readFile(filepath, "utf-8");
|
|
401
|
+
const existingBody = this.extractBody(existingContent);
|
|
402
|
+
if (existingBody === page.content) {
|
|
403
|
+
skipped++;
|
|
404
|
+
continue;
|
|
405
|
+
}
|
|
406
|
+
} catch {}
|
|
393
407
|
const header = `<!--
|
|
394
408
|
Source: ${page.url}
|
|
395
409
|
Downloaded: ${new Date().toISOString()}
|
|
@@ -397,7 +411,9 @@ Downloaded: ${new Date().toISOString()}
|
|
|
397
411
|
|
|
398
412
|
`;
|
|
399
413
|
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
414
|
+
updated++;
|
|
400
415
|
}
|
|
416
|
+
return { updated, skipped };
|
|
401
417
|
}
|
|
402
418
|
getPagesToScrape() {
|
|
403
419
|
const pages = [];
|
|
@@ -412,10 +428,14 @@ Downloaded: ${new Date().toISOString()}
|
|
|
412
428
|
async function scrapeMarkdownDocs(options) {
|
|
413
429
|
const scraper = new MarkdownDocsScraper(options);
|
|
414
430
|
const result = options.useLlms ? await scraper.scrapeFromLlms() : await scraper.scrape();
|
|
431
|
+
let saveStats;
|
|
415
432
|
if (options.outputDir) {
|
|
416
|
-
await scraper.savePages(result.downloaded);
|
|
433
|
+
saveStats = await scraper.savePages(result.downloaded);
|
|
434
|
+
if (saveStats.updated > 0 || saveStats.skipped > 0) {
|
|
435
|
+
console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
|
|
436
|
+
}
|
|
417
437
|
}
|
|
418
|
-
return result;
|
|
438
|
+
return { ...result, saveStats };
|
|
419
439
|
}
|
|
420
440
|
var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
421
441
|
var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
|
package/dist/scrapers/index.js
CHANGED
|
@@ -226,14 +226,28 @@ class MarkdownDocsScraper {
|
|
|
226
226
|
console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
|
|
227
227
|
return { downloaded, failed, duration };
|
|
228
228
|
}
|
|
229
|
+
extractBody(content) {
|
|
230
|
+
const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
|
|
231
|
+
return content.replace(headerRegex, "");
|
|
232
|
+
}
|
|
229
233
|
async savePages(pages) {
|
|
230
234
|
const fs = await import("fs/promises");
|
|
231
235
|
const path = await import("path");
|
|
236
|
+
let updated = 0;
|
|
237
|
+
let skipped = 0;
|
|
232
238
|
for (const page of pages) {
|
|
233
239
|
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
234
240
|
const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
|
|
235
241
|
await fs.mkdir(dir, { recursive: true });
|
|
236
242
|
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
243
|
+
try {
|
|
244
|
+
const existingContent = await fs.readFile(filepath, "utf-8");
|
|
245
|
+
const existingBody = this.extractBody(existingContent);
|
|
246
|
+
if (existingBody === page.content) {
|
|
247
|
+
skipped++;
|
|
248
|
+
continue;
|
|
249
|
+
}
|
|
250
|
+
} catch {}
|
|
237
251
|
const header = `<!--
|
|
238
252
|
Source: ${page.url}
|
|
239
253
|
Downloaded: ${new Date().toISOString()}
|
|
@@ -241,7 +255,9 @@ Downloaded: ${new Date().toISOString()}
|
|
|
241
255
|
|
|
242
256
|
`;
|
|
243
257
|
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
258
|
+
updated++;
|
|
244
259
|
}
|
|
260
|
+
return { updated, skipped };
|
|
245
261
|
}
|
|
246
262
|
getPagesToScrape() {
|
|
247
263
|
const pages = [];
|
|
@@ -256,10 +272,14 @@ Downloaded: ${new Date().toISOString()}
|
|
|
256
272
|
async function scrapeMarkdownDocs(options) {
|
|
257
273
|
const scraper = new MarkdownDocsScraper(options);
|
|
258
274
|
const result = options.useLlms ? await scraper.scrapeFromLlms() : await scraper.scrape();
|
|
275
|
+
let saveStats;
|
|
259
276
|
if (options.outputDir) {
|
|
260
|
-
await scraper.savePages(result.downloaded);
|
|
277
|
+
saveStats = await scraper.savePages(result.downloaded);
|
|
278
|
+
if (saveStats.updated > 0 || saveStats.skipped > 0) {
|
|
279
|
+
console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
|
|
280
|
+
}
|
|
261
281
|
}
|
|
262
|
-
return result;
|
|
282
|
+
return { ...result, saveStats };
|
|
263
283
|
}
|
|
264
284
|
|
|
265
285
|
// src/scrapers/llms-txt.ts
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -367,12 +367,24 @@ export class MarkdownDocsScraper {
|
|
|
367
367
|
}
|
|
368
368
|
|
|
369
369
|
/**
|
|
370
|
-
*
|
|
370
|
+
* Extract body content from a file (strips header comment)
|
|
371
371
|
*/
|
|
372
|
-
|
|
372
|
+
private extractBody(content: string): string {
|
|
373
|
+
// Match header comment and remove it
|
|
374
|
+
const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
|
|
375
|
+
return content.replace(headerRegex, "");
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
/**
|
|
379
|
+
* Save scraped pages to disk (only writes if content changed)
|
|
380
|
+
*/
|
|
381
|
+
async savePages(pages: DocPage[]): Promise<{ updated: number; skipped: number }> {
|
|
373
382
|
const fs = await import("fs/promises");
|
|
374
383
|
const path = await import("path");
|
|
375
384
|
|
|
385
|
+
let updated = 0;
|
|
386
|
+
let skipped = 0;
|
|
387
|
+
|
|
376
388
|
for (const page of pages) {
|
|
377
389
|
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
378
390
|
|
|
@@ -384,9 +396,27 @@ export class MarkdownDocsScraper {
|
|
|
384
396
|
|
|
385
397
|
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
386
398
|
|
|
399
|
+
// Check if file exists and compare content
|
|
400
|
+
try {
|
|
401
|
+
const existingContent = await fs.readFile(filepath, "utf-8");
|
|
402
|
+
const existingBody = this.extractBody(existingContent);
|
|
403
|
+
|
|
404
|
+
// Skip if content unchanged
|
|
405
|
+
if (existingBody === page.content) {
|
|
406
|
+
skipped++;
|
|
407
|
+
continue;
|
|
408
|
+
}
|
|
409
|
+
} catch {
|
|
410
|
+
// File doesn't exist, will create it
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
// Content changed or new file - write it
|
|
387
414
|
const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
|
|
388
415
|
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
416
|
+
updated++;
|
|
389
417
|
}
|
|
418
|
+
|
|
419
|
+
return { updated, skipped };
|
|
390
420
|
}
|
|
391
421
|
|
|
392
422
|
/**
|
|
@@ -414,17 +444,21 @@ export class MarkdownDocsScraper {
|
|
|
414
444
|
*/
|
|
415
445
|
export async function scrapeMarkdownDocs(
|
|
416
446
|
options: ScraperOptions & { useLlms?: boolean }
|
|
417
|
-
): Promise<ScraperResult> {
|
|
447
|
+
): Promise<ScraperResult & { saveStats?: { updated: number; skipped: number } }> {
|
|
418
448
|
const scraper = new MarkdownDocsScraper(options);
|
|
419
449
|
const result = options.useLlms
|
|
420
450
|
? await scraper.scrapeFromLlms()
|
|
421
451
|
: await scraper.scrape();
|
|
422
452
|
|
|
453
|
+
let saveStats;
|
|
423
454
|
if (options.outputDir) {
|
|
424
|
-
await scraper.savePages(result.downloaded);
|
|
455
|
+
saveStats = await scraper.savePages(result.downloaded);
|
|
456
|
+
if (saveStats.updated > 0 || saveStats.skipped > 0) {
|
|
457
|
+
console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
|
|
458
|
+
}
|
|
425
459
|
}
|
|
426
460
|
|
|
427
|
-
return result;
|
|
461
|
+
return { ...result, saveStats };
|
|
428
462
|
}
|
|
429
463
|
|
|
430
464
|
// ============================================================================
|