@astrofoundry/grimoire 1.12.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/scraper.d.ts +1 -1
- package/dist/scraper.d.ts.map +1 -1
- package/dist/scraper.js +8 -4
- package/dist/scraper.js.map +1 -1
- package/package.json +1 -1
package/dist/scraper.d.ts
CHANGED
|
@@ -3,7 +3,7 @@ import type { SourceConfig } from "./config.js";
|
|
|
3
3
|
export declare function slugifyUrl(url: string): string;
|
|
4
4
|
export declare function filterUrls(urls: string[], includePatterns?: string[], excludePatterns?: string[]): string[];
|
|
5
5
|
export declare function discoverUrls(page: Page, source: SourceConfig): Promise<string[]>;
|
|
6
|
-
export declare function fetchPage(page: Page, url: string): Promise<string>;
|
|
6
|
+
export declare function fetchPage(page: Page, url: string, headed?: boolean): Promise<string>;
|
|
7
7
|
export declare function scrapeSource(source: SourceConfig, sourceName: string, dataDir: string, onProgress?: (current: number, total: number, url: string) => void): Promise<string[]>;
|
|
8
8
|
export declare function createBrowser(): Promise<Browser>;
|
|
9
9
|
//# sourceMappingURL=scraper.d.ts.map
|
package/dist/scraper.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,IAAI,EAAE,MAAM,YAAY,CAAC;AAC/D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAO9C;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,GACzB,MAAM,EAAE,CAkBV;AAoBD,wBAAsB,YAAY,CAChC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,YAAY,GACnB,OAAO,CAAC,MAAM,EAAE,CAAC,
|
|
1
|
+
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,IAAI,EAAE,MAAM,YAAY,CAAC;AAC/D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAO9C;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,GACzB,MAAM,EAAE,CAkBV;AAoBD,wBAAsB,YAAY,CAChC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,YAAY,GACnB,OAAO,CAAC,MAAM,EAAE,CAAC,CAiBnB;AAED,wBAAsB,SAAS,CAC7B,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,MAAM,CAAC,CAGjB;AAsBD,wBAAsB,YAAY,CAChC,MAAM,EAAE,YAAY,EACpB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,KAAK,IAAI,GACjE,OAAO,CAAC,MAAM,EAAE,CAAC,CAiCnB;AAED,wBAAsB,aAAa,IAAI,OAAO,CAAC,OAAO,CAAC,CAEtD"}
|
package/dist/scraper.js
CHANGED
|
@@ -39,10 +39,14 @@ export async function discoverUrls(page, source) {
|
|
|
39
39
|
}
|
|
40
40
|
await page.goto(source.start_url, { waitUntil: "domcontentloaded" });
|
|
41
41
|
const rawUrls = await page.$$eval(`${source.nav_selector} a[href]`, (links) => links.map((a) => a.href));
|
|
42
|
-
|
|
42
|
+
const discovered = filterUrls(rawUrls, source.include_patterns, source.exclude_patterns);
|
|
43
|
+
if (!discovered.includes(source.start_url)) {
|
|
44
|
+
discovered.unshift(source.start_url);
|
|
45
|
+
}
|
|
46
|
+
return discovered;
|
|
43
47
|
}
|
|
44
|
-
export async function fetchPage(page, url) {
|
|
45
|
-
await page.goto(url, { waitUntil: "domcontentloaded" });
|
|
48
|
+
export async function fetchPage(page, url, headed) {
|
|
49
|
+
await page.goto(url, { waitUntil: headed ? "networkidle" : "domcontentloaded" });
|
|
46
50
|
return page.content();
|
|
47
51
|
}
|
|
48
52
|
const DEFAULT_CONCURRENCY = 50;
|
|
@@ -71,7 +75,7 @@ export async function scrapeSource(source, sourceName, dataDir, onProgress) {
|
|
|
71
75
|
await runPool(urls, concurrency, async (url) => {
|
|
72
76
|
const page = await context.newPage();
|
|
73
77
|
try {
|
|
74
|
-
const html = await fetchPage(page, url);
|
|
78
|
+
const html = await fetchPage(page, url, source.headed);
|
|
75
79
|
const slug = slugifyUrl(url);
|
|
76
80
|
await writeFile(join(rawDir, `${slug}.html`), html, "utf-8");
|
|
77
81
|
completed++;
|
package/dist/scraper.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAG/D,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,QAAQ;SACnB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,IAAc,EACd,eAA0B,EAC1B,eAA0B;IAE1B,IAAI,QAAQ,GAAG,IAAI,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAC/E,CAAC;IAEF,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACjC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACzD,CAAC;IACJ,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACnE,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;AACvC,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,UAAkB;IAChD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IAClC,MAAM,IAAI,GAAG,CAAC,GAAG,GAAG,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAExE,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC3E,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;IACvB,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,mBAAmB,CAAC,UAAkB,EAAE,MAAoB;IACzE,MAAM,IAAI,GAAG,MAAM,gBAAgB,CAAC,UAAU,CAAC,CAAC;IAChD,OAAO,UAAU,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;AAC5E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAU,EACV,MAAoB;IAEpB,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;QACvB,OAAO,mBAAmB,CAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IAED,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAErE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAC/B,GAAG,MAAM,CAAC,YAAY,UAAU,EAChC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAuB,CAAC,IAAI,CAAC,CAC3D,CAAC;IAEF,
|
|
1
|
+
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAG/D,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,QAAQ;SACnB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,IAAc,EACd,eAA0B,EAC1B,eAA0B;IAE1B,IAAI,QAAQ,GAAG,IAAI,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAC/E,CAAC;IAEF,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACjC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACzD,CAAC;IACJ,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACnE,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;AACvC,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,UAAkB;IAChD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IAClC,MAAM,IAAI,GAAG,CAAC,GAAG,GAAG,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAExE,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC3E,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;IACvB,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,mBAAmB,CAAC,UAAkB,EAAE,MAAoB;IACzE,MAAM,IAAI,GAAG,MAAM,gBAAgB,CAAC,UAAU,CAAC,CAAC;IAChD,OAAO,UAAU,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;AAC5E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAU,EACV,MAAoB;IAEpB,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;QACvB,OAAO,mBAAmB,CAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IAED,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAErE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAC/B,GAAG,MAAM,CAAC,YAAY,UAAU,EAChC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAuB,CAAC,IAAI,CAAC,CAC3D,CAAC;IAEF,MAAM,UAAU,GAAG,UAAU,CAAC,OAAO,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;IACzF,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3C,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;IACvC,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAU,EACV,GAAW,EACX,MAAgB;IAEhB,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC,CAAC;IACjF,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,mBAAmB,GAAG,EAAE,CAAC;AAE/B,KAAK,UAAU,OAAO,CACpB,KAAU,EACV,WAAmB,EACnB,EAA6C;IAE7C,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,UAAU,MAAM;QACnB,OAAO,SAAS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,SAAS,EAAE,CAAC;YAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,KAAK,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;IAC5F,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,MAAoB,EACpB,UAAkB,EAClB,OAAe,EACf,UAAkE;IAElE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;IAChD,MAAM,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAEzC,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,IAAI,mBAAmB,CAAC;IAC9D,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IACvF,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAE3C,MAAM,aAAa,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IAC9C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;IACvD,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;IAE5B,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC;QACH,MAAM,OAAO,CAAC,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YACrC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;gBACvD,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;gBAC7B,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,OAAO,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;gBAC7D,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YAC5C,CAAC;oBAAS,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,WAAW,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;QAC1E,OAAO,IAAI,CAAC;IACd,CAAC;YAAS,CAAC;QACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa;IACjC,OAAO,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;AAChD,CAAC"}
|