@astrofoundry/grimoire 1.9.2 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/scraper.d.ts.map +1 -1
- package/dist/scraper.js +13 -6
- package/dist/scraper.js.map +1 -1
- package/package.json +1 -1
package/dist/scraper.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,IAAI,EAAE,MAAM,YAAY,CAAC;AAC/D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAO9C;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,GACzB,MAAM,EAAE,
|
|
1
|
+
{"version":3,"file":"scraper.d.ts","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAEA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,IAAI,EAAE,MAAM,YAAY,CAAC;AAC/D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAO9C;AAED,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EAAE,EACd,eAAe,CAAC,EAAE,MAAM,EAAE,EAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,GACzB,MAAM,EAAE,CAkBV;AAoBD,wBAAsB,YAAY,CAChC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,YAAY,GACnB,OAAO,CAAC,MAAM,EAAE,CAAC,CAanB;AAED,wBAAsB,SAAS,CAC7B,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,MAAM,CAAC,CAGjB;AAsBD,wBAAsB,YAAY,CAChC,MAAM,EAAE,YAAY,EACpB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,KAAK,IAAI,GACjE,OAAO,CAAC,MAAM,EAAE,CAAC,CAiCnB;AAED,wBAAsB,aAAa,IAAI,OAAO,CAAC,OAAO,CAAC,CAEtD"}
|
package/dist/scraper.js
CHANGED
|
@@ -10,7 +10,7 @@ export function slugifyUrl(url) {
|
|
|
10
10
|
.replace(/[^a-zA-Z0-9-]/g, "");
|
|
11
11
|
}
|
|
12
12
|
export function filterUrls(urls, includePatterns, excludePatterns) {
|
|
13
|
-
let filtered = urls;
|
|
13
|
+
let filtered = urls.filter((url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#"));
|
|
14
14
|
if (includePatterns && includePatterns.length > 0) {
|
|
15
15
|
filtered = filtered.filter((url) => includePatterns.some((pattern) => url.includes(pattern)));
|
|
16
16
|
}
|
|
@@ -19,10 +19,18 @@ export function filterUrls(urls, includePatterns, excludePatterns) {
|
|
|
19
19
|
}
|
|
20
20
|
return [...new Set(filtered)].sort();
|
|
21
21
|
}
|
|
22
|
-
async function
|
|
22
|
+
async function fetchSitemapUrls(sitemapUrl) {
|
|
23
23
|
const response = await fetch(sitemapUrl);
|
|
24
24
|
const xml = await response.text();
|
|
25
|
-
const
|
|
25
|
+
const locs = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map((m) => m[1]);
|
|
26
|
+
if (xml.includes("<sitemapindex")) {
|
|
27
|
+
const nested = await Promise.all(locs.map((loc) => fetchSitemapUrls(loc)));
|
|
28
|
+
return nested.flat();
|
|
29
|
+
}
|
|
30
|
+
return locs;
|
|
31
|
+
}
|
|
32
|
+
async function discoverFromSitemap(sitemapUrl, source) {
|
|
33
|
+
const urls = await fetchSitemapUrls(sitemapUrl);
|
|
26
34
|
return filterUrls(urls, source.include_patterns, source.exclude_patterns);
|
|
27
35
|
}
|
|
28
36
|
export async function discoverUrls(page, source) {
|
|
@@ -31,14 +39,13 @@ export async function discoverUrls(page, source) {
|
|
|
31
39
|
}
|
|
32
40
|
await page.goto(source.start_url, { waitUntil: "domcontentloaded" });
|
|
33
41
|
const rawUrls = await page.$$eval(`${source.nav_selector} a[href]`, (links) => links.map((a) => a.href));
|
|
34
|
-
|
|
35
|
-
return filterUrls(cleanUrls, source.include_patterns, source.exclude_patterns);
|
|
42
|
+
return filterUrls(rawUrls, source.include_patterns, source.exclude_patterns);
|
|
36
43
|
}
|
|
37
44
|
export async function fetchPage(page, url) {
|
|
38
45
|
await page.goto(url, { waitUntil: "domcontentloaded" });
|
|
39
46
|
return page.content();
|
|
40
47
|
}
|
|
41
|
-
const DEFAULT_CONCURRENCY =
|
|
48
|
+
const DEFAULT_CONCURRENCY = 50;
|
|
42
49
|
async function runPool(items, concurrency, fn) {
|
|
43
50
|
let nextIndex = 0;
|
|
44
51
|
async function worker() {
|
package/dist/scraper.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAG/D,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,QAAQ;SACnB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,IAAc,EACd,eAA0B,EAC1B,eAA0B;IAE1B,IAAI,QAAQ,GAAG,IAAI,CAAC;
|
|
1
|
+
{"version":3,"file":"scraper.js","sourceRoot":"","sources":["../src/scraper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAG/D,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,QAAQ;SACnB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,IAAc,EACd,eAA0B,EAC1B,eAA0B;IAE1B,IAAI,QAAQ,GAAG,IAAI,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAC/E,CAAC;IAEF,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACjC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACzD,CAAC;IACJ,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CACnE,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;AACvC,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,UAAkB;IAChD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IAClC,MAAM,IAAI,GAAG,CAAC,GAAG,GAAG,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAExE,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC3E,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;IACvB,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,mBAAmB,CAAC,UAAkB,EAAE,MAAoB;IACzE,MAAM,IAAI,GAAG,MAAM,gBAAgB,CAAC,UAAU,CAAC,CAAC;IAChD,OAAO,UAAU,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;AAC5E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAU,EACV,MAAoB;IAEpB,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;QACvB,OAAO,mBAAmB,CAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IAED,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAErE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAC/B,GAAG,MAAM,CAAC,YAAY,UAAU,EAChC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAuB,CAAC,IAAI,CAAC,CAC3D,CAAC;IAEF,OAAO,UAAU,CAAC,OAAO,EAAE,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;AAC/E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAU,EACV,GAAW;IAEX,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IACxD,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,mBAAmB,GAAG,EAAE,CAAC;AAE/B,KAAK,UAAU,OAAO,CACpB,KAAU,EACV,WAAmB,EACnB,EAA6C;IAE7C,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,UAAU,MAAM;QACnB,OAAO,SAAS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,SAAS,EAAE,CAAC;YAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,KAAK,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;IAC5F,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,MAAoB,EACpB,UAAkB,EAClB,OAAe,EACf,UAAkE;IAElE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;IAChD,MAAM,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAEzC,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,IAAI,mBAAmB,CAAC;IAC9D,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IACvF,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAE3C,MAAM,aAAa,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IAC9C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;IACvD,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;IAE5B,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC;QACH,MAAM,OAAO,CAAC,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YACrC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBACxC,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;gBAC7B,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,OAAO,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;gBAC7D,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YAC5C,CAAC;oBAAS,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,WAAW,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;QAC1E,OAAO,IAAI,CAAC;IACd,CAAC;YAAS,CAAC;QACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa;IACjC,OAAO,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;AAChD,CAAC"}
|