@crawlee/cheerio 4.0.0-beta.41 → 4.0.0-beta.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -141,8 +141,8 @@ export declare class CheerioCrawler<ContextExtension = Dictionary<never>, Extend
|
|
|
141
141
|
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>);
|
|
142
142
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
143
143
|
protected buildContextPipeline(): import("@crawlee/http").ContextPipeline<import("@crawlee/http").CrawlingContext<Dictionary>, InternalHttpCrawlingContext<any, any> & {
|
|
144
|
-
|
|
145
|
-
|
|
144
|
+
readonly body: string;
|
|
145
|
+
readonly $: CheerioAPI;
|
|
146
146
|
} & {
|
|
147
147
|
enqueueLinks: (enqueueOptions?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
|
|
148
148
|
waitForSelector: (selector: string, _timeoutMs?: number) => Promise<void>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cheerio-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,oBAAoB,EACpB,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,
|
|
1
|
+
{"version":3,"file":"cheerio-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,oBAAoB,EACpB,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,sBAAsB,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACzE,OAAO,EAAE,KAAK,WAAW,EAA0B,KAAK,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC9F,OAAO,KAAK,EAAE,UAAU,EAAkB,MAAM,SAAS,CAAC;AAC1D,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,MAAM,MAAM,mBAAmB,CAC3B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE7D,MAAM,WAAW,qBAAqB,CAClC,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,sBAAsB,GAAG,sBAAsB,GAAG,gBAAgB,EAC1F,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,EAAE,gBAAgB,EAAE,eAAe,CAAC;CAAG;AAE9G,MAAM,MAAM,WAAW,CACnB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAEjE,MAAM,WAAW,sBAAsB,CACnC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;IACrD;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IAEb;;;OAGG;IACH,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC;IAEtB;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;;;OAaG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9E;;OAEG;IACH,YAAY,CAAC,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAAC;CAChF;AAED,MAAM,MAAM,qBAAqB,CAC7B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4EG;AACH,qBAAa,cAAc,CACvB,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,sBAAsB,GAAG,sBAAsB,GAAG,gBAAg
B,CAC5F,SAAQ,WAAW,CAAC,sBAAsB,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC5E;;OAEG;gBACS,OAAO,CAAC,EAAE,qBAAqB,CAAC,gBAAgB,EAAE,eAAe,CAAC;cAS3D,oBAAoB;uBA2BX,MAAM;oBAMT,UAAU;;wCAiBW,mBAAmB;oCAYvB,MAAM,eAAe,MAAM;sCAKzB,MAAM,cAAc,MAAM;;YA1DxD,YAAY;YAqCZ,UAAU;CA8B3B;AAED,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,mBAAmB,CAAC;IAC9B,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,UAAU,gCAAgC;IACtC,YAAY,EAAE,oBAAoB,CAAC,cAAc,CAAC,CAAC;IACnD,OAAO,CAAC,EAAE,mBAAmB,CAAC;IAC9B,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAC7B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AASD,gBAAgB;AAChB,wBAAsB,0BAA0B,CAC5C,OAAO,EAAE,2BAA2B,GAAG,gCAAgC,oBAmC1E;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,mBAAmB,CAC/B,OAAO,SAAS,sBAAsB,GAAG,sBAAsB,EAC/D,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,kDAEzC"}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
|
|
1
|
+
import { enqueueLinks, HttpCrawler, NavigationSkippedError, resolveBaseUrlForEnqueueLinksFiltering, Router, } from '@crawlee/http';
|
|
2
2
|
import { extractUrlsFromCheerio } from '@crawlee/utils';
|
|
3
3
|
import * as cheerio from 'cheerio';
|
|
4
4
|
import { parseDocument } from 'htmlparser2';
|
|
@@ -99,18 +99,33 @@ export class CheerioCrawler extends HttpCrawler {
|
|
|
99
99
|
.compose({ action: async (context) => await this.addHelpers(context) });
|
|
100
100
|
}
|
|
101
101
|
async parseContent(crawlingContext) {
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
102
|
+
try {
|
|
103
|
+
const isXml = crawlingContext.contentType.type.includes('xml');
|
|
104
|
+
const body = Buffer.isBuffer(crawlingContext.body)
|
|
105
|
+
? crawlingContext.body.toString(crawlingContext.contentType.encoding)
|
|
106
|
+
: crawlingContext.body;
|
|
107
|
+
const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
|
|
108
|
+
const $ = cheerio.load(dom, {
|
|
109
|
+
xml: { decodeEntities: true, xmlMode: isXml },
|
|
110
|
+
});
|
|
111
|
+
return {
|
|
112
|
+
$,
|
|
113
|
+
body,
|
|
114
|
+
};
|
|
115
|
+
}
|
|
116
|
+
catch (err) {
|
|
117
|
+
if (err instanceof NavigationSkippedError) {
|
|
118
|
+
return {
|
|
119
|
+
get body() {
|
|
120
|
+
throw new NavigationSkippedError('The `body` property is not available - `skipNavigation` was used', { cause: err });
|
|
121
|
+
},
|
|
122
|
+
get $() {
|
|
123
|
+
throw new NavigationSkippedError('The `$` property is not available - `skipNavigation` was used', { cause: err });
|
|
124
|
+
},
|
|
125
|
+
};
|
|
126
|
+
}
|
|
127
|
+
throw err;
|
|
128
|
+
}
|
|
114
129
|
}
|
|
115
130
|
async addHelpers(crawlingContext) {
|
|
116
131
|
const originalEnqueueLinks = crawlingContext.enqueueLinks;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cheerio-crawler.js","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAaA,OAAO,
|
|
1
|
+
{"version":3,"file":"cheerio-crawler.js","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAaA,OAAO,EACH,YAAY,EACZ,WAAW,EACX,sBAAsB,EACtB,sCAAsC,EACtC,MAAM,GACT,MAAM,eAAe,CAAC;AAEvB,OAAO,EAAoB,sBAAsB,EAAsB,MAAM,gBAAgB,CAAC;AAE9F,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AA2E5C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4EG;AACH,MAAM,OAAO,cAGX,SAAQ,WAAsE;IAC5E;;OAEG;IACH,YAAY,OAAkE;QAC1E,MAAM,EAAE,sBAAsB,EAAE,GAAG,IAAI,EAAE,GAAG,OAAO,IAAI,EAAE,CAAC;QAE1D,KAAK,CAAC;YACF,GAAG,IAAI;YACP,sBAAsB,EAAE,sBAAsB,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;SACxF,CAAC,CAAC;IACP,CAAC;IAEkB,oBAAoB;QACnC,OAAO,KAAK;aACP,oBAAoB,EAAE;aACtB,OAAO,CAAC;YACL,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;SAC9D,CAAC;aACD,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IAChF,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,eAA4C;QACnE,IAAI,CAAC;YACD,MAAM,KAAK,GAAG,eAAe,CAAC,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;YAC/D,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,CAAC;gBAC9C,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC,WAAW,CAAC,QAAQ,CAAC;gBACrE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC;YAC3B,MAAM,GAAG,GAAG,aAAa,CAAC,IAAI,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;YAC1E,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE;gBACxB,GAAG,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE;aAC9B,CAAC,CAAC;YAErB,OAAO;gBACH,CAAC;gBACD,IAAI;aACP,CAAC;QACN,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,IAAI,GAAG,YAAY,sBAAsB,EAAE,CAAC;gBACxC,OAAO;oBACH,IAAI,IAAI;wBACJ,MAAM,IAAI,sBAAsB,CAC5B,kEAAkE,EAClE,EAAE,KAAK,EAAE,GAAG,EAAE,CACjB,CAAC;oBACN,CAAC;oBACD,IAAI,CAAC;wBACD,MAAM,IAAI,sBAAsB,CAC5B,+DAA+D,EAC/D,EAAE,KAAK,EAAE,GAAG,EAAE,CACjB,CAAC;oBACN,CAAC;iBACJ,CAAC;YACN,CAAC;YAED,MAAM,GAAG,CAAC;QACd,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,UAAU,CAAC,eAAgE;QACrF,MAAM,oBAAoB,GAAG,eAAe,CAAC,YAAY,CAAC;QAE1D,OAAO;YACH,YAAY,EAAE,KAAK,EAAE,cAAoC
,EAAE,EAAE;gBACzD,OAAO,CAAC,MAAM,0BAA0B,CAAC;oBACrC,OAAO,EAAE,EAAE,GAAG,cAAc,EAAE,KAAK,EAAE,IAAI,CAAC,6BAA6B,CAAC,cAAc,EAAE,KAAK,CAAC,EAAE;oBAChG,CAAC,EAAE,eAAe,CAAC,CAAC;oBACpB,YAAY,EAAE,MAAM,IAAI,CAAC,eAAe,EAAE;oBAC1C,aAAa,EAAE,MAAM,IAAI,CAAC,sBAAsB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC;oBAC7E,gBAAgB,EAAE,IAAI,CAAC,oBAAoB;oBAC3C,kBAAkB,EAAE,eAAe,CAAC,OAAO,CAAC,GAAG;oBAC/C,eAAe,EAAE,eAAe,CAAC,OAAO,CAAC,SAAS;oBAClD,YAAY,EAAE,oBAAoB;iBACrC,CAAC,CAA2B,CAAC,CAAC,2BAA2B;YAC9D,CAAC;YACD,eAAe,EAAE,KAAK,EAAE,QAAgB,EAAE,UAAmB,EAAE,EAAE;gBAC7D,IAAI,eAAe,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;oBACjD,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;gBACzD,CAAC;YACL,CAAC;YACD,gBAAgB,EAAE,KAAK,EAAE,QAAiB,EAAE,SAAkB,EAAE,EAAE;gBAC9D,IAAI,QAAQ,EAAE,CAAC;oBACX,MAAM,eAAe,CAAC,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;gBAC/D,CAAC;gBAED,OAAO,eAAe,CAAC,CAAC,CAAC;YAC7B,CAAC;SACJ,CAAC;IACN,CAAC;CACJ;AAoBD,gBAAgB;AAChB,SAAS,oBAAoB,CACzB,OAAuE;IAEvE,OAAO,CAAC,CAAE,OAA4C,CAAC,YAAY,CAAC;AACxE,CAAC;AAED,gBAAgB;AAChB,MAAM,CAAC,KAAK,UAAU,0BAA0B,CAC5C,OAAuE;IAEvE,MAAM,EAAE,OAAO,EAAE,mBAAmB,EAAE,CAAC,EAAE,kBAAkB,EAAE,eAAe,EAAE,GAAG,OAAO,CAAC;IACzF,IAAI,CAAC,CAAC,EAAE,CAAC;QACL,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,OAAO,GAAG,sCAAsC,CAAC;QACnD,eAAe,EAAE,mBAAmB,EAAE,QAAQ;QAC9C,eAAe;QACf,kBAAkB;QAClB,mBAAmB,EAAE,mBAAmB,EAAE,OAAO;KACpD,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,sBAAsB,CAC/B,CAAC,EACD,mBAAmB,EAAE,QAAQ,IAAI,GAAG,EACpC,mBAAmB,EAAE,OAAO,IAAI,eAAe,IAAI,kBAAkB,CACxE,CAAC;IAEF,IAAI,oBAAoB,CAAC,OAAO,CAAC,EAAE,CAAC;QAChC,OAAO,OAAO,CAAC,YAAY,CAAC;YACxB,IAAI;YACJ,OAAO;YACP,GAAG,mBAAmB;SACzB,CAAC,CAAC;IACP,CAAC;IACD,OAAO,YAAY,CAAC;QAChB,YAAY,EAAE,OAAO,CAAC,YAAY;QAClC,aAAa,EAAE,OAAO,CAAC,aAAa;QACpC,gBAAgB,EAAE,OAAO,CAAC,gBAAgB;QAC1C,IAAI;QACJ,OAAO;QACP,GAAG,mBAAmB;KACzB,CAAC,CAAC;AACP,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,UAAU,mBAAmB,CAGjC,MAAwC;IACtC,OAAO,MAAM,CAAC,MAAM,CAAU,MAAM,CAAC,CAAC;AAC1C,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/cheerio",
|
|
3
|
-
"version": "4.0.0-beta.41",
|
|
3
|
+
"version": "4.0.0-beta.42",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=22.0.0"
|
|
@@ -47,9 +47,9 @@
|
|
|
47
47
|
"access": "public"
|
|
48
48
|
},
|
|
49
49
|
"dependencies": {
|
|
50
|
-
"@crawlee/http": "4.0.0-beta.
|
|
51
|
-
"@crawlee/types": "4.0.0-beta.
|
|
52
|
-
"@crawlee/utils": "4.0.0-beta.
|
|
50
|
+
"@crawlee/http": "4.0.0-beta.42",
|
|
51
|
+
"@crawlee/types": "4.0.0-beta.42",
|
|
52
|
+
"@crawlee/utils": "4.0.0-beta.42",
|
|
53
53
|
"cheerio": "^1.0.0",
|
|
54
54
|
"htmlparser2": "^10.0.0",
|
|
55
55
|
"tslib": "^2.8.1"
|
|
@@ -61,5 +61,5 @@
|
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
},
|
|
64
|
-
"gitHead": "
|
|
64
|
+
"gitHead": "fe1827977c6ca78c509f2a7e1106c48b20c1cbf2"
|
|
65
65
|
}
|