@crawlee/cheerio 4.0.0-beta.31 → 4.0.0-beta.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,6 +2,7 @@ import type { BasicCrawlingContext, EnqueueLinksOptions, ErrorHandler, GetUserDa
|
|
|
2
2
|
import { HttpCrawler } from '@crawlee/http';
|
|
3
3
|
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
|
|
4
4
|
import { type CheerioRoot, type RobotsTxtFile } from '@crawlee/utils';
|
|
5
|
+
import type { CheerioAPI } from 'cheerio';
|
|
5
6
|
import * as cheerio from 'cheerio';
|
|
6
7
|
export type CheerioErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
7
8
|
JSONData extends Dictionary = any> = ErrorHandler<CheerioCrawlingContext<UserData, JSONData>>;
|
|
@@ -138,6 +139,15 @@ export declare class CheerioCrawler<ContextExtension = Dictionary<never>, Extend
|
|
|
138
139
|
* All `CheerioCrawler` parameters are passed via an options object.
|
|
139
140
|
*/
|
|
140
141
|
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>);
|
|
142
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
143
|
+
protected buildContextPipeline(): import("@crawlee/http").ContextPipeline<import("@crawlee/http").CrawlingContext<Dictionary>, InternalHttpCrawlingContext<any, any> & {
|
|
144
|
+
$: CheerioAPI;
|
|
145
|
+
body: string;
|
|
146
|
+
} & {
|
|
147
|
+
enqueueLinks: (enqueueOptions?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
|
|
148
|
+
waitForSelector: (selector: string, _timeoutMs?: number) => Promise<void>;
|
|
149
|
+
parseWithCheerio: (selector?: string, timeoutMs?: number) => Promise<CheerioAPI>;
|
|
150
|
+
}>;
|
|
141
151
|
private parseContent;
|
|
142
152
|
private addHelpers;
|
|
143
153
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cheerio-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,oBAAoB,EACpB,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAgB,WAAW,EAAkD,MAAM,eAAe,CAAC;AAC1G,OAAO,KAAK,EAAE,sBAAsB,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACzE,OAAO,EAAE,KAAK,WAAW,EAA0B,KAAK,aAAa,EAAE,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"cheerio-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,oBAAoB,EACpB,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAgB,WAAW,EAAkD,MAAM,eAAe,CAAC;AAC1G,OAAO,KAAK,EAAE,sBAAsB,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACzE,OAAO,EAAE,KAAK,WAAW,EAA0B,KAAK,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC9F,OAAO,KAAK,EAAE,UAAU,EAAkB,MAAM,SAAS,CAAC;AAC1D,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,MAAM,MAAM,mBAAmB,CAC3B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE7D,MAAM,WAAW,qBAAqB,CAClC,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,sBAAsB,GAAG,sBAAsB,GAAG,gBAAgB,EAC1F,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,EAAE,gBAAgB,EAAE,eAAe,CAAC;CAAG;AAE9G,MAAM,MAAM,WAAW,CACnB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAEjE,MAAM,WAAW,sBAAsB,CACnC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;IACrD;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IAEb;;;OAGG;IACH,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC;IAEtB;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;;;OAaG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9E;;OAEG;IACH,YAAY,CAAC,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAAC;CAChF;AAED,MAAM,MAAM,qBAAqB,CAC7B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4EG;AACH,qBAAa,cAAc,CACvB,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,sBAAsB,GAAG,sBAAsB,GAAG,gBA
AgB,CAC5F,SAAQ,WAAW,CAAC,sBAAsB,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC5E;;OAEG;gBACS,OAAO,CAAC,EAAE,qBAAqB,CAAC,gBAAgB,EAAE,eAAe,CAAC;cAS3D,oBAAoB;;;;wCA6BO,mBAAmB;oCAYvB,MAAM,eAAe,MAAM;sCAKzB,MAAM,cAAc,MAAM;;YArCxD,YAAY;YAgBZ,UAAU;CA8B3B;AAED,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,mBAAmB,CAAC;IAC9B,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,UAAU,gCAAgC;IACtC,YAAY,EAAE,oBAAoB,CAAC,cAAc,CAAC,CAAC;IACnD,OAAO,CAAC,EAAE,mBAAmB,CAAC;IAC9B,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC;IAC7B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AASD,gBAAgB;AAChB,wBAAsB,0BAA0B,CAC5C,OAAO,EAAE,2BAA2B,GAAG,gCAAgC,oBAmC1E;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,mBAAmB,CAC/B,OAAO,SAAS,sBAAsB,GAAG,sBAAsB,EAC/D,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,kDAEzC"}
|
|
@@ -84,15 +84,20 @@ export class CheerioCrawler extends HttpCrawler {
|
|
|
84
84
|
* All `CheerioCrawler` parameters are passed via an options object.
|
|
85
85
|
*/
|
|
86
86
|
constructor(options) {
|
|
87
|
+
const { contextPipelineBuilder, ...rest } = options ?? {};
|
|
87
88
|
super({
|
|
88
|
-
...
|
|
89
|
-
contextPipelineBuilder: () => this.buildContextPipeline()
|
|
90
|
-
.compose({
|
|
91
|
-
action: async (context) => await this.parseContent(context),
|
|
92
|
-
})
|
|
93
|
-
.compose({ action: async (context) => await this.addHelpers(context) }),
|
|
89
|
+
...rest,
|
|
90
|
+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
|
|
94
91
|
});
|
|
95
92
|
}
|
|
93
|
+
buildContextPipeline() {
|
|
94
|
+
return super
|
|
95
|
+
.buildContextPipeline()
|
|
96
|
+
.compose({
|
|
97
|
+
action: async (context) => await this.parseContent(context),
|
|
98
|
+
})
|
|
99
|
+
.compose({ action: async (context) => await this.addHelpers(context) });
|
|
100
|
+
}
|
|
96
101
|
async parseContent(crawlingContext) {
|
|
97
102
|
const isXml = crawlingContext.contentType.type.includes('xml');
|
|
98
103
|
const body = Buffer.isBuffer(crawlingContext.body)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cheerio-crawler.js","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAaA,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,sCAAsC,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAE1G,OAAO,EAAoB,sBAAsB,EAAsB,MAAM,gBAAgB,CAAC;AAE9F,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AA2E5C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4EG;AACH,MAAM,OAAO,cAGX,SAAQ,WAAsE;IAC5E;;OAEG;IACH,YAAY,OAAkE;QAC1E,KAAK,CAAC;YACF,GAAG,
|
|
1
|
+
{"version":3,"file":"cheerio-crawler.js","sourceRoot":"","sources":["../../src/internals/cheerio-crawler.ts"],"names":[],"mappings":"AAaA,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,sCAAsC,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAE1G,OAAO,EAAoB,sBAAsB,EAAsB,MAAM,gBAAgB,CAAC;AAE9F,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AA2E5C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4EG;AACH,MAAM,OAAO,cAGX,SAAQ,WAAsE;IAC5E;;OAEG;IACH,YAAY,OAAkE;QAC1E,MAAM,EAAE,sBAAsB,EAAE,GAAG,IAAI,EAAE,GAAG,OAAO,IAAI,EAAE,CAAC;QAE1D,KAAK,CAAC;YACF,GAAG,IAAI;YACP,sBAAsB,EAAE,sBAAsB,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;SACxF,CAAC,CAAC;IACP,CAAC;IAEkB,oBAAoB;QACnC,OAAO,KAAK;aACP,oBAAoB,EAAE;aACtB,OAAO,CAAC;YACL,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;SAC9D,CAAC;aACD,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IAChF,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,eAA4C;QACnE,MAAM,KAAK,GAAG,eAAe,CAAC,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QAC/D,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,CAAC;YAC9C,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC,WAAW,CAAC,QAAQ,CAAC;YACrE,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC;QAC3B,MAAM,GAAG,GAAG,aAAa,CAAC,IAAI,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;QAC1E,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE;YACxB,GAAG,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE;SAC9B,CAAC,CAAC;QAErB,OAAO;YACH,CAAC;YACD,IAAI;SACP,CAAC;IACN,CAAC;IAEO,KAAK,CAAC,UAAU,CAAC,eAAgE;QACrF,MAAM,oBAAoB,GAAG,eAAe,CAAC,YAAY,CAAC;QAE1D,OAAO;YACH,YAAY,EAAE,KAAK,EAAE,cAAoC,EAAE,EAAE;gBACzD,OAAO,CAAC,MAAM,0BAA0B,CAAC;oBACrC,OAAO,EAAE,EAAE,GAAG,cAAc,EAAE,KAAK,EAAE,IAAI,CAAC,6BAA6B,CAAC,cAAc,EAAE,KAAK,CAAC,EAAE;oBAChG,CAAC,EAAE,eAAe,CAAC,CAAC;oBACpB,YAAY,EAAE,MAAM,IAAI,CAAC,eAAe,EAAE;oBAC1C,aAAa,EAAE,MAAM,IAAI,CAAC,sBAAsB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC;oBAC7E,gBAAgB,EAAE,IAAI,CAAC,oBAAoB;oBAC3C,kBAAkB,EAAE,eAAe,CAAC,OAAO,CAAC,GAAG;oBAC/C,eAAe,
EAAE,eAAe,CAAC,OAAO,CAAC,SAAS;oBAClD,YAAY,EAAE,oBAAoB;iBACrC,CAAC,CAA2B,CAAC,CAAC,2BAA2B;YAC9D,CAAC;YACD,eAAe,EAAE,KAAK,EAAE,QAAgB,EAAE,UAAmB,EAAE,EAAE;gBAC7D,IAAI,eAAe,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;oBACjD,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;gBACzD,CAAC;YACL,CAAC;YACD,gBAAgB,EAAE,KAAK,EAAE,QAAiB,EAAE,SAAkB,EAAE,EAAE;gBAC9D,IAAI,QAAQ,EAAE,CAAC;oBACX,MAAM,eAAe,CAAC,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;gBAC/D,CAAC;gBAED,OAAO,eAAe,CAAC,CAAC,CAAC;YAC7B,CAAC;SACJ,CAAC;IACN,CAAC;CACJ;AAoBD,gBAAgB;AAChB,SAAS,oBAAoB,CACzB,OAAuE;IAEvE,OAAO,CAAC,CAAE,OAA4C,CAAC,YAAY,CAAC;AACxE,CAAC;AAED,gBAAgB;AAChB,MAAM,CAAC,KAAK,UAAU,0BAA0B,CAC5C,OAAuE;IAEvE,MAAM,EAAE,OAAO,EAAE,mBAAmB,EAAE,CAAC,EAAE,kBAAkB,EAAE,eAAe,EAAE,GAAG,OAAO,CAAC;IACzF,IAAI,CAAC,CAAC,EAAE,CAAC;QACL,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,OAAO,GAAG,sCAAsC,CAAC;QACnD,eAAe,EAAE,mBAAmB,EAAE,QAAQ;QAC9C,eAAe;QACf,kBAAkB;QAClB,mBAAmB,EAAE,mBAAmB,EAAE,OAAO;KACpD,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,sBAAsB,CAC/B,CAAC,EACD,mBAAmB,EAAE,QAAQ,IAAI,GAAG,EACpC,mBAAmB,EAAE,OAAO,IAAI,eAAe,IAAI,kBAAkB,CACxE,CAAC;IAEF,IAAI,oBAAoB,CAAC,OAAO,CAAC,EAAE,CAAC;QAChC,OAAO,OAAO,CAAC,YAAY,CAAC;YACxB,IAAI;YACJ,OAAO;YACP,GAAG,mBAAmB;SACzB,CAAC,CAAC;IACP,CAAC;IACD,OAAO,YAAY,CAAC;QAChB,YAAY,EAAE,OAAO,CAAC,YAAY;QAClC,aAAa,EAAE,OAAO,CAAC,aAAa;QACpC,gBAAgB,EAAE,OAAO,CAAC,gBAAgB;QAC1C,IAAI;QACJ,OAAO;QACP,GAAG,mBAAmB;KACzB,CAAC,CAAC;AACP,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,UAAU,mBAAmB,CAGjC,MAAwC;IACtC,OAAO,MAAM,CAAC,MAAM,CAAU,MAAM,CAAC,CAAC;AAC1C,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/cheerio",
|
|
3
|
-
"version": "4.0.0-beta.31",
|
|
3
|
+
"version": "4.0.0-beta.32",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=22.0.0"
|
|
@@ -47,9 +47,9 @@
|
|
|
47
47
|
"access": "public"
|
|
48
48
|
},
|
|
49
49
|
"dependencies": {
|
|
50
|
-
"@crawlee/http": "4.0.0-beta.31",
|
|
51
|
-
"@crawlee/types": "4.0.0-beta.31",
|
|
52
|
-
"@crawlee/utils": "4.0.0-beta.31",
|
|
50
|
+
"@crawlee/http": "4.0.0-beta.32",
|
|
51
|
+
"@crawlee/types": "4.0.0-beta.32",
|
|
52
|
+
"@crawlee/utils": "4.0.0-beta.32",
|
|
53
53
|
"cheerio": "^1.0.0",
|
|
54
54
|
"htmlparser2": "^10.0.0",
|
|
55
55
|
"tslib": "^2.8.1"
|
|
@@ -61,5 +61,5 @@
|
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
},
|
|
64
|
-
"gitHead": "
|
|
64
|
+
"gitHead": "86c266f094f0ac2bcf8c376512e16b677279a614"
|
|
65
65
|
}
|