@crawlee/cheerio 3.13.3-beta.8 → 3.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -57,23 +57,23 @@ JSONData extends Dictionary = any> = RequestHandler<CheerioCrawlingContext<UserD
|
|
|
57
57
|
*
|
|
58
58
|
* Since `CheerioCrawler` uses raw HTTP requests to download web pages,
|
|
59
59
|
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
|
|
60
|
-
* to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
|
|
60
|
+
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
|
|
61
61
|
* because it loads the pages using full-featured headless Chrome browser.
|
|
62
62
|
*
|
|
63
63
|
* `CheerioCrawler` downloads each URL using a plain HTTP request,
|
|
64
64
|
* parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
|
|
65
|
-
* and then invokes the user-provided {@apilink CheerioCrawlerOptions.requestHandler} to extract page data
|
|
65
|
+
* and then invokes the user-provided {@link CheerioCrawlerOptions.requestHandler} to extract page data
|
|
66
66
|
* using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
|
|
67
67
|
*
|
|
68
|
-
* The source URLs are represented using {@apilink Request} objects that are fed from
|
|
69
|
-
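* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink CheerioCrawlerOptions.requestList}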
|
|
70
|
-
* or {@apilink CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
68
|
+
* The source URLs are represented using {@link Request} objects that are fed from
|
|
69
|
+
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
|
|
70
|
+
* or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
71
71
|
*
|
|
72
|
-
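* If both {@apilink CheerioCrawlerOptions.requestList} and {@apilink CheerioCrawlerOptions.requestQueue} are used,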
|
|
73
|
-
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
|
|
74
|
-
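* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.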
|
|
72
|
+
* If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
|
|
73
|
+
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
|
|
74
|
+
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
75
75
|
*
|
|
76
|
-
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
|
|
76
|
+
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
77
77
|
*
|
|
78
78
|
* We can use the `preNavigationHooks` to adjust `gotOptions`:
|
|
79
79
|
*
|
|
@@ -88,15 +88,15 @@ JSONData extends Dictionary = any> = RequestHandler<CheerioCrawlingContext<UserD
|
|
|
88
88
|
* By default, `CheerioCrawler` only processes web pages with the `text/html`
|
|
89
89
|
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
90
90
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
91
|
-
* use the {@apilink CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
91
|
+
* use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
92
92
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
93
|
-
* For more details, see {@apilink CheerioCrawlerOptions.requestHandler}.
|
|
93
|
+
* For more details, see {@link CheerioCrawlerOptions.requestHandler}.
|
|
94
94
|
*
|
|
95
95
|
* New requests are only dispatched when there is enough free CPU and memory available,
|
|
96
|
-
* using the functionality provided by the {@apilink AutoscaledPool} class.
|
|
97
|
-
* All {@apilink AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
96
|
+
* using the functionality provided by the {@link AutoscaledPool} class.
|
|
97
|
+
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
98
98
|
* parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
99
|
-
* {@apilink AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
99
|
+
* {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
100
100
|
*
|
|
101
101
|
* **Example usage:**
|
|
102
102
|
*
|
|
@@ -154,9 +154,9 @@ interface EnqueueLinksInternalOptions {
|
|
|
154
154
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
155
155
|
export declare function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
|
|
156
156
|
/**
|
|
157
|
-
* Creates new {@apilink Router} instance that works based on request labels.
|
|
158
|
-
* This instance can then serve as a `requestHandler` of your {@apilink CheerioCrawler}.
|
|
159
|
-
* Defaults to the {@apilink CheerioCrawlingContext}.
|
|
157
|
+
* Creates new {@link Router} instance that works based on request labels.
|
|
158
|
+
* This instance can then serve as a `requestHandler` of your {@link CheerioCrawler}.
|
|
159
|
+
* Defaults to the {@link CheerioCrawlingContext}.
|
|
160
160
|
*
|
|
161
161
|
* > Serves as a shortcut for using `Router.create<CheerioCrawlingContext>()`.
|
|
162
162
|
*
|
|
@@ -18,23 +18,23 @@ const WritableStream_1 = require("htmlparser2/lib/WritableStream");
|
|
|
18
18
|
*
|
|
19
19
|
* Since `CheerioCrawler` uses raw HTTP requests to download web pages,
|
|
20
20
|
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
|
|
21
|
-
* to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
|
|
21
|
+
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
|
|
22
22
|
* because it loads the pages using full-featured headless Chrome browser.
|
|
23
23
|
*
|
|
24
24
|
* `CheerioCrawler` downloads each URL using a plain HTTP request,
|
|
25
25
|
* parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
|
|
26
|
-
* and then invokes the user-provided {@apilink CheerioCrawlerOptions.requestHandler} to extract page data
|
|
26
|
+
* and then invokes the user-provided {@link CheerioCrawlerOptions.requestHandler} to extract page data
|
|
27
27
|
* using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
|
|
28
28
|
*
|
|
29
|
-
* The source URLs are represented using {@apilink Request} objects that are fed from
|
|
30
|
-
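* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink CheerioCrawlerOptions.requestList}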
|
|
31
|
-
* or {@apilink CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
29
|
+
* The source URLs are represented using {@link Request} objects that are fed from
|
|
30
|
+
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
|
|
31
|
+
* or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
32
32
|
*
|
|
33
|
-
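* If both {@apilink CheerioCrawlerOptions.requestList} and {@apilink CheerioCrawlerOptions.requestQueue} are used,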
|
|
34
|
-
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
|
|
35
|
-
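* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.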
|
|
33
|
+
* If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
|
|
34
|
+
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
|
|
35
|
+
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
36
36
|
*
|
|
37
|
-
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
|
|
37
|
+
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
38
38
|
*
|
|
39
39
|
* We can use the `preNavigationHooks` to adjust `gotOptions`:
|
|
40
40
|
*
|
|
@@ -49,15 +49,15 @@ const WritableStream_1 = require("htmlparser2/lib/WritableStream");
|
|
|
49
49
|
* By default, `CheerioCrawler` only processes web pages with the `text/html`
|
|
50
50
|
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
51
51
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
52
|
-
* use the {@apilink CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
52
|
+
* use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
53
53
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
54
|
-
* For more details, see {@apilink CheerioCrawlerOptions.requestHandler}.
|
|
54
|
+
* For more details, see {@link CheerioCrawlerOptions.requestHandler}.
|
|
55
55
|
*
|
|
56
56
|
* New requests are only dispatched when there is enough free CPU and memory available,
|
|
57
|
-
* using the functionality provided by the {@apilink AutoscaledPool} class.
|
|
58
|
-
* All {@apilink AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
57
|
+
* using the functionality provided by the {@link AutoscaledPool} class.
|
|
58
|
+
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
59
59
|
* parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
60
|
-
* {@apilink AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
60
|
+
* {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
61
61
|
*
|
|
62
62
|
* **Example usage:**
|
|
63
63
|
*
|
|
@@ -174,9 +174,9 @@ async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, robotsTxtF
|
|
|
174
174
|
});
|
|
175
175
|
}
|
|
176
176
|
/**
|
|
177
|
-
* Creates new {@apilink Router} instance that works based on request labels.
|
|
178
|
-
* This instance can then serve as a `requestHandler` of your {@apilink CheerioCrawler}.
|
|
179
|
-
* Defaults to the {@apilink CheerioCrawlingContext}.
|
|
177
|
+
* Creates new {@link Router} instance that works based on request labels.
|
|
178
|
+
* This instance can then serve as a `requestHandler` of your {@link CheerioCrawler}.
|
|
179
|
+
* Defaults to the {@link CheerioCrawlingContext}.
|
|
180
180
|
*
|
|
181
181
|
* > Serves as a shortcut for using `Router.create<CheerioCrawlingContext>()`.
|
|
182
182
|
*
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/cheerio",
|
|
3
|
-
"version": "3.13.3-beta.8",
|
|
3
|
+
"version": "3.13.3",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -53,9 +53,9 @@
|
|
|
53
53
|
"access": "public"
|
|
54
54
|
},
|
|
55
55
|
"dependencies": {
|
|
56
|
-
"@crawlee/http": "3.13.3-beta.8",
|
|
57
|
-
"@crawlee/types": "3.13.3-beta.8",
|
|
58
|
-
"@crawlee/utils": "3.13.3-beta.8",
|
|
56
|
+
"@crawlee/http": "3.13.3",
|
|
57
|
+
"@crawlee/types": "3.13.3",
|
|
58
|
+
"@crawlee/utils": "3.13.3",
|
|
59
59
|
"cheerio": "1.0.0-rc.12",
|
|
60
60
|
"htmlparser2": "^9.0.0",
|
|
61
61
|
"tslib": "^2.4.0"
|
|
@@ -67,5 +67,5 @@
|
|
|
67
67
|
}
|
|
68
68
|
}
|
|
69
69
|
},
|
|
70
|
-
"gitHead": "
|
|
70
|
+
"gitHead": "279cadbd3cd6342f36cc4d841e07b999e472420d"
|
|
71
71
|
}
|