@crawlee/linkedom 3.13.1-beta.8 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -28,7 +28,7 @@ Crawlee is available as the [`crawlee`](https://www.npmjs.com/package/crawlee) N
|
|
|
28
28
|
|
|
29
29
|
## Installation
|
|
30
30
|
|
|
31
|
-
We recommend visiting the [Introduction tutorial](https://crawlee.dev/docs/introduction) in Crawlee documentation for more information.
|
|
31
|
+
We recommend visiting the [Introduction tutorial](https://crawlee.dev/js/docs/introduction) in Crawlee documentation for more information.
|
|
32
32
|
|
|
33
33
|
> Crawlee requires **Node.js 16 or higher**.
|
|
34
34
|
|
|
@@ -78,7 +78,7 @@ const crawler = new PlaywrightCrawler({
|
|
|
78
78
|
await crawler.run(['https://crawlee.dev']);
|
|
79
79
|
```
|
|
80
80
|
|
|
81
|
-
By default, Crawlee stores data to `./storage` in the current working directory. You can override this directory via Crawlee configuration. For details, see [Configuration guide](https://crawlee.dev/docs/guides/configuration), [Request storage](https://crawlee.dev/docs/guides/request-storage) and [Result storage](https://crawlee.dev/docs/guides/result-storage).
|
|
81
|
+
By default, Crawlee stores data to `./storage` in the current working directory. You can override this directory via Crawlee configuration. For details, see [Configuration guide](https://crawlee.dev/js/docs/guides/configuration), [Request storage](https://crawlee.dev/js/docs/guides/request-storage) and [Result storage](https://crawlee.dev/js/docs/guides/result-storage).
|
|
82
82
|
|
|
83
83
|
### Installing pre-release versions
|
|
84
84
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import type { IncomingMessage } from 'http';
|
|
2
|
-
import type {
|
|
1
|
+
import type { IncomingMessage } from 'node:http';
|
|
2
|
+
import type { EnqueueLinksOptions, ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RequestProvider, RouterRoutes } from '@crawlee/http';
|
|
3
3
|
import { HttpCrawler } from '@crawlee/http';
|
|
4
4
|
import type { Dictionary } from '@crawlee/types';
|
|
5
|
-
import { type CheerioRoot } from '@crawlee/utils';
|
|
5
|
+
import { type CheerioRoot, type RobotsTxtFile } from '@crawlee/utils';
|
|
6
6
|
export type LinkeDOMErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
7
7
|
JSONData extends Dictionary = any> = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
|
|
8
8
|
export interface LinkeDOMCrawlerOptions<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
@@ -131,12 +131,13 @@ interface EnqueueLinksInternalOptions {
|
|
|
131
131
|
options?: LinkeDOMCrawlerEnqueueLinksOptions;
|
|
132
132
|
window: Window | null;
|
|
133
133
|
requestQueue: RequestProvider;
|
|
134
|
+
robotsTxtFile?: RobotsTxtFile;
|
|
134
135
|
originalRequestUrl: string;
|
|
135
136
|
finalRequestUrl?: string;
|
|
136
137
|
}
|
|
137
138
|
/** @internal */
|
|
138
139
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
139
|
-
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
|
|
140
|
+
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
|
|
140
141
|
/**
|
|
141
142
|
* Creates new {@apilink Router} instance that works based on request labels.
|
|
142
143
|
* This instance can then serve as a `requestHandler` of your {@apilink LinkeDOMCrawler}.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"linkedom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"linkedom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,OAAO,KAAK,EACR,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACf,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAS,MAAM,gBAAgB,CAAC;AAO7E,MAAM,MAAM,oBAAoB,CAC5B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,MAAM,WAAW,sBAAsB,CACnC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;CAAG;AAE5E,MAAM,WAAW,kCAAmC,SAAQ,IAAI,CAAC,mBAAmB,EAAE,MAAM,GAAG,cAAc,CAAC;CAAG;AAEjH,MAAM,MAAM,YAAY,CACpB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAElE,MAAM,WAAW,uBAAuB,CACpC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,EAAE,eAAe,CAAC;IACtE,MAAM,EAAE,MAAM,CAAC;IAMf,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,MAAM,sBAAsB,CAC9B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAEhE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEG;AAEH,qBAAa,eAAgB,SAAQ,WAAW,CAAC,uBAAuB,CAAC;IACrE,OAAO,CAAC,MAAM,CAAC,MAAM,CAAmB;cAEf,UAAU,CAC/B,QAAQ,EAAE,eAAe,EACzB,KAAK,EAAE,OAAO,EACd,eAAe,EAAE,uBAAuB;;;2BAaF,QAAQ;wCAEJ,kCAAkC;;IAajE,kBAAkB,CAAC,OAAO,EAAE,uBAAuB;CA0BrE;AAED,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,kCAAkC,CAAC;IAC7C,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,Y
AAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,wBAAsB,2BAA2B,CAAC,EAC9C,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,kBAAkB,EAClB,eAAe,GAClB,EAAE,2BAA2B,4DAyB7B;AAmBD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,oBAAoB,CAChC,OAAO,SAAS,uBAAuB,GAAG,uBAAuB,EACjE,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,kDAEzC"}
|
|
@@ -4,12 +4,12 @@ exports.LinkeDOMCrawler = void 0;
|
|
|
4
4
|
exports.linkedomCrawlerEnqueueLinks = linkedomCrawlerEnqueueLinks;
|
|
5
5
|
exports.createLinkeDOMRouter = createLinkeDOMRouter;
|
|
6
6
|
const tslib_1 = require("tslib");
|
|
7
|
-
const utilities_1 = require("@apify/utilities");
|
|
8
7
|
const http_1 = require("@crawlee/http");
|
|
9
8
|
const utils_1 = require("@crawlee/utils");
|
|
10
9
|
const cheerio = tslib_1.__importStar(require("cheerio"));
|
|
11
10
|
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
|
|
12
11
|
const cached_1 = require("linkedom/cached");
|
|
12
|
+
const utilities_1 = require("@apify/utilities");
|
|
13
13
|
/**
|
|
14
14
|
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
|
|
15
15
|
* [linkedom](https://www.npmjs.com/package/linkedom) LinkeDOM implementation.
|
|
@@ -98,6 +98,7 @@ class LinkeDOMCrawler extends http_1.HttpCrawler {
|
|
|
98
98
|
options: enqueueOptions,
|
|
99
99
|
window: document.defaultView,
|
|
100
100
|
requestQueue: await this.getRequestQueue(),
|
|
101
|
+
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
|
|
101
102
|
originalRequestUrl: crawlingContext.request.url,
|
|
102
103
|
finalRequestUrl: crawlingContext.request.loadedUrl,
|
|
103
104
|
});
|
|
@@ -110,7 +111,8 @@ class LinkeDOMCrawler extends http_1.HttpCrawler {
|
|
|
110
111
|
if ($(selector).get().length === 0) {
|
|
111
112
|
if (timeoutMs) {
|
|
112
113
|
await (0, utils_1.sleep)(50);
|
|
113
|
-
|
|
114
|
+
await context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
|
|
115
|
+
return;
|
|
114
116
|
}
|
|
115
117
|
throw new Error(`Selector '${selector}' not found.`);
|
|
116
118
|
}
|
|
@@ -133,7 +135,7 @@ Object.defineProperty(LinkeDOMCrawler, "parser", {
|
|
|
133
135
|
value: new cached_1.DOMParser()
|
|
134
136
|
});
|
|
135
137
|
/** @internal */
|
|
136
|
-
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl, }) {
|
|
138
|
+
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, originalRequestUrl, finalRequestUrl, }) {
|
|
137
139
|
if (!window) {
|
|
138
140
|
throw new Error('Cannot enqueue links because the DOM is not available.');
|
|
139
141
|
}
|
|
@@ -146,6 +148,7 @@ async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, orig
|
|
|
146
148
|
const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
|
|
147
149
|
return (0, http_1.enqueueLinks)({
|
|
148
150
|
requestQueue,
|
|
151
|
+
robotsTxtFile,
|
|
149
152
|
urls,
|
|
150
153
|
baseUrl,
|
|
151
154
|
...options,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"linkedom-crawler.js","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"linkedom-crawler.js","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":";;;AA2OA,kEAgCC;AA2CD,oDAKC;;AA9SD,wCAMuB;AAEvB,0CAA6E;AAC7E,yDAAmC;AACnC,mHAAmH;AACnH,4CAA4C;AAE5C,gDAAwD;AAkExD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEG;AAEH,MAAa,eAAgB,SAAQ,kBAAoC;IAGlD,KAAK,CAAC,UAAU,CAC/B,QAAyB,EACzB,KAAc,EACd,eAAwC;QAExC,MAAM,IAAI,GAAG,MAAM,IAAA,gCAAoB,EAAC,QAAQ,CAAC,CAAC;QAElD,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAE3G,OAAO;YACH,MAAM,EAAE,QAAQ,CAAC,WAAW;YAC5B,IAAI,IAAI;gBACJ,OAAO,QAAQ,CAAC,eAAe,CAAC,SAAS,CAAC;YAC9C,CAAC;YACD,IAAI,QAAQ;gBACR,iEAAiE;gBACjE,OAAO,QAA+B,CAAC;YAC3C,CAAC;YACD,YAAY,EAAE,KAAK,EAAE,cAAmD,EAAE,EAAE;gBACxE,OAAO,2BAA2B,CAAC;oBAC/B,OAAO,EAAE,cAAc;oBACvB,MAAM,EAAE,QAAQ,CAAC,WAAW;oBAC5B,YAAY,EAAE,MAAM,IAAI,CAAC,eAAe,EAAE;oBAC1C,aAAa,EAAE,MAAM,IAAI,CAAC,sBAAsB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC;oBAC7E,kBAAkB,EAAE,eAAe,CAAC,OAAO,CAAC,GAAG;oBAC/C,eAAe,EAAE,eAAe,CAAC,OAAO,CAAC,SAAS;iBACrD,CAAC,CAAC;YACP,CAAC;SACJ,CAAC;IACN,CAAC;IAEQ,KAAK,CAAC,kBAAkB,CAAC,OAAgC;QAC9D,OAAO,CAAC,eAAe,GAAG,KAAK,EAAE,QAAgB,EAAE,SAAS,GAAG,IAAK,EAAE,EAAE;YACpE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACjC,IAAI,SAAS,EAAE,CAAC;oBACZ,MAAM,IAAA,aAAK,EAAC,EAAE,CAAC,CAAC;oBAChB,MAAM,OAAO,CAAC,eAAe,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;oBACrE,OAAO;gBACX,CAAC;gBAED,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;QACL,CAAC,CAAC;QACF,OAAO,CAAC,gBAAgB,GAAG,KAAK,EAAE,QAAiB,EAAE,UAAU,GAAG,IAAK,EAAE,EAAE;YACvE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,QAAQ,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7C,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;YAED,OAAO,CAAC,CAAC;QACb,CAAC,CAAC;QAEF,MAAM,KAAK,CAAC,kBAAkB
,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;;AA3DL,0CA4DC;AA3DkB;;;;WAAS,IAAI,kBAAS,EAAE;GAAC;AAsE5C,gBAAgB;AACT,KAAK,UAAU,2BAA2B,CAAC,EAC9C,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,kBAAkB,EAClB,eAAe,GACW;IAC1B,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,OAAO,GAAG,IAAA,6CAAsC,EAAC;QACnD,eAAe,EAAE,OAAO,EAAE,QAAQ;QAClC,eAAe;QACf,kBAAkB;QAClB,mBAAmB,EAAE,OAAO,EAAE,OAAO;KACxC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,qBAAqB,CAC9B,MAAM,EACN,OAAO,EAAE,QAAQ,IAAI,GAAG,EACxB,OAAO,EAAE,OAAO,IAAI,eAAe,IAAI,kBAAkB,CAC5D,CAAC;IAEF,OAAO,IAAA,mBAAY,EAAC;QAChB,YAAY;QACZ,aAAa;QACb,IAAI;QACJ,OAAO;QACP,GAAG,OAAO;KACb,CAAC,CAAC;AACP,CAAC;AAED;;;GAGG;AACH,SAAS,qBAAqB,CAAC,MAAc,EAAE,QAAgB,EAAE,OAAe;IAC5E,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;SACxD,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SACvB,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAC;SACnD,GAAG,CAAC,CAAC,IAAwB,EAAE,EAAE;QAC9B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACrB,OAAO,SAAS,CAAC;QACrB,CAAC;QACD,OAAO,IAAA,qBAAc,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAa,CAAC;AACzE,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,SAAgB,oBAAoB,CAGlC,MAAwC;IACtC,OAAO,aAAM,CAAC,MAAM,CAAU,MAAM,CAAC,CAAC;AAC1C,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/linkedom",
|
|
3
|
-
"version": "3.13.1-beta.8",
|
|
3
|
+
"version": "3.13.1",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -55,8 +55,8 @@
|
|
|
55
55
|
"dependencies": {
|
|
56
56
|
"@apify/timeout": "^0.3.0",
|
|
57
57
|
"@apify/utilities": "^2.7.10",
|
|
58
|
-
"@crawlee/http": "3.13.1-beta.8",
|
|
59
|
-
"@crawlee/types": "3.13.1-beta.8",
|
|
58
|
+
"@crawlee/http": "3.13.1",
|
|
59
|
+
"@crawlee/types": "3.13.1",
|
|
60
60
|
"linkedom": "^0.18.0",
|
|
61
61
|
"ow": "^0.28.2",
|
|
62
62
|
"tslib": "^2.4.0"
|
|
@@ -68,5 +68,5 @@
|
|
|
68
68
|
}
|
|
69
69
|
}
|
|
70
70
|
},
|
|
71
|
-
"gitHead": "
|
|
71
|
+
"gitHead": "99af95e0dda511718b45cd41452589260c69909a"
|
|
72
72
|
}
|