@crawlee/jsdom 3.13.6-beta.1 → 4.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +2 -5
- package/index.js.map +1 -1
- package/internals/jsdom-crawler.d.ts +0 -8
- package/internals/jsdom-crawler.d.ts.map +1 -1
- package/internals/jsdom-crawler.js +32 -64
- package/internals/jsdom-crawler.js.map +1 -1
- package/package.json +16 -22
- package/tsconfig.build.tsbuildinfo +1 -1
- package/index.mjs +0 -100
package/index.d.ts
CHANGED
package/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}
|
package/index.js
CHANGED
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
const tslib_1 = require("tslib");
|
|
4
|
-
tslib_1.__exportStar(require("@crawlee/http"), exports);
|
|
5
|
-
tslib_1.__exportStar(require("./internals/jsdom-crawler"), exports);
|
|
1
|
+
export * from '@crawlee/http';
|
|
2
|
+
export * from './internals/jsdom-crawler.js';
|
|
6
3
|
//# sourceMappingURL=index.js.map
|
package/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}
|
|
@@ -60,8 +60,6 @@ export declare class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
|
|
|
60
60
|
runScripts: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
61
61
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
62
62
|
hideInternalConsole: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
63
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
64
|
-
handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
65
63
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
66
64
|
navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
67
65
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -90,18 +88,12 @@ export declare class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
|
|
|
90
88
|
requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
91
89
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
92
90
|
requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
93
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
94
|
-
handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
95
91
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
96
92
|
requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
97
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
98
|
-
handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
99
93
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
100
94
|
errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
101
95
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
102
96
|
failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
103
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
104
|
-
handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
105
97
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
106
98
|
maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
107
99
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsdom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/jsdom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,OAAO,KAAK,EACR,aAAa,EACb,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAS,MAAM,gBAAgB,CAAC;AAE7E,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC;AACvC,OAAO,EAAyB,cAAc,EAAE,MAAM,OAAO,CAAC;AAM9D,MAAM,MAAM,iBAAiB,CACzB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE3D,MAAM,WAAW,mBAAmB,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAClE;;OAEG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB;;OAEG;IACH,mBAAmB,CAAC,EAAE,OAAO,CAAC;CACjC;AAED,MAAM,MAAM,SAAS,CACjB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE/D,MAAM,WAAW,oBAAoB,CACjC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,EAAE,YAAY,CAAC;IACnE,MAAM,EAAE,SAAS,CAAC;IAClB,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,MAAM,mBAAmB,CAC3B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAkF7D,qBAAa,YAAa,SAAQ,WAAW,CAAC,oBAAoB,CAAC;IAC/D,iBAA0B,YAAY
|
|
1
|
+
{"version":3,"file":"jsdom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/jsdom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,OAAO,KAAK,EACR,aAAa,EACb,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACZ,sBAAsB,EACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAS,MAAM,gBAAgB,CAAC;AAE7E,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC;AACvC,OAAO,EAAyB,cAAc,EAAE,MAAM,OAAO,CAAC;AAM9D,MAAM,MAAM,iBAAiB,CACzB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE3D,MAAM,WAAW,mBAAmB,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAClE;;OAEG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB;;OAEG;IACH,mBAAmB,CAAC,EAAE,OAAO,CAAC;CACjC;AAED,MAAM,MAAM,SAAS,CACjB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE/D,MAAM,WAAW,oBAAoB,CACjC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,EAAE,YAAY,CAAC;IACnE,MAAM,EAAE,SAAS,CAAC;IAClB,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,MAAM,mBAAmB,CAC3B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAkF7D,qBAAa,YAAa,SAAQ,WAAW,CAAC,oBAAoB,CAAC;IAC/D,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAIpC;IAEF,SAAS,CAAC,UAAU,EAAE,OAAO,CAAC;IAC9B,SAAS,CAAC,mBAAmB,EAAE,OAAO,CAAC;IACvC,SAAS,CAAC,cAAc,EAAE,cAAc,GAAG,IAAI,CAAQ;gBAE3C,OAAO,GAAE,mBAAwB,EAAE,MAAM,CAAC,EAAE,aAAa;IASrE;;;;;;;;;;;;;OAaG;IACH,iBAAiB;IAgBjB,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAuE;cAEhF,eAAe,CAAC,OAAO,EAAE,oBAAoB;cAK7C,UAAU,CAC/B,QAAQ,EAAE,eAAe,EACzB,KAAK,EAAE,OAAO,EACd,eAAe,EAAE,oBAAoB;;;;wCAgEK,mBAAmB;;IAclD,kBAAkB,CAAC,OAAO,EAAE,oBAAoB;CA0BlE;AAED,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,mBAAmB,CAAC;IAC9B,MAAM,EAAE,SAAS,GAAG,IAAI,CAAC;IACzB,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,wBAAsB,sBAAsB,CAAC,EACzC,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,eAAe,GAClB,EAAE,2BAA2B,4DA0B7B;AAmBD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,iBAAiB,CAC7B,OAAO,SAAS,oBAAoB,GAAG,oBAAoB,EAC3D,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,kDAEzC"}
|
|
@@ -1,16 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
const utils_1 = require("@crawlee/utils");
|
|
9
|
-
const cheerio = tslib_1.__importStar(require("cheerio"));
|
|
10
|
-
const jsdom_1 = require("jsdom");
|
|
11
|
-
const ow_1 = tslib_1.__importDefault(require("ow"));
|
|
12
|
-
const timeout_1 = require("@apify/timeout");
|
|
13
|
-
const utilities_1 = require("@apify/utilities");
|
|
1
|
+
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router, tryAbsoluteURL, } from '@crawlee/http';
|
|
2
|
+
import { sleep } from '@crawlee/utils';
|
|
3
|
+
import * as cheerio from 'cheerio';
|
|
4
|
+
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
|
|
5
|
+
import ow from 'ow';
|
|
6
|
+
import { addTimeoutToPromise } from '@apify/timeout';
|
|
7
|
+
import { concatStreamToBuffer } from '@apify/utilities';
|
|
14
8
|
/**
|
|
15
9
|
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
|
|
16
10
|
* [jsdom](https://www.npmjs.com/package/jsdom) JSDOM implementation.
|
|
@@ -84,39 +78,23 @@ const utilities_1 = require("@apify/utilities");
|
|
|
84
78
|
* ```
|
|
85
79
|
* @category Crawlers
|
|
86
80
|
*/
|
|
87
|
-
const resources = new
|
|
81
|
+
const resources = new ResourceLoader({
|
|
88
82
|
// Copy from /packages/browser-pool/src/abstract-classes/browser-plugin.ts:17
|
|
89
83
|
// in order not to include the entire package here
|
|
90
84
|
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
|
|
91
85
|
});
|
|
92
|
-
class JSDOMCrawler extends
|
|
86
|
+
export class JSDOMCrawler extends HttpCrawler {
|
|
87
|
+
static optionsShape = {
|
|
88
|
+
...HttpCrawler.optionsShape,
|
|
89
|
+
runScripts: ow.optional.boolean,
|
|
90
|
+
hideInternalConsole: ow.optional.boolean,
|
|
91
|
+
};
|
|
92
|
+
runScripts;
|
|
93
|
+
hideInternalConsole;
|
|
94
|
+
virtualConsole = null;
|
|
93
95
|
constructor(options = {}, config) {
|
|
94
96
|
const { runScripts = false, hideInternalConsole = false, ...httpOptions } = options;
|
|
95
97
|
super(httpOptions, config);
|
|
96
|
-
Object.defineProperty(this, "runScripts", {
|
|
97
|
-
enumerable: true,
|
|
98
|
-
configurable: true,
|
|
99
|
-
writable: true,
|
|
100
|
-
value: void 0
|
|
101
|
-
});
|
|
102
|
-
Object.defineProperty(this, "hideInternalConsole", {
|
|
103
|
-
enumerable: true,
|
|
104
|
-
configurable: true,
|
|
105
|
-
writable: true,
|
|
106
|
-
value: void 0
|
|
107
|
-
});
|
|
108
|
-
Object.defineProperty(this, "virtualConsole", {
|
|
109
|
-
enumerable: true,
|
|
110
|
-
configurable: true,
|
|
111
|
-
writable: true,
|
|
112
|
-
value: null
|
|
113
|
-
});
|
|
114
|
-
Object.defineProperty(this, "jsdomErrorHandler", {
|
|
115
|
-
enumerable: true,
|
|
116
|
-
configurable: true,
|
|
117
|
-
writable: true,
|
|
118
|
-
value: (error) => this.log.debug('JSDOM error from console', error)
|
|
119
|
-
});
|
|
120
98
|
this.runScripts = runScripts;
|
|
121
99
|
this.hideInternalConsole = hideInternalConsole;
|
|
122
100
|
}
|
|
@@ -138,20 +116,21 @@ class JSDOMCrawler extends http_1.HttpCrawler {
|
|
|
138
116
|
if (this.virtualConsole) {
|
|
139
117
|
return this.virtualConsole;
|
|
140
118
|
}
|
|
141
|
-
this.virtualConsole = new
|
|
119
|
+
this.virtualConsole = new VirtualConsole();
|
|
142
120
|
if (!this.hideInternalConsole) {
|
|
143
121
|
this.virtualConsole.sendTo(console, { omitJSDOMErrors: true });
|
|
144
122
|
}
|
|
145
123
|
this.virtualConsole.on('jsdomError', this.jsdomErrorHandler);
|
|
146
124
|
return this.virtualConsole;
|
|
147
125
|
}
|
|
126
|
+
jsdomErrorHandler = (error) => this.log.debug('JSDOM error from console', error);
|
|
148
127
|
async _cleanupContext(context) {
|
|
149
128
|
this.getVirtualConsole().off('jsdomError', this.jsdomErrorHandler);
|
|
150
129
|
context.window?.close();
|
|
151
130
|
}
|
|
152
131
|
async _parseHTML(response, isXml, crawlingContext) {
|
|
153
|
-
const body = await
|
|
154
|
-
const { window } = new
|
|
132
|
+
const body = await concatStreamToBuffer(response);
|
|
133
|
+
const { window } = new JSDOM(body, {
|
|
155
134
|
url: response.url,
|
|
156
135
|
contentType: isXml ? 'text/xml' : 'text/html',
|
|
157
136
|
runScripts: this.runScripts ? 'dangerously' : undefined,
|
|
@@ -181,13 +160,13 @@ class JSDOMCrawler extends http_1.HttpCrawler {
|
|
|
181
160
|
};
|
|
182
161
|
if (this.runScripts) {
|
|
183
162
|
try {
|
|
184
|
-
await
|
|
163
|
+
await addTimeoutToPromise(async () => {
|
|
185
164
|
return new Promise((resolve) => {
|
|
186
165
|
window.addEventListener('load', () => {
|
|
187
166
|
resolve();
|
|
188
167
|
}, false);
|
|
189
168
|
}).catch();
|
|
190
|
-
},
|
|
169
|
+
}, 10_000, 'Window.load event not fired after 10 seconds.').catch();
|
|
191
170
|
}
|
|
192
171
|
catch (e) {
|
|
193
172
|
this.log.debug(e.message);
|
|
@@ -215,18 +194,18 @@ class JSDOMCrawler extends http_1.HttpCrawler {
|
|
|
215
194
|
};
|
|
216
195
|
}
|
|
217
196
|
async _runRequestHandler(context) {
|
|
218
|
-
context.waitForSelector = async (selector, timeoutMs =
|
|
197
|
+
context.waitForSelector = async (selector, timeoutMs = 5_000) => {
|
|
219
198
|
const $ = cheerio.load(context.body);
|
|
220
199
|
if ($(selector).get().length === 0) {
|
|
221
200
|
if (timeoutMs) {
|
|
222
|
-
await
|
|
201
|
+
await sleep(50);
|
|
223
202
|
await context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
|
|
224
203
|
return;
|
|
225
204
|
}
|
|
226
205
|
throw new Error(`Selector '${selector}' not found.`);
|
|
227
206
|
}
|
|
228
207
|
};
|
|
229
|
-
context.parseWithCheerio = async (selector, _timeoutMs =
|
|
208
|
+
context.parseWithCheerio = async (selector, _timeoutMs = 5_000) => {
|
|
230
209
|
const $ = cheerio.load(context.body);
|
|
231
210
|
if (selector && $(selector).get().length === 0) {
|
|
232
211
|
throw new Error(`Selector '${selector}' not found.`);
|
|
@@ -236,30 +215,19 @@ class JSDOMCrawler extends http_1.HttpCrawler {
|
|
|
236
215
|
await super._runRequestHandler(context);
|
|
237
216
|
}
|
|
238
217
|
}
|
|
239
|
-
exports.JSDOMCrawler = JSDOMCrawler;
|
|
240
|
-
Object.defineProperty(JSDOMCrawler, "optionsShape", {
|
|
241
|
-
enumerable: true,
|
|
242
|
-
configurable: true,
|
|
243
|
-
writable: true,
|
|
244
|
-
value: {
|
|
245
|
-
...http_1.HttpCrawler.optionsShape,
|
|
246
|
-
runScripts: ow_1.default.optional.boolean,
|
|
247
|
-
hideInternalConsole: ow_1.default.optional.boolean,
|
|
248
|
-
}
|
|
249
|
-
});
|
|
250
218
|
/** @internal */
|
|
251
|
-
async function domCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }) {
|
|
219
|
+
export async function domCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }) {
|
|
252
220
|
if (!window) {
|
|
253
221
|
throw new Error('Cannot enqueue links because the JSDOM is not available.');
|
|
254
222
|
}
|
|
255
|
-
const baseUrl =
|
|
223
|
+
const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
|
|
256
224
|
enqueueStrategy: options?.strategy,
|
|
257
225
|
finalRequestUrl,
|
|
258
226
|
originalRequestUrl,
|
|
259
227
|
userProvidedBaseUrl: options?.baseUrl,
|
|
260
228
|
});
|
|
261
229
|
const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
|
|
262
|
-
return
|
|
230
|
+
return enqueueLinks({
|
|
263
231
|
requestQueue,
|
|
264
232
|
robotsTxtFile,
|
|
265
233
|
onSkippedRequest,
|
|
@@ -280,7 +248,7 @@ function extractUrlsFromWindow(window, selector, baseUrl) {
|
|
|
280
248
|
if (href === undefined) {
|
|
281
249
|
return undefined;
|
|
282
250
|
}
|
|
283
|
-
return
|
|
251
|
+
return tryAbsoluteURL(href, baseUrl);
|
|
284
252
|
})
|
|
285
253
|
.filter((href) => href !== undefined && href !== '');
|
|
286
254
|
}
|
|
@@ -308,7 +276,7 @@ function extractUrlsFromWindow(window, selector, baseUrl) {
|
|
|
308
276
|
* await crawler.run();
|
|
309
277
|
* ```
|
|
310
278
|
*/
|
|
311
|
-
function createJSDOMRouter(routes) {
|
|
312
|
-
return
|
|
279
|
+
export function createJSDOMRouter(routes) {
|
|
280
|
+
return Router.create(routes);
|
|
313
281
|
}
|
|
314
282
|
//# sourceMappingURL=jsdom-crawler.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jsdom-crawler.js","sourceRoot":"","sources":["../../src/internals/jsdom-crawler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"jsdom-crawler.js","sourceRoot":"","sources":["../../src/internals/jsdom-crawler.ts"],"names":[],"mappings":"AAeA,OAAO,EACH,YAAY,EACZ,WAAW,EACX,sCAAsC,EACtC,MAAM,EACN,cAAc,GACjB,MAAM,eAAe,CAAC;AAEvB,OAAO,EAAwC,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAC7E,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,OAAO,CAAC;AAC9D,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,OAAO,EAAE,mBAAmB,EAAE,MAAM,gBAAgB,CAAC;AACrD,OAAO,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AAoExD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwEG;AACH,MAAM,SAAS,GAAG,IAAI,cAAc,CAAC;IACjC,6EAA6E;IAC7E,kDAAkD;IAClD,SAAS,EACL,uHAAuH;CAC9H,CAAC,CAAC;AAEH,MAAM,OAAO,YAAa,SAAQ,WAAiC;IACrD,MAAM,CAAU,YAAY,GAAG;QACrC,GAAG,WAAW,CAAC,YAAY;QAC3B,UAAU,EAAE,EAAE,CAAC,QAAQ,CAAC,OAAO;QAC/B,mBAAmB,EAAE,EAAE,CAAC,QAAQ,CAAC,OAAO;KAC3C,CAAC;IAEQ,UAAU,CAAU;IACpB,mBAAmB,CAAU;IAC7B,cAAc,GAA0B,IAAI,CAAC;IAEvD,YAAY,UAA+B,EAAE,EAAE,MAAsB;QACjE,MAAM,EAAE,UAAU,GAAG,KAAK,EAAE,mBAAmB,GAAG,KAAK,EAAE,GAAG,WAAW,EAAE,GAAG,OAAO,CAAC;QAEpF,KAAK,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;QAE3B,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,mBAAmB,GAAG,mBAAmB,CAAC;IACnD,CAAC;IAED;;;;;;;;;;;;;OAaG;IACH,iBAAiB;QACb,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACtB,OAAO,IAAI,CAAC,cAAc,CAAC;QAC/B,CAAC;QAED,IAAI,CAAC,cAAc,GAAG,IAAI,cAAc,EAAE,CAAC;QAE3C,IAAI,CAAC,IAAI,CAAC,mBAAmB,EAAE,CAAC;YAC5B,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,OAAO,EAAE,EAAE,eAAe,EAAE,IAAI,EAAE,CAAC,CAAC;QACnE,CAAC;QAED,IAAI,CAAC,cAAc,CAAC,EAAE,CAAC,YAAY,EAAE,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAE7D,OAAO,IAAI,CAAC,cAAc,CAAC;IAC/B,CAAC;IAEgB,iBAAiB,GAAG,CAAC,KAAY,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,0BAA0B,EAAE,KAAK,CAAC,CAAC;IAEtF,KAAK,CAAC,eAAe,CAAC,OAA6B;QAClE,IAAI,CAAC,iBAAiB,EAAE,CAAC,GAAG,CAAC,YAAY,EAAE,IAAI,CAAC,iBAAiB,CAAC,CAAC;QACnE,OAAO,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC;IAC5B,CAAC;IAEkB,KAAK,CAAC,UAAU,CAC/B,QAAyB,EACzB,KAAc,EACd,eAAqC;QAErC,MAAM,IAAI,GAAG,MAAM,oBAAoB,CAAC,QAAQ,CAAC,CAAC;QAElD,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;YAC/B,GAAG,EAAE,QAAQ,CAAC,GAAG;YACjB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,WAAW;YAC7C,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,SAAS;YACvD,SAAS;YACT,cAAc,EAAE,IAAI,CAAC,iBAAiB,EAAE;YACxC,iBAAiB,EAAE,IAAI;SAC1B,CAAC,CAAC;QAEH,kEAAkE;QAClE,MAAM,CAAC,cAAc,CAAC,MAAM,EAAE,YAAY,EAAE;YACxC,QAAQ,EAAE,IAAI;YACd,KAAK,EAAE,CAAC,KAAc,EAAO,EAAE,CAAC,CAAC;gBAC7B,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,KAAK;gBACZ,QAAQ,EAAE,IAAI;gBACd,WAAW,EAAE,GAAG,EAAE,GAAE,CAAC;gBACrB,cAAc,EAAE,GAAG,EAAE,GAAE,CAAC;gBACxB,gBAAgB,EAAE,GAAG,EAAE,GAAE,CAAC;gBAC1B,mBAAmB,EAAE,GAAG,EAAE,GAAE,CAAC;gBAC7B,aAAa,EAAE,GAAG,EAAE,GAAE,CAAC;aAC1B,CAAC;SACL,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,WAAW,GAAG,GAAG,EAAE;YAC/B,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YACjC,KAAK,CAAC,qBAAqB,GAAG,GAAG,EAAE,CAAC,CAAC,EAAE,CAAQ,CAAC;YAChD,KAAK,CAAC,cAAc,GAAG,GAAG,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC,IAAW,EAAE,MAAM,EAAE,CAAC,EAAE,CAAQ,CAAC;YAC7E,OAAO,KAAK,CAAC;QACjB,CAAC,CAAC;QAEF,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YAClB,IAAI,CAAC;gBACD,MAAM,mBAAmB,CACrB,KAAK,IAAI,EAAE;oBACP,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;wBACjC,MAAM,CAAC,gBAAgB,CACnB,MAAM,EACN,GAAG,EAAE;4BACD,OAAO,EAAE,CAAC;wBACd,CAAC,EACD,KAAK,CACR,CAAC;oBACN,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC;gBACf,CAAC,EACD,MAAM,EACN,+CAA+C,CAClD,CAAC,KAAK,EAAE,CAAC;YACd,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACT,IAAI,CAAC,GAAG,CAAC,KAAK,CAAE,CAAW,CAAC,OAAO,CAAC,CAAC;YACzC,CAAC;QACL,CAAC;QAED,OAAO;YACH,MAAM;YACN,IAAI,IAAI;gBACJ,OAAO,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,SAAS,CAAC;YACrD,CAAC;YACD,IAAI,QAAQ;gBACR,OAAO,MAAM,CAAC,QAAQ,CAAC;YAC3B,CAAC;YACD,YAAY,EAAE,KAAK,EAAE,cAAoC,EAAE,EAAE;gBACzD,OAAO,sBAAsB,CAAC;oBAC1B,OAAO,EAAE,cAAc;oBACvB,MAAM;oBACN,YAAY,EAAE,MAAM,IAAI,CAAC,eAAe,EAAE;oBAC1C,aAAa,EAAE,MAAM,IAAI,CAAC,sBAAsB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC;oBAC7E,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;oBACvC,kBAAkB,EAAE,eAAe,CAAC,OAAO,CAAC,GAAG;oBAC/C,eAAe,EAAE,eAAe,CAAC,OAAO,CAAC,SAAS;iBACrD,CAAC,CAAC;YACP,CAAC;SACJ,CAAC;IACN,CAAC;IAEQ,KAAK,CAAC,kBAAkB,CAAC,OAA6B;QAC3D,OAAO,CAAC,eAAe,GAAG,KAAK,EAAE,QAAgB,EAAE,SAAS,GAAG,KAAK,EAAE,EAAE;YACpE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACjC,IAAI,SAAS,EAAE,CAAC;oBACZ,MAAM,KAAK,CAAC,EAAE,CAAC,CAAC;oBAChB,MAAM,OAAO,CAAC,eAAe,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;oBACrE,OAAO;gBACX,CAAC;gBAED,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;QACL,CAAC,CAAC;QACF,OAAO,CAAC,gBAAgB,GAAG,KAAK,EAAE,QAAiB,EAAE,UAAU,GAAG,KAAK,EAAE,EAAE;YACvE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,QAAQ,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7C,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;YAED,OAAO,CAAC,CAAC;QACb,CAAC,CAAC;QAEF,MAAM,KAAK,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;;AAaL,gBAAgB;AAChB,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAAC,EACzC,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,eAAe,GACW;IAC1B,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,MAAM,IAAI,KAAK,CAAC,0DAA0D,CAAC,CAAC;IAChF,CAAC;IAED,MAAM,OAAO,GAAG,sCAAsC,CAAC;QACnD,eAAe,EAAE,OAAO,EAAE,QAAQ;QAClC,eAAe;QACf,kBAAkB;QAClB,mBAAmB,EAAE,OAAO,EAAE,OAAO;KACxC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,qBAAqB,CAC9B,MAAM,EACN,OAAO,EAAE,QAAQ,IAAI,GAAG,EACxB,OAAO,EAAE,OAAO,IAAI,eAAe,IAAI,kBAAkB,CAC5D,CAAC;IAEF,OAAO,YAAY,CAAC;QAChB,YAAY;QACZ,aAAa;QACb,gBAAgB;QAChB,IAAI;QACJ,OAAO;QACP,GAAG,OAAO;KACb,CAAC,CAAC;AACP,CAAC;AAED;;;GAGG;AACH,SAAS,qBAAqB,CAAC,MAAiB,EAAE,QAAgB,EAAE,OAAe;IAC/E,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;SACxD,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SACvB,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAC;SACnD,GAAG,CAAC,CAAC,IAAwB,EAAE,EAAE;QAC9B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACrB,OAAO,SAAS,CAAC;QACrB,CAAC;QACD,OAAO,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAa,CAAC;AACzE,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,UAAU,iBAAiB,CAG/B,MAAwC;IACtC,OAAO,MAAM,CAAC,MAAM,CAAU,MAAM,CAAC,CAAC;AAC1C,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,19 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/jsdom",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.0.0-beta.1",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
|
-
"node": ">=
|
|
6
|
+
"node": ">=22.0.0"
|
|
7
7
|
},
|
|
8
|
-
"
|
|
9
|
-
"module": "./index.mjs",
|
|
10
|
-
"types": "./index.d.ts",
|
|
8
|
+
"type": "module",
|
|
11
9
|
"exports": {
|
|
12
|
-
".":
|
|
13
|
-
"import": "./index.mjs",
|
|
14
|
-
"require": "./index.js",
|
|
15
|
-
"types": "./index.d.ts"
|
|
16
|
-
},
|
|
10
|
+
".": "./index.js",
|
|
17
11
|
"./package.json": "./package.json"
|
|
18
12
|
},
|
|
19
13
|
"keywords": [
|
|
@@ -46,23 +40,23 @@
|
|
|
46
40
|
"scripts": {
|
|
47
41
|
"build": "yarn clean && yarn compile && yarn copy",
|
|
48
42
|
"clean": "rimraf ./dist",
|
|
49
|
-
"compile": "tsc -p tsconfig.build.json
|
|
43
|
+
"compile": "tsc -p tsconfig.build.json",
|
|
50
44
|
"copy": "tsx ../../scripts/copy.ts"
|
|
51
45
|
},
|
|
52
46
|
"publishConfig": {
|
|
53
47
|
"access": "public"
|
|
54
48
|
},
|
|
55
49
|
"dependencies": {
|
|
56
|
-
"@apify/timeout": "^0.3.
|
|
57
|
-
"@apify/utilities": "^2.
|
|
58
|
-
"@crawlee/http": "
|
|
59
|
-
"@crawlee/types": "
|
|
60
|
-
"@crawlee/utils": "
|
|
61
|
-
"@types/jsdom": "^21.
|
|
62
|
-
"cheerio": "1.0.0
|
|
63
|
-
"jsdom": "^26.
|
|
64
|
-
"ow": "^0.
|
|
65
|
-
"tslib": "^2.
|
|
50
|
+
"@apify/timeout": "^0.3.2",
|
|
51
|
+
"@apify/utilities": "^2.15.5",
|
|
52
|
+
"@crawlee/http": "4.0.0-beta.1",
|
|
53
|
+
"@crawlee/types": "4.0.0-beta.1",
|
|
54
|
+
"@crawlee/utils": "4.0.0-beta.1",
|
|
55
|
+
"@types/jsdom": "^21.1.7",
|
|
56
|
+
"cheerio": "^1.0.0",
|
|
57
|
+
"jsdom": "^26.1.0",
|
|
58
|
+
"ow": "^2.0.0",
|
|
59
|
+
"tslib": "^2.8.1"
|
|
66
60
|
},
|
|
67
61
|
"lerna": {
|
|
68
62
|
"command": {
|
|
@@ -71,5 +65,5 @@
|
|
|
71
65
|
}
|
|
72
66
|
}
|
|
73
67
|
},
|
|
74
|
-
"gitHead": "
|
|
68
|
+
"gitHead": "4375a5af786811feda9dc872bbbb3406b2f13974"
|
|
75
69
|
}
|