@crawlee/http 4.0.0-beta.4 → 4.0.0-beta.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/file-download.d.ts +58 -32
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +116 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +92 -175
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +169 -321
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +7 -7
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Readable } from 'node:stream';
|
|
2
2
|
import util from 'node:util';
|
|
3
|
-
import {
|
|
3
|
+
import { BasicCrawler, ContextPipeline, mergeCookies, RequestState, Router, SessionError } from '@crawlee/basic';
|
|
4
|
+
import { ResponseWithUrl } from '@crawlee/http-client';
|
|
4
5
|
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
|
|
5
6
|
import * as cheerio from 'cheerio';
|
|
6
7
|
import contentTypeParser from 'content-type';
|
|
7
8
|
import iconv from 'iconv-lite';
|
|
8
|
-
import
|
|
9
|
-
import ow, { ObjectPredicate } from 'ow';
|
|
9
|
+
import ow from 'ow';
|
|
10
10
|
import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
|
|
11
|
-
import {
|
|
12
|
-
let TimeoutError;
|
|
11
|
+
import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
|
|
13
12
|
/**
|
|
14
13
|
* Default mime types, which HttpScraper supports.
|
|
15
14
|
*/
|
|
@@ -46,18 +45,18 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
46
45
|
*
|
|
47
46
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
48
47
|
*
|
|
49
|
-
* We can use the `preNavigationHooks` to adjust
|
|
48
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
50
49
|
*
|
|
51
50
|
* ```javascript
|
|
52
51
|
* preNavigationHooks: [
|
|
53
|
-
* (crawlingContext
|
|
52
|
+
* (crawlingContext) => {
|
|
54
53
|
* // ...
|
|
55
54
|
* },
|
|
56
55
|
* ]
|
|
57
56
|
* ```
|
|
58
57
|
*
|
|
59
|
-
* By default, this crawler only processes web pages with the `text/html`
|
|
60
|
-
* and `application/
|
|
58
|
+
* By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
59
|
+
* and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
61
60
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
62
61
|
* use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
|
|
63
62
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
@@ -93,13 +92,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
93
92
|
* @category Crawlers
|
|
94
93
|
*/
|
|
95
94
|
export class HttpCrawler extends BasicCrawler {
|
|
96
|
-
config;
|
|
97
|
-
/**
|
|
98
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
99
|
-
* Only available if used by the crawler.
|
|
100
|
-
*/
|
|
101
|
-
proxyConfiguration;
|
|
102
|
-
userRequestHandlerTimeoutMillis;
|
|
103
95
|
preNavigationHooks;
|
|
104
96
|
postNavigationHooks;
|
|
105
97
|
persistCookiesPerSession;
|
|
@@ -107,8 +99,6 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
107
99
|
ignoreSslErrors;
|
|
108
100
|
suggestResponseEncoding;
|
|
109
101
|
forceResponseEncoding;
|
|
110
|
-
additionalHttpErrorStatusCodes;
|
|
111
|
-
ignoreHttpErrorStatusCodes;
|
|
112
102
|
supportedMimeTypes;
|
|
113
103
|
static optionsShape = {
|
|
114
104
|
...BasicCrawler.optionsShape,
|
|
@@ -117,158 +107,144 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
117
107
|
additionalMimeTypes: ow.optional.array.ofType(ow.string),
|
|
118
108
|
suggestResponseEncoding: ow.optional.string,
|
|
119
109
|
forceResponseEncoding: ow.optional.string,
|
|
120
|
-
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
|
|
121
110
|
persistCookiesPerSession: ow.optional.boolean,
|
|
122
|
-
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
123
|
-
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
124
111
|
preNavigationHooks: ow.optional.array,
|
|
125
112
|
postNavigationHooks: ow.optional.array,
|
|
126
113
|
};
|
|
127
114
|
/**
|
|
128
115
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
129
116
|
*/
|
|
130
|
-
constructor(options = {}
|
|
117
|
+
constructor(options = {}) {
|
|
131
118
|
ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
|
|
132
|
-
const {
|
|
119
|
+
const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession = true, preNavigationHooks = [], postNavigationHooks = [],
|
|
133
120
|
// BasicCrawler
|
|
134
|
-
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
|
|
121
|
+
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
|
|
135
122
|
super({
|
|
136
123
|
...basicCrawlerOptions,
|
|
137
|
-
requestHandler,
|
|
138
124
|
autoscaledPoolOptions,
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
}, config);
|
|
143
|
-
this.config = config;
|
|
144
|
-
// FIXME any
|
|
145
|
-
this.requestHandler = requestHandler ?? this.router;
|
|
146
|
-
// Cookies should be persisted per session only if session pool is used
|
|
147
|
-
if (!this.useSessionPool && persistCookiesPerSession) {
|
|
148
|
-
throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
|
|
149
|
-
}
|
|
125
|
+
contextPipelineBuilder: contextPipelineBuilder ??
|
|
126
|
+
(() => this.buildContextPipeline()),
|
|
127
|
+
});
|
|
150
128
|
this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
|
|
151
129
|
if (additionalMimeTypes.length)
|
|
152
130
|
this._extendSupportedMimeTypes(additionalMimeTypes);
|
|
153
131
|
if (suggestResponseEncoding && forceResponseEncoding) {
|
|
154
132
|
this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
|
|
155
133
|
}
|
|
156
|
-
this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
157
134
|
this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
|
|
158
135
|
this.ignoreSslErrors = ignoreSslErrors;
|
|
159
136
|
this.suggestResponseEncoding = suggestResponseEncoding;
|
|
160
137
|
this.forceResponseEncoding = forceResponseEncoding;
|
|
161
|
-
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
|
|
162
|
-
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
|
|
163
|
-
this.proxyConfiguration = proxyConfiguration;
|
|
164
138
|
this.preNavigationHooks = preNavigationHooks;
|
|
165
139
|
this.postNavigationHooks = [
|
|
166
140
|
({ request, response }) => this._abortDownloadOfBody(request, response),
|
|
167
141
|
...postNavigationHooks,
|
|
168
142
|
];
|
|
169
|
-
|
|
170
|
-
this.persistCookiesPerSession = persistCookiesPerSession ?? true;
|
|
171
|
-
}
|
|
172
|
-
else {
|
|
173
|
-
this.persistCookiesPerSession = false;
|
|
174
|
-
}
|
|
143
|
+
this.persistCookiesPerSession = persistCookiesPerSession;
|
|
175
144
|
}
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
const className = this.constructor.name;
|
|
184
|
-
const extensionOptions = extension.getCrawlerOptions();
|
|
185
|
-
for (const [key, value] of Object.entries(extensionOptions)) {
|
|
186
|
-
const isConfigurable = Object.hasOwn(this, key);
|
|
187
|
-
const originalType = typeof this[key];
|
|
188
|
-
const extensionType = typeof value; // What if we want to null something? It is really needed?
|
|
189
|
-
const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
|
|
190
|
-
const exists = this[key] != null;
|
|
191
|
-
if (!isConfigurable) {
|
|
192
|
-
// Test if the property can be configured on the crawler
|
|
193
|
-
throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
|
|
194
|
-
}
|
|
195
|
-
if (!isSameType && exists) {
|
|
196
|
-
// Assuming that extensions will only add up configuration
|
|
197
|
-
throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
|
|
198
|
-
}
|
|
199
|
-
this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
|
|
200
|
-
this[key] = value;
|
|
201
|
-
}
|
|
145
|
+
buildContextPipeline() {
|
|
146
|
+
return ContextPipeline.create()
|
|
147
|
+
.compose({
|
|
148
|
+
action: this.makeHttpRequest.bind(this),
|
|
149
|
+
})
|
|
150
|
+
.compose({ action: this.processHttpResponse.bind(this) })
|
|
151
|
+
.compose({ action: this.handleBlockedRequestByContent.bind(this) });
|
|
202
152
|
}
|
|
203
|
-
|
|
204
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
205
|
-
*/
|
|
206
|
-
async _runRequestHandler(crawlingContext) {
|
|
153
|
+
async makeHttpRequest(crawlingContext) {
|
|
207
154
|
const { request, session } = crawlingContext;
|
|
208
|
-
if (
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
const $ = cheerio.load(parsed.body.toString());
|
|
222
|
-
if ($(selector).get().length === 0) {
|
|
223
|
-
throw new Error(`Selector '${selector}' not found.`);
|
|
224
|
-
}
|
|
155
|
+
if (request.skipNavigation) {
|
|
156
|
+
return {
|
|
157
|
+
request: new Proxy(request, {
|
|
158
|
+
get(target, propertyName, receiver) {
|
|
159
|
+
if (propertyName === 'loadedUrl') {
|
|
160
|
+
throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
|
|
161
|
+
}
|
|
162
|
+
return Reflect.get(target, propertyName, receiver);
|
|
163
|
+
},
|
|
164
|
+
}),
|
|
165
|
+
get response() {
|
|
166
|
+
throw new Error('The `response` property is not available - `skipNavigation` was used');
|
|
167
|
+
},
|
|
225
168
|
};
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
169
|
+
}
|
|
170
|
+
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
171
|
+
request.state = RequestState.BEFORE_NAV;
|
|
172
|
+
// Execute pre navigation hooks before applying session pool cookies,
|
|
173
|
+
// as they may also set cookies in the session
|
|
174
|
+
await this._executeHooks(this.preNavigationHooks, crawlingContext);
|
|
175
|
+
tryCancel();
|
|
176
|
+
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
177
|
+
const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
|
|
178
|
+
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
179
|
+
const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
180
|
+
tryCancel();
|
|
181
|
+
request.loadedUrl = httpResponse?.url;
|
|
182
|
+
request.state = RequestState.AFTER_NAV;
|
|
183
|
+
return { request: request, response: httpResponse };
|
|
184
|
+
}
|
|
185
|
+
async processHttpResponse(crawlingContext) {
|
|
186
|
+
if (crawlingContext.request.skipNavigation) {
|
|
187
|
+
return {
|
|
188
|
+
get contentType() {
|
|
189
|
+
throw new Error('The `contentType` property is not available - `skipNavigation` was used');
|
|
190
|
+
},
|
|
191
|
+
get body() {
|
|
192
|
+
throw new Error('The `body` property is not available - `skipNavigation` was used');
|
|
193
|
+
},
|
|
194
|
+
get json() {
|
|
195
|
+
throw new Error('The `json` property is not available - `skipNavigation` was used');
|
|
196
|
+
},
|
|
197
|
+
get waitForSelector() {
|
|
198
|
+
throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
|
|
199
|
+
},
|
|
200
|
+
get parseWithCheerio() {
|
|
201
|
+
throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
|
|
202
|
+
},
|
|
232
203
|
};
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
204
|
+
}
|
|
205
|
+
await this._executeHooks(this.postNavigationHooks, crawlingContext);
|
|
206
|
+
tryCancel();
|
|
207
|
+
const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
|
|
208
|
+
tryCancel();
|
|
209
|
+
const response = parsed.response;
|
|
210
|
+
const contentType = parsed.contentType;
|
|
211
|
+
const waitForSelector = async (selector, _timeoutMs) => {
|
|
212
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
213
|
+
if ($(selector).get().length === 0) {
|
|
214
|
+
throw new Error(`Selector '${selector}' not found.`);
|
|
238
215
|
}
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
request.noRetry = true;
|
|
245
|
-
request.state = RequestState.SKIPPED;
|
|
246
|
-
return;
|
|
216
|
+
};
|
|
217
|
+
const parseWithCheerio = async (selector, timeoutMs) => {
|
|
218
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
219
|
+
if (selector) {
|
|
220
|
+
await crawlingContext.waitForSelector(selector, timeoutMs);
|
|
247
221
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
const jsonString = parsed.body.toString(contentType.encoding);
|
|
254
|
-
return JSON.parse(jsonString);
|
|
255
|
-
},
|
|
256
|
-
});
|
|
222
|
+
return $;
|
|
223
|
+
};
|
|
224
|
+
this._throwOnBlockedRequest(response.status);
|
|
225
|
+
if (this.persistCookiesPerSession) {
|
|
226
|
+
crawlingContext.session.setCookiesFromResponse(response);
|
|
257
227
|
}
|
|
228
|
+
return {
|
|
229
|
+
get json() {
|
|
230
|
+
if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
|
|
231
|
+
return null;
|
|
232
|
+
const jsonString = parsed.body.toString(contentType.encoding);
|
|
233
|
+
return JSON.parse(jsonString);
|
|
234
|
+
},
|
|
235
|
+
waitForSelector,
|
|
236
|
+
parseWithCheerio,
|
|
237
|
+
contentType,
|
|
238
|
+
body: parsed.body,
|
|
239
|
+
};
|
|
240
|
+
}
|
|
241
|
+
async handleBlockedRequestByContent(crawlingContext) {
|
|
258
242
|
if (this.retryOnBlocked) {
|
|
259
243
|
const error = await this.isRequestBlocked(crawlingContext);
|
|
260
244
|
if (error)
|
|
261
245
|
throw new SessionError(error);
|
|
262
246
|
}
|
|
263
|
-
|
|
264
|
-
try {
|
|
265
|
-
await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
|
|
266
|
-
request.state = RequestState.DONE;
|
|
267
|
-
}
|
|
268
|
-
catch (e) {
|
|
269
|
-
request.state = RequestState.ERROR;
|
|
270
|
-
throw e;
|
|
271
|
-
}
|
|
247
|
+
return {};
|
|
272
248
|
}
|
|
273
249
|
async isRequestBlocked(crawlingContext) {
|
|
274
250
|
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
|
|
@@ -278,84 +254,34 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
278
254
|
return `Found selectors: ${foundSelectors.join(', ')}`;
|
|
279
255
|
}
|
|
280
256
|
}
|
|
257
|
+
if (this.blockedStatusCodes.has(crawlingContext.response.status)) {
|
|
258
|
+
return `Blocked by status code ${crawlingContext.response.status}`;
|
|
259
|
+
}
|
|
281
260
|
return false;
|
|
282
261
|
}
|
|
283
|
-
async _handleNavigation(crawlingContext) {
|
|
284
|
-
const gotOptions = {};
|
|
285
|
-
const { request, session } = crawlingContext;
|
|
286
|
-
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
287
|
-
request.state = RequestState.BEFORE_NAV;
|
|
288
|
-
// Execute pre navigation hooks before applying session pool cookies,
|
|
289
|
-
// as they may also set cookies in the session
|
|
290
|
-
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
|
|
291
|
-
tryCancel();
|
|
292
|
-
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
293
|
-
this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
|
|
294
|
-
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
295
|
-
crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
296
|
-
tryCancel();
|
|
297
|
-
request.state = RequestState.AFTER_NAV;
|
|
298
|
-
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
|
|
299
|
-
tryCancel();
|
|
300
|
-
}
|
|
301
262
|
/**
|
|
302
|
-
*
|
|
263
|
+
* Returns the `Cookie` header value based on the current context and
|
|
264
|
+
* any changes that occurred in the navigation hooks.
|
|
303
265
|
*/
|
|
304
|
-
_applyCookies({ session, request },
|
|
305
|
-
const sessionCookie = session
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
|
|
309
|
-
this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
|
|
310
|
-
const sourceCookies = [];
|
|
311
|
-
if (Array.isArray(lowerCaseHeader)) {
|
|
312
|
-
sourceCookies.push(...lowerCaseHeader);
|
|
313
|
-
}
|
|
314
|
-
else {
|
|
315
|
-
sourceCookies.push(lowerCaseHeader);
|
|
316
|
-
}
|
|
317
|
-
if (Array.isArray(upperCaseHeader)) {
|
|
318
|
-
sourceCookies.push(...upperCaseHeader);
|
|
319
|
-
}
|
|
320
|
-
else {
|
|
321
|
-
sourceCookies.push(upperCaseHeader);
|
|
322
|
-
}
|
|
323
|
-
alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
|
|
324
|
-
}
|
|
325
|
-
const sourceCookies = [sessionCookie, preHookCookies];
|
|
326
|
-
if (Array.isArray(alteredGotOptionsCookies)) {
|
|
327
|
-
sourceCookies.push(...alteredGotOptionsCookies);
|
|
328
|
-
}
|
|
329
|
-
else {
|
|
330
|
-
sourceCookies.push(alteredGotOptionsCookies);
|
|
331
|
-
}
|
|
332
|
-
sourceCookies.push(postHookCookies);
|
|
333
|
-
const mergedCookie = mergeCookies(request.url, sourceCookies);
|
|
334
|
-
gotOptions.headers ??= {};
|
|
335
|
-
Reflect.deleteProperty(gotOptions.headers, 'Cookie');
|
|
336
|
-
Reflect.deleteProperty(gotOptions.headers, 'cookie');
|
|
337
|
-
if (mergedCookie !== '') {
|
|
338
|
-
gotOptions.headers.Cookie = mergedCookie;
|
|
339
|
-
}
|
|
266
|
+
_applyCookies({ session, request }, preHookCookies, postHookCookies) {
|
|
267
|
+
const sessionCookie = session.getCookieString(request.url);
|
|
268
|
+
const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
|
|
269
|
+
return mergeCookies(request.url, sourceCookies);
|
|
340
270
|
}
|
|
341
271
|
/**
|
|
342
272
|
* Function to make the HTTP request. It performs optimizations
|
|
343
273
|
* on the request such as only downloading the request body if the
|
|
344
274
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
345
275
|
*/
|
|
346
|
-
async _requestFunction({ request, session, proxyUrl,
|
|
347
|
-
|
|
348
|
-
// @ts-ignore
|
|
349
|
-
({ TimeoutError } = await import('got-scraping'));
|
|
350
|
-
}
|
|
351
|
-
const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
|
|
276
|
+
async _requestFunction({ request, session, proxyUrl, cookieString, }) {
|
|
277
|
+
const opts = this._getRequestOptions(request, session, proxyUrl);
|
|
352
278
|
try {
|
|
353
|
-
return await this._requestAsBrowser(opts, session);
|
|
279
|
+
return await this._requestAsBrowser(opts, session, cookieString);
|
|
354
280
|
}
|
|
355
281
|
catch (e) {
|
|
356
|
-
if (e instanceof TimeoutError) {
|
|
282
|
+
if (e instanceof Error && e.constructor.name === 'TimeoutError') {
|
|
357
283
|
this._handleRequestTimeout(session);
|
|
358
|
-
return
|
|
284
|
+
return new Response(); // this will never happen, as _handleRequestTimeout always throws
|
|
359
285
|
}
|
|
360
286
|
if (this.isProxyError(e)) {
|
|
361
287
|
throw new SessionError(this._getMessageFromError(e));
|
|
@@ -368,18 +294,16 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
368
294
|
/**
|
|
369
295
|
* Encodes and parses response according to the provided content type
|
|
370
296
|
*/
|
|
371
|
-
async _parseResponse(request,
|
|
372
|
-
const {
|
|
373
|
-
const { type, charset } = parseContentTypeFromResponse(
|
|
374
|
-
const { response, encoding } = this._encodeResponse(request,
|
|
297
|
+
async _parseResponse(request, response) {
|
|
298
|
+
const { status } = response;
|
|
299
|
+
const { type, charset } = parseContentTypeFromResponse(response);
|
|
300
|
+
const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
|
|
375
301
|
const contentType = { type, encoding };
|
|
376
|
-
if (
|
|
377
|
-
this.stats.registerStatusCode(
|
|
302
|
+
if (status >= 400 && status <= 599) {
|
|
303
|
+
this.stats.registerStatusCode(status);
|
|
378
304
|
}
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
if ((statusCode >= 500 && !excludeError) || includeError) {
|
|
382
|
-
const body = await readStreamToString(response, encoding);
|
|
305
|
+
if (this.isErrorStatusCode(status)) {
|
|
306
|
+
const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
|
|
383
307
|
// Errors are often sent as JSON, so attempt to parse them,
|
|
384
308
|
// despite Accept header being set to text/html.
|
|
385
309
|
if (type === APPLICATION_JSON_MIME_TYPE) {
|
|
@@ -387,59 +311,47 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
387
311
|
let { message } = errorResponse;
|
|
388
312
|
if (!message)
|
|
389
313
|
message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
|
|
390
|
-
throw new Error(`${
|
|
314
|
+
throw new Error(`${status} - ${message}`);
|
|
391
315
|
}
|
|
392
|
-
if (
|
|
393
|
-
throw new Error(`${
|
|
316
|
+
if (this.additionalHttpErrorStatusCodes.has(status)) {
|
|
317
|
+
throw new Error(`${status} - Error status code was set by user.`);
|
|
394
318
|
}
|
|
395
319
|
// It's not a JSON, so it's probably some text. Get the first 100 chars of it.
|
|
396
|
-
throw new Error(`${
|
|
320
|
+
throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
|
|
397
321
|
}
|
|
398
322
|
else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
|
|
399
|
-
|
|
400
|
-
const parsed = await this._parseHTML(response, isXml, crawlingContext);
|
|
401
|
-
return { ...parsed, isXml, response, contentType };
|
|
323
|
+
return { response, contentType, body: await reencodedResponse.text() };
|
|
402
324
|
}
|
|
403
325
|
else {
|
|
404
|
-
const body = await
|
|
326
|
+
const body = Buffer.from(await reencodedResponse.bytes());
|
|
405
327
|
return {
|
|
406
328
|
body,
|
|
407
329
|
response,
|
|
408
330
|
contentType,
|
|
409
|
-
enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
|
|
410
331
|
};
|
|
411
332
|
}
|
|
412
333
|
}
|
|
413
|
-
async _parseHTML(response, _isXml, _crawlingContext) {
|
|
414
|
-
return {
|
|
415
|
-
body: await concatStreamToBuffer(response),
|
|
416
|
-
};
|
|
417
|
-
}
|
|
418
334
|
/**
|
|
419
335
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
420
336
|
*/
|
|
421
|
-
_getRequestOptions(request, session, proxyUrl
|
|
337
|
+
_getRequestOptions(request, session, proxyUrl) {
|
|
422
338
|
const requestOptions = {
|
|
423
339
|
url: request.url,
|
|
424
340
|
method: request.method,
|
|
425
341
|
proxyUrl,
|
|
426
|
-
timeout:
|
|
342
|
+
timeout: this.navigationTimeoutMillis,
|
|
343
|
+
cookieJar: this.persistCookiesPerSession ? session.cookieJar : undefined,
|
|
427
344
|
sessionToken: session,
|
|
428
|
-
|
|
429
|
-
headers: { ...request.headers, ...gotOptions?.headers },
|
|
345
|
+
headers: request.headers,
|
|
430
346
|
https: {
|
|
431
|
-
...gotOptions?.https,
|
|
432
347
|
rejectUnauthorized: !this.ignoreSslErrors,
|
|
433
348
|
},
|
|
434
|
-
|
|
349
|
+
body: undefined,
|
|
435
350
|
};
|
|
436
351
|
// Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
|
|
437
352
|
Reflect.deleteProperty(requestOptions.headers, 'cookie');
|
|
438
|
-
// TODO this is incorrect, the check for man in the middle needs to be done
|
|
439
|
-
// on individual proxy level, not on the `proxyConfiguration` level,
|
|
440
|
-
// because users can use normal + MITM proxies in a single configuration.
|
|
441
353
|
// Disable SSL verification for MITM proxies
|
|
442
|
-
if (
|
|
354
|
+
if (session.proxyInfo?.ignoreTlsErrors) {
|
|
443
355
|
requestOptions.https = {
|
|
444
356
|
...requestOptions.https,
|
|
445
357
|
rejectUnauthorized: false,
|
|
@@ -468,13 +380,13 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
468
380
|
if (iconv.encodingExists(encoding)) {
|
|
469
381
|
const encodeStream = iconv.encodeStream(utf8);
|
|
470
382
|
const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
383
|
+
const reencodedBody = response.body
|
|
384
|
+
? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
|
|
385
|
+
.pipe(decodeStream)
|
|
386
|
+
.pipe(encodeStream)))
|
|
387
|
+
: null;
|
|
476
388
|
return {
|
|
477
|
-
response:
|
|
389
|
+
response: new ResponseWithUrl(reencodedBody, response),
|
|
478
390
|
encoding: utf8,
|
|
479
391
|
};
|
|
480
392
|
}
|
|
@@ -502,16 +414,13 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
502
414
|
* Handles timeout request
|
|
503
415
|
*/
|
|
504
416
|
_handleRequestTimeout(session) {
|
|
505
|
-
session
|
|
506
|
-
throw new Error(`request timed out after ${this.
|
|
417
|
+
session.markBad();
|
|
418
|
+
throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
507
419
|
}
|
|
508
420
|
_abortDownloadOfBody(request, response) {
|
|
509
|
-
const {
|
|
421
|
+
const { status } = response;
|
|
510
422
|
const { type } = parseContentTypeFromResponse(response);
|
|
511
|
-
|
|
512
|
-
const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
|
|
513
|
-
// if we retry the request, can the Content-Type change?
|
|
514
|
-
const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
|
|
423
|
+
const isTransientContentType = status >= 500 || this.blockedStatusCodes.has(status);
|
|
515
424
|
if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
|
|
516
425
|
request.noRetry = true;
|
|
517
426
|
throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
|
|
@@ -521,89 +430,28 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
521
430
|
/**
|
|
522
431
|
* @internal wraps public utility for mocking purposes
|
|
523
432
|
*/
|
|
524
|
-
_requestAsBrowser = async (options, session) => {
|
|
525
|
-
const
|
|
433
|
+
_requestAsBrowser = async (options, session, cookieString) => {
|
|
434
|
+
const opts = processHttpRequestOptions({
|
|
526
435
|
...options,
|
|
527
|
-
cookieJar: options.cookieJar,
|
|
436
|
+
cookieJar: options.cookieJar,
|
|
528
437
|
responseType: 'text',
|
|
529
|
-
}), (redirectResponse, updatedRequest) => {
|
|
530
|
-
if (this.persistCookiesPerSession) {
|
|
531
|
-
session.setCookiesFromResponse(redirectResponse);
|
|
532
|
-
const cookieString = session.getCookieString(updatedRequest.url.toString());
|
|
533
|
-
if (cookieString !== '') {
|
|
534
|
-
updatedRequest.headers.Cookie = cookieString;
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
438
|
});
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
* The stream object returned from got does not have the below properties.
|
|
543
|
-
* At the same time, you can't read data directly from the response stream,
|
|
544
|
-
* because they won't get emitted unless you also read from the primary
|
|
545
|
-
* got stream. To be able to work with only one stream, we move the expected props
|
|
546
|
-
* from the response stream to the got stream.
|
|
547
|
-
* @internal
|
|
548
|
-
*/
|
|
549
|
-
function addResponsePropertiesToStream(stream, response) {
|
|
550
|
-
const properties = [
|
|
551
|
-
'statusCode',
|
|
552
|
-
'statusMessage',
|
|
553
|
-
'headers',
|
|
554
|
-
'complete',
|
|
555
|
-
'httpVersion',
|
|
556
|
-
'rawHeaders',
|
|
557
|
-
'rawTrailers',
|
|
558
|
-
'trailers',
|
|
559
|
-
'url',
|
|
560
|
-
'request',
|
|
561
|
-
];
|
|
562
|
-
stream.on('end', () => {
|
|
563
|
-
// @ts-expect-error
|
|
564
|
-
if (stream.rawTrailers)
|
|
565
|
-
stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
|
|
566
|
-
// @ts-expect-error
|
|
567
|
-
if (stream.trailers)
|
|
568
|
-
stream.trailers = response.trailers;
|
|
569
|
-
// @ts-expect-error
|
|
570
|
-
stream.complete = response.complete;
|
|
571
|
-
});
|
|
572
|
-
for (const prop of properties) {
|
|
573
|
-
if (!(prop in stream)) {
|
|
574
|
-
stream[prop] = response[prop];
|
|
439
|
+
if (cookieString) {
|
|
440
|
+
opts.headers?.delete('Cookie');
|
|
441
|
+
opts.headers?.delete('cookie');
|
|
442
|
+
opts.headers?.set('Cookie', cookieString);
|
|
575
443
|
}
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
}));
|
|
588
|
-
const { url, headers } = response;
|
|
589
|
-
let parsedContentType;
|
|
590
|
-
if (headers['content-type']) {
|
|
591
|
-
try {
|
|
592
|
-
parsedContentType = contentTypeParser.parse(headers['content-type']);
|
|
593
|
-
}
|
|
594
|
-
catch {
|
|
595
|
-
// Can not parse content type from Content-Type header. Try to parse it from file extension.
|
|
596
|
-
}
|
|
597
|
-
}
|
|
598
|
-
// Parse content type from file extension as fallback
|
|
599
|
-
if (!parsedContentType) {
|
|
600
|
-
const parsedUrl = new URL(url);
|
|
601
|
-
const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
|
|
602
|
-
parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
|
|
603
|
-
}
|
|
604
|
-
return {
|
|
605
|
-
type: parsedContentType.type,
|
|
606
|
-
charset: parsedContentType.parameters.charset,
|
|
444
|
+
const response = await this.httpClient.sendRequest(new Request(opts.url, {
|
|
445
|
+
body: opts.body ? Readable.toWeb(opts.body) : undefined,
|
|
446
|
+
headers: new Headers(opts.headers),
|
|
447
|
+
method: opts.method,
|
|
448
|
+
// Node-specific option to make the request body work with streams
|
|
449
|
+
duplex: 'half',
|
|
450
|
+
}), {
|
|
451
|
+
session,
|
|
452
|
+
timeoutMillis: opts.timeout,
|
|
453
|
+
});
|
|
454
|
+
return response;
|
|
607
455
|
};
|
|
608
456
|
}
|
|
609
457
|
/**
|