@crawlee/http 4.0.0-beta.6 → 4.0.0-beta.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/file-download.d.ts +58 -32
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +116 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +91 -184
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +178 -326
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +9 -8
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Readable } from 'node:stream';
|
|
2
2
|
import util from 'node:util';
|
|
3
|
-
import {
|
|
3
|
+
import { BasicCrawler, ContextPipeline, NavigationSkippedError, RequestState, Router, SessionError, } from '@crawlee/basic';
|
|
4
|
+
import { getCookiesFromResponse } from '@crawlee/core';
|
|
5
|
+
import { ResponseWithUrl } from '@crawlee/http-client';
|
|
4
6
|
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
|
|
5
7
|
import * as cheerio from 'cheerio';
|
|
6
8
|
import contentTypeParser from 'content-type';
|
|
7
9
|
import iconv from 'iconv-lite';
|
|
8
|
-
import
|
|
9
|
-
import ow, { ObjectPredicate } from 'ow';
|
|
10
|
+
import ow from 'ow';
|
|
10
11
|
import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
|
|
11
|
-
import {
|
|
12
|
-
let TimeoutError;
|
|
12
|
+
import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
|
|
13
13
|
/**
|
|
14
14
|
* Default mime types, which HttpScraper supports.
|
|
15
15
|
*/
|
|
@@ -46,18 +46,18 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
46
46
|
*
|
|
47
47
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
48
48
|
*
|
|
49
|
-
* We can use the `preNavigationHooks` to adjust
|
|
49
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
50
50
|
*
|
|
51
51
|
* ```javascript
|
|
52
52
|
* preNavigationHooks: [
|
|
53
|
-
* (crawlingContext
|
|
53
|
+
* (crawlingContext) => {
|
|
54
54
|
* // ...
|
|
55
55
|
* },
|
|
56
56
|
* ]
|
|
57
57
|
* ```
|
|
58
58
|
*
|
|
59
|
-
* By default, this crawler only processes web pages with the `text/html`
|
|
60
|
-
* and `application/
|
|
59
|
+
* By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
60
|
+
* and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
61
61
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
62
62
|
* use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
|
|
63
63
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
@@ -93,22 +93,13 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
93
93
|
* @category Crawlers
|
|
94
94
|
*/
|
|
95
95
|
export class HttpCrawler extends BasicCrawler {
|
|
96
|
-
config;
|
|
97
|
-
/**
|
|
98
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
99
|
-
* Only available if used by the crawler.
|
|
100
|
-
*/
|
|
101
|
-
proxyConfiguration;
|
|
102
|
-
userRequestHandlerTimeoutMillis;
|
|
103
96
|
preNavigationHooks;
|
|
104
97
|
postNavigationHooks;
|
|
105
|
-
|
|
98
|
+
saveResponseCookies;
|
|
106
99
|
navigationTimeoutMillis;
|
|
107
100
|
ignoreSslErrors;
|
|
108
101
|
suggestResponseEncoding;
|
|
109
102
|
forceResponseEncoding;
|
|
110
|
-
additionalHttpErrorStatusCodes;
|
|
111
|
-
ignoreHttpErrorStatusCodes;
|
|
112
103
|
supportedMimeTypes;
|
|
113
104
|
static optionsShape = {
|
|
114
105
|
...BasicCrawler.optionsShape,
|
|
@@ -117,157 +108,153 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
117
108
|
additionalMimeTypes: ow.optional.array.ofType(ow.string),
|
|
118
109
|
suggestResponseEncoding: ow.optional.string,
|
|
119
110
|
forceResponseEncoding: ow.optional.string,
|
|
120
|
-
|
|
121
|
-
persistCookiesPerSession: ow.optional.boolean,
|
|
122
|
-
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
123
|
-
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
111
|
+
saveResponseCookies: ow.optional.boolean,
|
|
124
112
|
preNavigationHooks: ow.optional.array,
|
|
125
113
|
postNavigationHooks: ow.optional.array,
|
|
126
114
|
};
|
|
127
115
|
/**
|
|
128
116
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
129
117
|
*/
|
|
130
|
-
constructor(options = {}
|
|
118
|
+
constructor(options = {}) {
|
|
131
119
|
ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
|
|
132
|
-
const {
|
|
120
|
+
const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, saveResponseCookies = true, preNavigationHooks = [], postNavigationHooks = [],
|
|
133
121
|
// BasicCrawler
|
|
134
|
-
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
|
|
122
|
+
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
|
|
135
123
|
super({
|
|
136
124
|
...basicCrawlerOptions,
|
|
137
|
-
requestHandler,
|
|
138
125
|
autoscaledPoolOptions,
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
}, config);
|
|
143
|
-
this.config = config;
|
|
144
|
-
this.requestHandler = requestHandler ?? this.router;
|
|
145
|
-
// Cookies should be persisted per session only if session pool is used
|
|
146
|
-
if (!this.useSessionPool && persistCookiesPerSession) {
|
|
147
|
-
throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
|
|
148
|
-
}
|
|
126
|
+
contextPipelineBuilder: contextPipelineBuilder ??
|
|
127
|
+
(() => this.buildContextPipeline()),
|
|
128
|
+
});
|
|
149
129
|
this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
|
|
150
130
|
if (additionalMimeTypes.length)
|
|
151
131
|
this._extendSupportedMimeTypes(additionalMimeTypes);
|
|
152
132
|
if (suggestResponseEncoding && forceResponseEncoding) {
|
|
153
133
|
this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
|
|
154
134
|
}
|
|
155
|
-
this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
156
135
|
this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
|
|
157
136
|
this.ignoreSslErrors = ignoreSslErrors;
|
|
158
137
|
this.suggestResponseEncoding = suggestResponseEncoding;
|
|
159
138
|
this.forceResponseEncoding = forceResponseEncoding;
|
|
160
|
-
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
|
|
161
|
-
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
|
|
162
|
-
this.proxyConfiguration = proxyConfiguration;
|
|
163
139
|
this.preNavigationHooks = preNavigationHooks;
|
|
164
140
|
this.postNavigationHooks = [
|
|
165
141
|
({ request, response }) => this._abortDownloadOfBody(request, response),
|
|
166
142
|
...postNavigationHooks,
|
|
167
143
|
];
|
|
168
|
-
|
|
169
|
-
this.persistCookiesPerSession = persistCookiesPerSession ?? true;
|
|
170
|
-
}
|
|
171
|
-
else {
|
|
172
|
-
this.persistCookiesPerSession = false;
|
|
173
|
-
}
|
|
144
|
+
this.saveResponseCookies = saveResponseCookies;
|
|
174
145
|
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
const className = this.constructor.name;
|
|
183
|
-
const extensionOptions = extension.getCrawlerOptions();
|
|
184
|
-
for (const [key, value] of Object.entries(extensionOptions)) {
|
|
185
|
-
const isConfigurable = Object.hasOwn(this, key);
|
|
186
|
-
const originalType = typeof this[key];
|
|
187
|
-
const extensionType = typeof value; // What if we want to null something? It is really needed?
|
|
188
|
-
const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
|
|
189
|
-
const exists = this[key] != null;
|
|
190
|
-
if (!isConfigurable) {
|
|
191
|
-
// Test if the property can be configured on the crawler
|
|
192
|
-
throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
|
|
193
|
-
}
|
|
194
|
-
if (!isSameType && exists) {
|
|
195
|
-
// Assuming that extensions will only add up configuration
|
|
196
|
-
throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
|
|
197
|
-
}
|
|
198
|
-
this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
|
|
199
|
-
this[key] = value;
|
|
200
|
-
}
|
|
146
|
+
buildContextPipeline() {
|
|
147
|
+
return ContextPipeline.create()
|
|
148
|
+
.compose({
|
|
149
|
+
action: this.makeHttpRequest.bind(this),
|
|
150
|
+
})
|
|
151
|
+
.compose({ action: this.processHttpResponse.bind(this) })
|
|
152
|
+
.compose({ action: this.handleBlockedRequestByContent.bind(this) });
|
|
201
153
|
}
|
|
202
|
-
|
|
203
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
204
|
-
*/
|
|
205
|
-
async _runRequestHandler(crawlingContext) {
|
|
154
|
+
async makeHttpRequest(crawlingContext) {
|
|
206
155
|
const { request, session } = crawlingContext;
|
|
207
|
-
if (
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
const $ = cheerio.load(parsed.body.toString());
|
|
221
|
-
if ($(selector).get().length === 0) {
|
|
222
|
-
throw new Error(`Selector '${selector}' not found.`);
|
|
223
|
-
}
|
|
156
|
+
if (request.skipNavigation) {
|
|
157
|
+
return {
|
|
158
|
+
request: new Proxy(request, {
|
|
159
|
+
get(target, propertyName, receiver) {
|
|
160
|
+
if (propertyName === 'loadedUrl') {
|
|
161
|
+
throw new NavigationSkippedError('The `request.loadedUrl` property is not available - `skipNavigation` was used');
|
|
162
|
+
}
|
|
163
|
+
return Reflect.get(target, propertyName, receiver);
|
|
164
|
+
},
|
|
165
|
+
}),
|
|
166
|
+
get response() {
|
|
167
|
+
throw new NavigationSkippedError('The `response` property is not available - `skipNavigation` was used');
|
|
168
|
+
},
|
|
224
169
|
};
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
170
|
+
}
|
|
171
|
+
request.state = RequestState.BEFORE_NAV;
|
|
172
|
+
await this._executeHooks(this.preNavigationHooks, crawlingContext);
|
|
173
|
+
tryCancel();
|
|
174
|
+
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
175
|
+
const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
176
|
+
tryCancel();
|
|
177
|
+
request.loadedUrl = httpResponse?.url;
|
|
178
|
+
request.state = RequestState.AFTER_NAV;
|
|
179
|
+
return { request: request, response: httpResponse };
|
|
180
|
+
}
|
|
181
|
+
async processHttpResponse(crawlingContext) {
|
|
182
|
+
if (crawlingContext.request.skipNavigation) {
|
|
183
|
+
return {
|
|
184
|
+
get contentType() {
|
|
185
|
+
throw new NavigationSkippedError('The `contentType` property is not available - `skipNavigation` was used');
|
|
186
|
+
},
|
|
187
|
+
get body() {
|
|
188
|
+
throw new NavigationSkippedError('The `body` property is not available - `skipNavigation` was used');
|
|
189
|
+
},
|
|
190
|
+
get json() {
|
|
191
|
+
throw new NavigationSkippedError('The `json` property is not available - `skipNavigation` was used');
|
|
192
|
+
},
|
|
193
|
+
get waitForSelector() {
|
|
194
|
+
throw new NavigationSkippedError('The `waitForSelector` method is not available - `skipNavigation` was used');
|
|
195
|
+
},
|
|
196
|
+
get parseWithCheerio() {
|
|
197
|
+
throw new NavigationSkippedError('The `parseWithCheerio` method is not available - `skipNavigation` was used');
|
|
198
|
+
},
|
|
231
199
|
};
|
|
232
|
-
|
|
233
|
-
|
|
200
|
+
}
|
|
201
|
+
await this._executeHooks(this.postNavigationHooks, crawlingContext);
|
|
202
|
+
tryCancel();
|
|
203
|
+
const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
|
|
204
|
+
tryCancel();
|
|
205
|
+
const response = parsed.response;
|
|
206
|
+
const contentType = parsed.contentType;
|
|
207
|
+
const waitForSelector = async (selector, _timeoutMs) => {
|
|
208
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
209
|
+
if ($(selector).get().length === 0) {
|
|
210
|
+
throw new Error(`Selector '${selector}' not found.`);
|
|
234
211
|
}
|
|
235
|
-
|
|
236
|
-
|
|
212
|
+
};
|
|
213
|
+
const parseWithCheerio = async (selector, timeoutMs) => {
|
|
214
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
215
|
+
if (selector) {
|
|
216
|
+
await crawlingContext.waitForSelector(selector, timeoutMs);
|
|
237
217
|
}
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
218
|
+
return $;
|
|
219
|
+
};
|
|
220
|
+
this._throwOnBlockedRequest(response.status);
|
|
221
|
+
if (this.saveResponseCookies) {
|
|
222
|
+
try {
|
|
223
|
+
for (const cookie of getCookiesFromResponse(response)) {
|
|
224
|
+
if (!cookie)
|
|
225
|
+
continue;
|
|
226
|
+
try {
|
|
227
|
+
crawlingContext.session.cookieJar.setCookieSync(cookie, response.url, { ignoreError: false });
|
|
228
|
+
}
|
|
229
|
+
catch (e) {
|
|
230
|
+
this.log.debug(`Could not set cookie: ${e.message}`);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
catch (e) {
|
|
235
|
+
this.log.exception(e, 'Could not get cookies from response');
|
|
246
236
|
}
|
|
247
|
-
Object.assign(crawlingContext, parsed);
|
|
248
|
-
Object.defineProperty(crawlingContext, 'json', {
|
|
249
|
-
get() {
|
|
250
|
-
if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
|
|
251
|
-
return null;
|
|
252
|
-
const jsonString = parsed.body.toString(contentType.encoding);
|
|
253
|
-
return JSON.parse(jsonString);
|
|
254
|
-
},
|
|
255
|
-
});
|
|
256
237
|
}
|
|
238
|
+
return {
|
|
239
|
+
get json() {
|
|
240
|
+
if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
|
|
241
|
+
return null;
|
|
242
|
+
const jsonString = parsed.body.toString(contentType.encoding);
|
|
243
|
+
return JSON.parse(jsonString);
|
|
244
|
+
},
|
|
245
|
+
waitForSelector,
|
|
246
|
+
parseWithCheerio,
|
|
247
|
+
contentType,
|
|
248
|
+
body: parsed.body,
|
|
249
|
+
};
|
|
250
|
+
}
|
|
251
|
+
async handleBlockedRequestByContent(crawlingContext) {
|
|
257
252
|
if (this.retryOnBlocked) {
|
|
258
253
|
const error = await this.isRequestBlocked(crawlingContext);
|
|
259
254
|
if (error)
|
|
260
255
|
throw new SessionError(error);
|
|
261
256
|
}
|
|
262
|
-
|
|
263
|
-
try {
|
|
264
|
-
await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
|
|
265
|
-
request.state = RequestState.DONE;
|
|
266
|
-
}
|
|
267
|
-
catch (e) {
|
|
268
|
-
request.state = RequestState.ERROR;
|
|
269
|
-
throw e;
|
|
270
|
-
}
|
|
257
|
+
return {};
|
|
271
258
|
}
|
|
272
259
|
async isRequestBlocked(crawlingContext) {
|
|
273
260
|
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
|
|
@@ -277,84 +264,25 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
277
264
|
return `Found selectors: ${foundSelectors.join(', ')}`;
|
|
278
265
|
}
|
|
279
266
|
}
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
async _handleNavigation(crawlingContext) {
|
|
283
|
-
const gotOptions = {};
|
|
284
|
-
const { request, session } = crawlingContext;
|
|
285
|
-
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
286
|
-
request.state = RequestState.BEFORE_NAV;
|
|
287
|
-
// Execute pre navigation hooks before applying session pool cookies,
|
|
288
|
-
// as they may also set cookies in the session
|
|
289
|
-
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
|
|
290
|
-
tryCancel();
|
|
291
|
-
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
292
|
-
this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
|
|
293
|
-
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
294
|
-
crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
295
|
-
tryCancel();
|
|
296
|
-
request.state = RequestState.AFTER_NAV;
|
|
297
|
-
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
|
|
298
|
-
tryCancel();
|
|
299
|
-
}
|
|
300
|
-
/**
|
|
301
|
-
* Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
|
|
302
|
-
*/
|
|
303
|
-
_applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
|
|
304
|
-
const sessionCookie = session?.getCookieString(request.url) ?? '';
|
|
305
|
-
let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
|
|
306
|
-
if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
|
|
307
|
-
const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
|
|
308
|
-
this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
|
|
309
|
-
const sourceCookies = [];
|
|
310
|
-
if (Array.isArray(lowerCaseHeader)) {
|
|
311
|
-
sourceCookies.push(...lowerCaseHeader);
|
|
312
|
-
}
|
|
313
|
-
else {
|
|
314
|
-
sourceCookies.push(lowerCaseHeader);
|
|
315
|
-
}
|
|
316
|
-
if (Array.isArray(upperCaseHeader)) {
|
|
317
|
-
sourceCookies.push(...upperCaseHeader);
|
|
318
|
-
}
|
|
319
|
-
else {
|
|
320
|
-
sourceCookies.push(upperCaseHeader);
|
|
321
|
-
}
|
|
322
|
-
alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
|
|
323
|
-
}
|
|
324
|
-
const sourceCookies = [sessionCookie, preHookCookies];
|
|
325
|
-
if (Array.isArray(alteredGotOptionsCookies)) {
|
|
326
|
-
sourceCookies.push(...alteredGotOptionsCookies);
|
|
327
|
-
}
|
|
328
|
-
else {
|
|
329
|
-
sourceCookies.push(alteredGotOptionsCookies);
|
|
330
|
-
}
|
|
331
|
-
sourceCookies.push(postHookCookies);
|
|
332
|
-
const mergedCookie = mergeCookies(request.url, sourceCookies);
|
|
333
|
-
gotOptions.headers ??= {};
|
|
334
|
-
Reflect.deleteProperty(gotOptions.headers, 'Cookie');
|
|
335
|
-
Reflect.deleteProperty(gotOptions.headers, 'cookie');
|
|
336
|
-
if (mergedCookie !== '') {
|
|
337
|
-
gotOptions.headers.Cookie = mergedCookie;
|
|
267
|
+
if (this.blockedStatusCodes.has(crawlingContext.response.status)) {
|
|
268
|
+
return `Blocked by status code ${crawlingContext.response.status}`;
|
|
338
269
|
}
|
|
270
|
+
return false;
|
|
339
271
|
}
|
|
340
272
|
/**
|
|
341
273
|
* Function to make the HTTP request. It performs optimizations
|
|
342
274
|
* on the request such as only downloading the request body if the
|
|
343
275
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
344
276
|
*/
|
|
345
|
-
async _requestFunction({ request, session, proxyUrl
|
|
346
|
-
|
|
347
|
-
// @ts-ignore
|
|
348
|
-
({ TimeoutError } = await import('got-scraping'));
|
|
349
|
-
}
|
|
350
|
-
const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
|
|
277
|
+
async _requestFunction({ request, session, proxyUrl }) {
|
|
278
|
+
const opts = this._getRequestOptions(request, session, proxyUrl);
|
|
351
279
|
try {
|
|
352
280
|
return await this._requestAsBrowser(opts, session);
|
|
353
281
|
}
|
|
354
282
|
catch (e) {
|
|
355
|
-
if (e instanceof TimeoutError) {
|
|
283
|
+
if (e instanceof Error && e.constructor.name === 'TimeoutError') {
|
|
356
284
|
this._handleRequestTimeout(session);
|
|
357
|
-
return
|
|
285
|
+
return new Response(); // this will never happen, as _handleRequestTimeout always throws
|
|
358
286
|
}
|
|
359
287
|
if (this.isProxyError(e)) {
|
|
360
288
|
throw new SessionError(this._getMessageFromError(e));
|
|
@@ -367,18 +295,16 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
367
295
|
/**
|
|
368
296
|
* Encodes and parses response according to the provided content type
|
|
369
297
|
*/
|
|
370
|
-
async _parseResponse(request,
|
|
371
|
-
const {
|
|
372
|
-
const { type, charset } = parseContentTypeFromResponse(
|
|
373
|
-
const { response, encoding } = this._encodeResponse(request,
|
|
298
|
+
async _parseResponse(request, response) {
|
|
299
|
+
const { status } = response;
|
|
300
|
+
const { type, charset } = parseContentTypeFromResponse(response);
|
|
301
|
+
const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
|
|
374
302
|
const contentType = { type, encoding };
|
|
375
|
-
if (
|
|
376
|
-
this.stats.registerStatusCode(
|
|
303
|
+
if (status >= 400 && status <= 599) {
|
|
304
|
+
this.stats.registerStatusCode(status);
|
|
377
305
|
}
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
if ((statusCode >= 500 && !excludeError) || includeError) {
|
|
381
|
-
const body = await readStreamToString(response, encoding);
|
|
306
|
+
if (this.isErrorStatusCode(status)) {
|
|
307
|
+
const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
|
|
382
308
|
// Errors are often sent as JSON, so attempt to parse them,
|
|
383
309
|
// despite Accept header being set to text/html.
|
|
384
310
|
if (type === APPLICATION_JSON_MIME_TYPE) {
|
|
@@ -386,59 +312,48 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
386
312
|
let { message } = errorResponse;
|
|
387
313
|
if (!message)
|
|
388
314
|
message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
|
|
389
|
-
throw new Error(`${
|
|
315
|
+
throw new Error(`${status} - ${message}`);
|
|
390
316
|
}
|
|
391
|
-
if (
|
|
392
|
-
throw new Error(`${
|
|
317
|
+
if (this.additionalHttpErrorStatusCodes.has(status)) {
|
|
318
|
+
throw new Error(`${status} - Error status code was set by user.`);
|
|
393
319
|
}
|
|
394
320
|
// It's not a JSON, so it's probably some text. Get the first 100 chars of it.
|
|
395
|
-
throw new Error(`${
|
|
321
|
+
throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
|
|
396
322
|
}
|
|
397
323
|
else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
|
|
398
|
-
|
|
399
|
-
const parsed = await this._parseHTML(response, isXml, crawlingContext);
|
|
400
|
-
return { ...parsed, isXml, response, contentType };
|
|
324
|
+
return { response, contentType, body: await reencodedResponse.text() };
|
|
401
325
|
}
|
|
402
326
|
else {
|
|
403
|
-
const body = await
|
|
327
|
+
const body = Buffer.from(await reencodedResponse.bytes());
|
|
404
328
|
return {
|
|
405
329
|
body,
|
|
406
330
|
response,
|
|
407
331
|
contentType,
|
|
408
|
-
enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
|
|
409
332
|
};
|
|
410
333
|
}
|
|
411
334
|
}
|
|
412
|
-
async _parseHTML(response, _isXml, _crawlingContext) {
|
|
413
|
-
return {
|
|
414
|
-
body: await concatStreamToBuffer(response),
|
|
415
|
-
};
|
|
416
|
-
}
|
|
417
335
|
/**
|
|
418
336
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
419
337
|
*/
|
|
420
|
-
_getRequestOptions(request, session, proxyUrl
|
|
338
|
+
_getRequestOptions(request, session, proxyUrl) {
|
|
421
339
|
const requestOptions = {
|
|
422
340
|
url: request.url,
|
|
423
341
|
method: request.method,
|
|
424
342
|
proxyUrl,
|
|
425
|
-
timeout:
|
|
343
|
+
timeout: this.navigationTimeoutMillis,
|
|
426
344
|
sessionToken: session,
|
|
427
|
-
|
|
428
|
-
headers: { ...request.headers, ...gotOptions?.headers },
|
|
345
|
+
headers: request.headers,
|
|
429
346
|
https: {
|
|
430
|
-
...gotOptions?.https,
|
|
431
347
|
rejectUnauthorized: !this.ignoreSslErrors,
|
|
432
348
|
},
|
|
433
|
-
|
|
349
|
+
body: undefined,
|
|
434
350
|
};
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
// because users can use normal + MITM proxies in a single configuration.
|
|
351
|
+
if (requestOptions.headers?.cookie || requestOptions.headers?.Cookie) {
|
|
352
|
+
requestOptions.headers.Cookie = this._getCookieHeaderFromRequest(request);
|
|
353
|
+
delete requestOptions.headers.cookie;
|
|
354
|
+
}
|
|
440
355
|
// Disable SSL verification for MITM proxies
|
|
441
|
-
if (
|
|
356
|
+
if (session.proxyInfo?.ignoreTlsErrors) {
|
|
442
357
|
requestOptions.https = {
|
|
443
358
|
...requestOptions.https,
|
|
444
359
|
rejectUnauthorized: false,
|
|
@@ -466,14 +381,16 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
466
381
|
// Try to re-encode a variety of unsupported encodings to utf-8
|
|
467
382
|
if (iconv.encodingExists(encoding)) {
|
|
468
383
|
const encodeStream = iconv.encodeStream(utf8);
|
|
469
|
-
const decodeStream = iconv
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
384
|
+
const decodeStream = iconv
|
|
385
|
+
.decodeStream(encoding)
|
|
386
|
+
.on('error', (err) => encodeStream.emit('error', err));
|
|
387
|
+
const reencodedBody = response.body
|
|
388
|
+
? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
|
|
389
|
+
.pipe(decodeStream)
|
|
390
|
+
.pipe(encodeStream)))
|
|
391
|
+
: null;
|
|
475
392
|
return {
|
|
476
|
-
response:
|
|
393
|
+
response: new ResponseWithUrl(reencodedBody, response),
|
|
477
394
|
encoding: utf8,
|
|
478
395
|
};
|
|
479
396
|
}
|
|
@@ -501,16 +418,13 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
501
418
|
* Handles timeout request
|
|
502
419
|
*/
|
|
503
420
|
_handleRequestTimeout(session) {
|
|
504
|
-
session
|
|
505
|
-
throw new Error(`request timed out after ${this.
|
|
421
|
+
session.markBad();
|
|
422
|
+
throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
506
423
|
}
|
|
507
424
|
_abortDownloadOfBody(request, response) {
|
|
508
|
-
const {
|
|
425
|
+
const { status } = response;
|
|
509
426
|
const { type } = parseContentTypeFromResponse(response);
|
|
510
|
-
|
|
511
|
-
const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
|
|
512
|
-
// if we retry the request, can the Content-Type change?
|
|
513
|
-
const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
|
|
427
|
+
const isTransientContentType = status >= 500 || this.blockedStatusCodes.has(status);
|
|
514
428
|
if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
|
|
515
429
|
request.noRetry = true;
|
|
516
430
|
throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
|
|
@@ -521,88 +435,26 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
521
435
|
* @internal wraps public utility for mocking purposes
|
|
522
436
|
*/
|
|
523
437
|
_requestAsBrowser = async (options, session) => {
|
|
524
|
-
const
|
|
438
|
+
const opts = processHttpRequestOptions({
|
|
525
439
|
...options,
|
|
526
|
-
cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
|
|
527
440
|
responseType: 'text',
|
|
528
|
-
}), (redirectResponse, updatedRequest) => {
|
|
529
|
-
if (this.persistCookiesPerSession) {
|
|
530
|
-
session.setCookiesFromResponse(redirectResponse);
|
|
531
|
-
const cookieString = session.getCookieString(updatedRequest.url.toString());
|
|
532
|
-
if (cookieString !== '') {
|
|
533
|
-
updatedRequest.headers.Cookie = cookieString;
|
|
534
|
-
}
|
|
535
|
-
}
|
|
536
441
|
});
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
'complete',
|
|
554
|
-
'httpVersion',
|
|
555
|
-
'rawHeaders',
|
|
556
|
-
'rawTrailers',
|
|
557
|
-
'trailers',
|
|
558
|
-
'url',
|
|
559
|
-
'request',
|
|
560
|
-
];
|
|
561
|
-
stream.on('end', () => {
|
|
562
|
-
// @ts-expect-error
|
|
563
|
-
if (stream.rawTrailers)
|
|
564
|
-
stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
|
|
565
|
-
// @ts-expect-error
|
|
566
|
-
if (stream.trailers)
|
|
567
|
-
stream.trailers = response.trailers;
|
|
568
|
-
// @ts-expect-error
|
|
569
|
-
stream.complete = response.complete;
|
|
570
|
-
});
|
|
571
|
-
for (const prop of properties) {
|
|
572
|
-
if (!(prop in stream)) {
|
|
573
|
-
stream[prop] = response[prop];
|
|
574
|
-
}
|
|
575
|
-
}
|
|
576
|
-
return stream;
|
|
577
|
-
}
|
|
578
|
-
/**
|
|
579
|
-
* Gets parsed content type from response object
|
|
580
|
-
* @param response HTTP response object
|
|
581
|
-
*/
|
|
582
|
-
function parseContentTypeFromResponse(response) {
|
|
583
|
-
ow(response, ow.object.partialShape({
|
|
584
|
-
url: ow.string.url,
|
|
585
|
-
headers: new ObjectPredicate(),
|
|
586
|
-
}));
|
|
587
|
-
const { url, headers } = response;
|
|
588
|
-
let parsedContentType;
|
|
589
|
-
if (headers['content-type']) {
|
|
590
|
-
try {
|
|
591
|
-
parsedContentType = contentTypeParser.parse(headers['content-type']);
|
|
592
|
-
}
|
|
593
|
-
catch {
|
|
594
|
-
// Can not parse content type from Content-Type header. Try to parse it from file extension.
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
// Parse content type from file extension as fallback
|
|
598
|
-
if (!parsedContentType) {
|
|
599
|
-
const parsedUrl = new URL(url);
|
|
600
|
-
const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
|
|
601
|
-
parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
|
|
602
|
-
}
|
|
603
|
-
return {
|
|
604
|
-
type: parsedContentType.type,
|
|
605
|
-
charset: parsedContentType.parameters.charset,
|
|
442
|
+
// When saveResponseCookies is false, the response cookies must not mutate the
|
|
443
|
+
// session jar. Reads still go through the session (so session.setCookie() in pre-nav
|
|
444
|
+
// hooks keeps working) but a per-request clone is passed in so writes are discarded.
|
|
445
|
+
const cookieJar = this.saveResponseCookies ? session.cookieJar : await session.cookieJar.clone();
|
|
446
|
+
const response = await this.httpClient.sendRequest(new Request(opts.url, {
|
|
447
|
+
body: opts.body ? Readable.toWeb(opts.body) : undefined,
|
|
448
|
+
headers: new Headers(opts.headers),
|
|
449
|
+
method: opts.method,
|
|
450
|
+
// Node-specific option to make the request body work with streams
|
|
451
|
+
duplex: 'half',
|
|
452
|
+
}), {
|
|
453
|
+
session,
|
|
454
|
+
cookieJar,
|
|
455
|
+
timeoutMillis: opts.timeout,
|
|
456
|
+
});
|
|
457
|
+
return response;
|
|
606
458
|
};
|
|
607
459
|
}
|
|
608
460
|
/**
|