@arcblock/crawler 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/crawler.js CHANGED
@@ -40,7 +40,7 @@ function createCrawlQueue() {
         store: new sequelize_1.default(db, 'crawler'),
         concurrency: 1,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-            config_1.logger.debug('job start:', job);
+            config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
             if (!canCrawl) {
                 config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
@@ -157,11 +157,11 @@ function formatHtml(htmlString) {
     }
     return htmlString;
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
     config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
-        yield page.setViewport({ width, height });
+        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
     }
     let html = null;
     let screenshot = null;
@@ -176,12 +176,24 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
         throw new Error(`Request failed with status ${statusCode}, in ${url}`);
     }
     // await for networkidle0
-    // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+    // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
     yield page.waitForNetworkIdle({
-        idleTime: 2 * 1000,
+        idleTime: 1.5 * 1000,
     });
     // get screenshot
     if (includeScreenshot) {
+        // Try to find the tallest element and set the browser to the same height
+        if (fullPage) {
+            const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
+            config_1.logger.info('findMaxScrollHeight', { maxScrollHeight });
+            if (maxScrollHeight) {
+                yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+                yield page.evaluate((scrollHeight) => {
+                    window.scrollTo(0, scrollHeight || 0);
+                    document.documentElement.scrollTo(0, scrollHeight || 0);
+                }, maxScrollHeight);
+            }
+        }
         try {
             screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
         }
@@ -215,7 +227,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
 exports.getPageContent = getPageContent;
 function createCrawlJob(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
+        params = Object.assign(Object.assign({}, params), { id: (0, crypto_1.randomUUID)(), url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
         const existsJob = yield getJob({
             url: params.url,
@@ -226,22 +238,21 @@ function createCrawlJob(params, callback) {
             height: params.height,
             fullPage: params.fullPage,
         });
-        config_1.logger.info('create crawl job', params);
         if (existsJob) {
             config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
             return existsJob.id;
         }
-        const jobId = (0, crypto_1.randomUUID)();
-        const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+        config_1.logger.info('create crawl job', params);
+        const job = crawlQueue.push(params);
         job.on('finished', ({ result }) => {
-            config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+            config_1.logger.info(`Crawl completed ${params.url}`, { job: params, result });
             callback === null || callback === void 0 ? void 0 : callback(result);
         });
         job.on('failed', ({ error }) => {
             config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
             callback === null || callback === void 0 ? void 0 : callback(null);
         });
-        return jobId;
+        return params.id;
     });
 }
 // @ts-ignore
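The substantive changes of the release are concentrated in this file: the default getPageContent timeout grows from 60 s to 90 s, the viewport gains deviceScaleFactor: 2 for sharper captures, the network-idle wait drops from 2 s to 1.5 s, full-page screenshots now resize the viewport to the tallest scrollable element before capturing, and createCrawlJob stamps the job id via randomUUID() up front so the id it returns always matches the job pushed into the queue. A minimal sketch of the new full-page screenshot flow, written against plain puppeteer rather than the package's @blocklet/puppeteer wrapper; screenshotFullPage is a hypothetical helper, not part of this package:

```ts
import puppeteer from 'puppeteer';

// Hypothetical helper sketching the 1.0.6 full-page screenshot flow.
// Assumes plain puppeteer; the package itself uses @blocklet/puppeteer.
async function screenshotFullPage(url: string): Promise<Uint8Array> {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    // deviceScaleFactor: 2 renders at 2x resolution for sharper captures.
    await page.setViewport({ width: 1440, height: 900, deviceScaleFactor: 2 });
    await page.goto(url, { timeout: 90 * 1000 });
    // Let in-flight requests settle, mirroring the 1.5 s idleTime above.
    await page.waitForNetworkIdle({ idleTime: 1.5 * 1000 });
    // Same idea as the package's new findMaxScrollHeight utility:
    // the tallest scrollable element decides the capture height.
    const maxScrollHeight = await page.evaluate(() => {
      let max = document.body.scrollHeight;
      for (const el of Array.from(document.querySelectorAll('*'))) {
        const style = window.getComputedStyle(el);
        if (
          (style.overflowY === 'auto' || style.overflowY === 'scroll') &&
          el.scrollHeight > el.clientHeight &&
          el.scrollHeight > max
        ) {
          max = el.scrollHeight;
        }
      }
      return max;
    });
    if (maxScrollHeight) {
      await page.setViewport({ width: 1440, height: maxScrollHeight, deviceScaleFactor: 2 });
    }
    return await page.screenshot({ fullPage: true, quality: 80, type: 'webp' });
  } finally {
    await browser.close();
  }
}
```

The likely motivation for growing the viewport first: page.screenshot({ fullPage: true }) captures only the document's own scroll height, so content inside an inner overflow: scroll container taller than the body would otherwise be clipped.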
package/dist/puppeteer.js CHANGED
@@ -94,7 +94,7 @@ function ensureBrowser() {
     return __awaiter(this, void 0, void 0, function* () {
         const puppeteerConfig = yield ensurePuppeteerrc();
         const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
-        config_2.logger.info('executablePath', executablePath);
+        config_2.logger.debug('Chromium executablePath', executablePath);
         if (!fs_extra_1.default.existsSync(executablePath)) {
             config_2.logger.info('start download browser', puppeteerConfig);
             const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
@@ -182,7 +182,7 @@ function launchBrowser() {
                    '--font-render-hinting=none',
                ],
            });
-            config_2.logger.info('Launch browser success');
+            config_2.logger.info('Launch browser');
        }
        catch (error) {
            config_2.logger.error('launch browser failed: ', error);
@@ -260,20 +260,20 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
        yield Promise.all(pages.map((page) => page.close()));
    }
    catch (err) {
-        config_2.logger.error('Failed to close all pages:', err);
+        config_2.logger.warn('Failed to close all pages:', err);
    }
    // close browser
    try {
        yield browser.close();
    }
    catch (err) {
-        config_2.logger.error('Failed to close browser:', err);
+        config_2.logger.warn('Failed to close browser:', err);
    }
    // clear cache
    try {
        if (trimCache) {
            yield puppeteer_1.default.trimCache();
-            config_2.logger.info('Trim cache success');
+            config_2.logger.debug('Trim cache success');
        }
        // try to clear temporary directory
        // if (puppeteerConfig) {
@@ -284,7 +284,7 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
        }
    }
    catch (err) {
-        config_2.logger.error('Failed to clear browser cache:', err);
+        config_2.logger.warn('Failed to clear browser cache:', err);
    }
    browser = null;
    clearBrowserActivatedTimer();
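These hunks only adjust log levels: the Chromium path and cache trim drop to debug, and failures during closeBrowser cleanup are demoted from error to warn, which fits their best-effort nature. The underlying shutdown pattern, with every step isolated in its own try/catch so one failure cannot abort the rest, looks roughly like this; closeBrowserSafely is a hypothetical stand-in written against plain puppeteer types:

```ts
import type { Browser } from 'puppeteer';

// Hypothetical stand-in for the closeBrowser cleanup pattern: every step
// gets its own try/catch, and failures are logged as warnings so that one
// failed step never prevents the remaining shutdown work from running.
async function closeBrowserSafely(browser: Browser): Promise<void> {
  try {
    const pages = await browser.pages();
    await Promise.all(pages.map((page) => page.close()));
  } catch (err) {
    console.warn('Failed to close all pages:', err);
  }
  try {
    await browser.close();
  } catch (err) {
    console.warn('Failed to close browser:', err);
  }
}
```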
package/dist/utils.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { Page } from '@blocklet/puppeteer';
 export declare const api: import("axios").AxiosInstance;
 export declare const sleep: (ms: number) => Promise<unknown>;
 export declare const CRAWLER_FLAG = "x-crawler";
@@ -13,3 +14,4 @@ export declare const getFullUrl: (req: any) => string;
 export declare const getRelativePath: (url: string) => string;
 export declare const formatUrl: (url: string) => string;
 export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
package/dist/utils.js CHANGED
@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.formatUrl = exports.getRelativePath = exports.getFullUrl = exports.getComponentInfo = exports.isBotUserAgent = exports.getSitemapList = exports.isAcceptCrawler = exports.getDefaultSitemapUrl = exports.getDefaultRobotsUrl = exports.isSelfCrawler = exports.CRAWLER_FLAG = exports.sleep = exports.api = void 0;
 exports.getRobots = getRobots;
 exports.md5 = md5;
+exports.findMaxScrollHeight = findMaxScrollHeight;
 const config_1 = require("@blocklet/sdk/lib/config");
 const axios_1 = __importDefault(require("axios"));
 const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
@@ -237,3 +238,23 @@ exports.formatUrl = formatUrl;
 function md5(content) {
     return (0, node_crypto_1.createHash)('md5').update(content).digest('hex');
 }
+function findMaxScrollHeight(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const maxHeightHandler = yield page.evaluateHandle(() => {
+            const elements = Array.from(document.querySelectorAll('*'));
+            let maxHeight = document.body.scrollHeight;
+            for (const el of elements) {
+                const style = window.getComputedStyle(el);
+                if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+                    if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+                        maxHeight = el.scrollHeight;
+                    }
+                }
+            }
+            return maxHeight;
+        });
+        const maxHeight = yield maxHeightHandler.jsonValue();
+        maxHeightHandler.dispose();
+        return maxHeight;
+    });
+}
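The new findMaxScrollHeight utility scans every element for a scrollable overflow-y and returns the largest scrollHeight found, falling back to document.body.scrollHeight. Because the result is a plain number, the same logic could be written with page.evaluate, which serializes primitive return values directly and avoids the evaluateHandle / jsonValue / dispose bookkeeping; a functionally equivalent sketch against plain puppeteer types:

```ts
import type { Page } from 'puppeteer';

// Functionally equivalent sketch of findMaxScrollHeight using page.evaluate.
// Primitive return values are serialized directly, so there is no JSHandle
// to dispose. Written against plain puppeteer, not @blocklet/puppeteer.
async function findMaxScrollHeight(page: Page): Promise<number> {
  return page.evaluate(() => {
    let maxHeight = document.body.scrollHeight;
    for (const el of Array.from(document.querySelectorAll('*'))) {
      const style = window.getComputedStyle(el);
      if (
        (style.overflowY === 'auto' || style.overflowY === 'scroll') &&
        el.scrollHeight > el.clientHeight &&
        el.scrollHeight > maxHeight
      ) {
        maxHeight = el.scrollHeight;
      }
    }
    return maxHeight;
  });
}
```

Note that calling getComputedStyle on every element is linear in DOM size, so on very large pages this pre-screenshot step can itself take noticeable time.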
package/esm/crawler.js CHANGED
@@ -19,7 +19,7 @@ import { config, logger } from './config';
 import { Job } from './db/job';
 import { Snapshot } from './db/snapshot';
 import { initPage } from './puppeteer';
-import { formatUrl, isAcceptCrawler, md5 } from './utils';
+import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
 const { BaseState } = require('@abtnode/models');
 let crawlQueue;
 export function createCrawlQueue() {
@@ -28,7 +28,7 @@ export function createCrawlQueue() {
         store: new SequelizeStore(db, 'crawler'),
         concurrency: 1,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-            logger.debug('job start:', job);
+            logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
             if (!canCrawl) {
                 logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
@@ -145,11 +145,11 @@ function formatHtml(htmlString) {
     }
     return htmlString;
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
     logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
     const page = yield initPage();
     if (width && height) {
-        yield page.setViewport({ width, height });
+        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
     }
     let html = null;
     let screenshot = null;
@@ -164,12 +164,24 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
         throw new Error(`Request failed with status ${statusCode}, in ${url}`);
     }
     // await for networkidle0
-    // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+    // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
     yield page.waitForNetworkIdle({
-        idleTime: 2 * 1000,
+        idleTime: 1.5 * 1000,
     });
     // get screenshot
     if (includeScreenshot) {
+        // Try to find the tallest element and set the browser to the same height
+        if (fullPage) {
+            const maxScrollHeight = yield findMaxScrollHeight(page);
+            logger.info('findMaxScrollHeight', { maxScrollHeight });
+            if (maxScrollHeight) {
+                yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+                yield page.evaluate((scrollHeight) => {
+                    window.scrollTo(0, scrollHeight || 0);
+                    document.documentElement.scrollTo(0, scrollHeight || 0);
+                }, maxScrollHeight);
+            }
+        }
         try {
             screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
         }
@@ -202,7 +214,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 });
 export function createCrawlJob(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
+        params = Object.assign(Object.assign({}, params), { id: randomUUID(), url: formatUrl(params.url) });
         // skip duplicate job
         const existsJob = yield getJob({
             url: params.url,
@@ -213,22 +225,21 @@ export function createCrawlJob(params, callback) {
             height: params.height,
             fullPage: params.fullPage,
         });
-        logger.info('create crawl job', params);
         if (existsJob) {
             logger.warn(`Crawl job already exists for ${params.url}, skip`);
             return existsJob.id;
         }
-        const jobId = randomUUID();
-        const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+        logger.info('create crawl job', params);
+        const job = crawlQueue.push(params);
         job.on('finished', ({ result }) => {
-            logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+            logger.info(`Crawl completed ${params.url}`, { job: params, result });
             callback === null || callback === void 0 ? void 0 : callback(result);
         });
         job.on('failed', ({ error }) => {
             logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
             callback === null || callback === void 0 ? void 0 : callback(null);
         });
-        return jobId;
+        return params.id;
     });
 }
 // @ts-ignore
package/esm/puppeteer.js CHANGED
@@ -50,7 +50,7 @@ export function ensureBrowser() {
     return __awaiter(this, void 0, void 0, function* () {
         const puppeteerConfig = yield ensurePuppeteerrc();
         const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
-        logger.info('executablePath', executablePath);
+        logger.debug('Chromium executablePath', executablePath);
         if (!fs.existsSync(executablePath)) {
             logger.info('start download browser', puppeteerConfig);
             const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
@@ -138,7 +138,7 @@ export function launchBrowser() {
                    '--font-render-hinting=none',
                ],
            });
-            logger.info('Launch browser success');
+            logger.info('Launch browser');
        }
        catch (error) {
            logger.error('launch browser failed: ', error);
@@ -215,20 +215,20 @@ export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0
        yield Promise.all(pages.map((page) => page.close()));
    }
    catch (err) {
-        logger.error('Failed to close all pages:', err);
+        logger.warn('Failed to close all pages:', err);
    }
    // close browser
    try {
        yield browser.close();
    }
    catch (err) {
-        logger.error('Failed to close browser:', err);
+        logger.warn('Failed to close browser:', err);
    }
    // clear cache
    try {
        if (trimCache) {
            yield puppeteer.trimCache();
-            logger.info('Trim cache success');
+            logger.debug('Trim cache success');
        }
        // try to clear temporary directory
        // if (puppeteerConfig) {
@@ -239,7 +239,7 @@ export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0
        }
    }
    catch (err) {
-        logger.error('Failed to clear browser cache:', err);
+        logger.warn('Failed to clear browser cache:', err);
    }
    browser = null;
    clearBrowserActivatedTimer();
package/esm/utils.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { Page } from '@blocklet/puppeteer';
 export declare const api: import("axios").AxiosInstance;
 export declare const sleep: (ms: number) => Promise<unknown>;
 export declare const CRAWLER_FLAG = "x-crawler";
@@ -13,3 +14,4 @@ export declare const getFullUrl: (req: any) => string;
 export declare const getRelativePath: (url: string) => string;
 export declare const formatUrl: (url: string) => string;
 export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
package/esm/utils.js CHANGED
@@ -218,3 +218,23 @@ export const formatUrl = (url) => {
 export function md5(content) {
     return createHash('md5').update(content).digest('hex');
 }
+export function findMaxScrollHeight(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const maxHeightHandler = yield page.evaluateHandle(() => {
+            const elements = Array.from(document.querySelectorAll('*'));
+            let maxHeight = document.body.scrollHeight;
+            for (const el of elements) {
+                const style = window.getComputedStyle(el);
+                if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+                    if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+                        maxHeight = el.scrollHeight;
+                    }
+                }
+            }
+            return maxHeight;
+        });
+        const maxHeight = yield maxHeightHandler.jsonValue();
+        maxHeightHandler.dispose();
+        return maxHeight;
+    });
+}
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.0.5",
+  "version": "1.0.6",
   "main": "dist/index.js",
   "module": "esm/index.js",
   "types": "dist/index.d.ts",
@@ -113,6 +113,7 @@
     "bundle": "npm run build",
     "build:cjs": "tsc -p tsconfig.cjs.json",
     "build:esm": "tsc -p tsconfig.esm.json",
-    "build": "npm run build:cjs && npm run build:esm"
+    "build": "npm run build:cjs && npm run build:esm",
+    "fix:sqlite": "cd node_modules/sqlite3 && npm run rebuild"
   }
 }
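The new fix:sqlite script (run as npm run fix:sqlite) rebuilds sqlite3's native bindings in place, presumably as a convenience for hosts where the prebuilt binary does not match the installed Node ABI.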