@arcblock/crawler 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
     immediate: !!env.preferences.cronImmediate,
     sites: env.preferences.cronSites,
     time: env.preferences.cronTime,
-    crawlConcurrency: env.preferences.crawlConcurrency,
-    sitemapConcurrency: env.preferences.sitemapConcurrency,
+    concurrency: env.preferences.concurrency,
   },
 });
 ```
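
In the README, the two concurrency knobs `crawlConcurrency` and `sitemapConcurrency` collapse into a single `concurrency` preference. A minimal sketch of the updated call, assuming the snippet's host-provided `env.preferences` object and that these fields sit under `siteCron`, as the package's `Config` declaration (next hunks) suggests:

```ts
import { initCrawler } from '@arcblock/crawler';

declare const env: { preferences: Record<string, any> }; // host-provided, as in the README

// Same call shape after the rename; `concurrency` replaces the two old knobs.
await initCrawler({
  siteCron: {
    immediate: !!env.preferences.cronImmediate,
    sites: env.preferences.cronSites,
    time: env.preferences.cronTime,
    concurrency: env.preferences.concurrency,
  },
});
```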
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-    siteCron: {
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-        crawlConcurrency: number;
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
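
These declaration hunks (evidently the package's `config.d.ts`) make `siteCron` optional, add a required top-level `concurrency`, and introduce optional global `cookies` and `localStorage` entries. A sketch of a conforming object, with illustrative values only (the two localStorage keys echo the hard-coded defaults removed from the compiled crawler below):

```ts
import type { CookieParam } from '@blocklet/puppeteer';

// Illustrative values; the shape follows the declarations above.
const crawlerOptions: {
  concurrency: number;
  cookies?: CookieParam[];
  localStorage?: { key: string; value: string }[];
} = {
  concurrency: 2,
  cookies: [
    // domain/path/expires may be omitted; the crawler fills them per target URL
    { name: 'session', value: 'abc123' },
  ],
  localStorage: [
    { key: 'blocklet_theme_prefer', value: 'light' },
    // 'now()' is replaced with the current ISO timestamp at page load
    { key: 'domain-warning-skip', value: 'now()' },
  ],
};
```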
package/lib/cjs/config.js CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    // cron
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
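
With 1.1.6 the compiled defaults no longer ship a `siteCron` block (previously enabled by default with schedule `0 0 0 * * *`, `crawlConcurrency: 2`, and `sitemapConcurrency: 30`); only `concurrency: 2` remains, so the cron paths further down run only when a caller supplies `siteCron` to `initCrawler`. The `appDir`/`cacheDir` swap is a pure reordering with no behavioral change.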
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency: config_1.config.siteCron.crawlConcurrency,
+        concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -57,18 +57,14 @@ function createCrawlQueue() {
             // } catch (error) {
             //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
             // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
-                        // for blocklet theme
-                        blocklet_theme_prefer: 'light',
-                        // for blocklet domain warning
-                        'domain-warning-skip': Date.now().toString(),
-                    } }, job));
+                const result = yield (0, exports.getPageContent)(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -84,7 +80,7 @@ function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
@@ -96,9 +92,9 @@ function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
+                config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
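
These hunks (evidently the compiled crawler module) make the queue worker build a `formattedJob` up front: package-level `cookies` and `localStorage` from `config` are concatenated ahead of the per-job values, and `formatUrl` is applied here instead of in `crawlUrl` (see the last hunk of this file). The previously hard-coded localStorage defaults (`blocklet_theme_prefer`, `domain-warning-skip`) are gone, presumably to be supplied via the new `config.localStorage` instead. In uncompiled form the construction is roughly:

```ts
type Entry = { key: string; value: string };
type CrawlJob = { url: string; cookies?: unknown[]; localStorage?: Entry[] }; // hypothetical, trimmed

// De-compiled sketch of the formattedJob construction above; `formatUrl`
// stands in for the package's own URL normalizer.
function buildFormattedJob(
  job: CrawlJob,
  config: { cookies?: unknown[]; localStorage?: Entry[] },
  formatUrl: (url: string) => string,
): CrawlJob {
  return {
    ...job,
    // global entries first, then per-job entries
    cookies: (config.cookies ?? []).concat(job.cookies ?? []),
    localStorage: (config.localStorage ?? []).concat(job.localStorage ?? []),
    url: formatUrl(job.url),
  };
}
```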
@@ -142,7 +138,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -150,13 +146,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
-    if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
-        yield page.setCookie(...cookies);
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
     }
+    // handle localStorage
     if (localStorage) {
         yield page.evaluateOnNewDocument((items) => {
-            Object.entries(items).forEach(([key, value]) => {
-                window.localStorage.setItem(key, value);
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
             });
         }, localStorage);
     }
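
`getPageContent` drops the `cookies = []` default and, when cookies are provided, fills in missing fields against the target URL: `domain` falls back to the page hostname, `path` to `/`, and a set `expires` is converted via `new Date(...).getTime()`. localStorage entries become an ordered array, with the literal value `'now()'` replaced by the current ISO timestamp when each page initializes. A standalone sketch of the cookie normalization, assuming Puppeteer's `CookieParam` shape:

```ts
import type { CookieParam } from '@blocklet/puppeteer';

// Mirrors the normalization in the hunk above (a sketch, not the shipped code).
function normalizeCookies(cookies: CookieParam[], pageUrl: string): CookieParam[] {
  const { hostname } = new URL(pageUrl);
  return cookies.map((item) => ({
    ...item,
    // per the published diff this is getTime(), i.e. a millisecond timestamp
    expires: item.expires ? new Date(item.expires).getTime() : undefined,
    domain: item.domain || hostname,
    path: item.path || '/',
  }));
}
```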
@@ -255,7 +256,6 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
         const existsJob = yield job_1.Job.isExists(params);
         if (existsJob) {
package/lib/cjs/cron.js CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
     if (cron)
         return;
+    if (!config_1.config.siteCron)
+        return;
     config_1.logger.info('Init cron', { config: config_1.config.siteCron });
     cron = cron_1.default.init({
         context: {},
@@ -29,6 +31,9 @@ function initCron() {
                 time: config_1.config.siteCron.time,
                 options: { runOnInit: config_1.config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                     for (const site of config_1.config.siteCron.sites) {
                         try {
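
Two guards accompany the now-optional `siteCron`: `initCron` returns early when no cron config exists at all, and the scheduled function re-checks `siteCron?.enabled` on every tick, so crawling can be switched off even after the cron job has been registered.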
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-type DeepPartial<T> = T extends object ? {
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
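
The `initCrawler` params type changes from `DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>` to a plain `Pick` that also exposes `cookies`, `localStorage`, and `concurrency`. Since `Pick` preserves optionality, everything in the picked set except `concurrency` stays optional, so as declared, callers are now expected to pass a `concurrency` value.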
package/lib/cjs/index.js CHANGED
@@ -50,6 +50,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
@@ -62,13 +63,14 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         (0, merge_1.default)(config_1.config, params);
         config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
             yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
             yield (0, crawler_1.createCrawlQueue)();
-            if (config_1.config.siteCron.enabled) {
+            if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield (0, cron_1.initCron)();
             }
         }
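
`initCrawler` applies caller params onto the module-level config with lodash `merge`, which is deep and mutates its first argument:

```ts
import merge from 'lodash/merge';

// merge is deep and mutates its first argument, which is how initCrawler
// layers caller params over the compiled defaults (sketch).
const config: Record<string, unknown> = { concurrency: 2, appUrl: '/' };
merge(config, { concurrency: 8, siteCron: { enabled: true } });
// config is now { concurrency: 8, appUrl: '/', siteCron: { enabled: true } }
```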
@@ -16,6 +16,7 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
+        let data = (0, cloneDeep_1.default)(snapshot);
         // format screenshot path to full url
         if (data.screenshot) {
            data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
             const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
             data.html = html.toString();
         }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
         if (columns === null || columns === void 0 ? void 0 : columns.length) {
             data = (0, pick_1.default)(data, columns);
         }
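
These hunks (evidently the compiled `services/snapshot` module) make `formatSnapshot` deep-clone the snapshot before mutating it — the previous `Object.assign` shallow copy would have let the new `delete` calls reach the stored record's nested `options` — and then strip `cookies`, `localStorage`, and `headers` from `options` so credentials are not echoed back to consumers. The scrubbing step as a standalone sketch:

```ts
import cloneDeep from 'lodash/cloneDeep';

// Hypothetical minimal shape; the real SnapshotModel carries more fields.
type SnapshotLike = {
  options?: { cookies?: unknown; localStorage?: unknown; headers?: unknown };
};

// Mirrors the sanitization added above (sketch).
function scrubSnapshot<T extends SnapshotLike>(snapshot: T): T {
  const data = cloneDeep(snapshot); // deep clone keeps the source record intact
  if (data.options) {
    delete data.options.cookies;
    delete data.options.localStorage;
    delete data.options.headers;
  }
  return data;
}
```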
package/lib/cjs/site.js CHANGED
@@ -27,6 +27,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -72,7 +73,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
+        }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
         config_1.logger.info('Enqueued jobs from sitemap finished', {
             url,
             pathname,
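
Sitemap fan-out now reads `config.siteCron?.concurrency` with a hard-coded fallback of 30 — the old `sitemapConcurrency` default — so enqueueing keeps its previous throughput even though the compiled config no longer defines it.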
@@ -14,7 +14,10 @@ export interface JobState {
     lastModified?: string;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
-    localStorage?: Record<string, string>;
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
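
In the job state (a declaration hunk for the store's job model), `localStorage` moves from `Record<string, string>` to an ordered `{ key, value }[]`, matching the new config shape and enabling the `'now()'` sentinel. Callers holding the old record shape can convert with a helper along these lines (hypothetical, not part of the package):

```ts
// Hypothetical migration helper for callers upgrading from the old shape.
function toLocalStorageEntries(record: Record<string, string>): { key: string; value: string }[] {
  return Object.entries(record).map(([key, value]) => ({ key, value }));
}

// toLocalStorageEntries({ blocklet_theme_prefer: 'light' })
// -> [{ key: 'blocklet_theme_prefer', value: 'light' }]
```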
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
         quality?: number;
         fullPage?: boolean;
         headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
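
This declaration hunk records that snapshots now persist `cookies` and `localStorage` under `options`, which is exactly why `formatSnapshot` scrubs those fields before returning data. The hunks that follow apply the same set of changes to the ESM build (`lib/esm`) and its declarations, mirroring the CJS diffs above.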
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-    siteCron: {
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-        crawlConcurrency: number;
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    // cron
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
     const db = new BaseState(Job);
     crawlQueue = createQueue({
         store: new SequelizeStore(db, 'crawler'),
-        concurrency: config.siteCron.crawlConcurrency,
+        concurrency: config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,18 +48,14 @@ export function createCrawlQueue() {
             // } catch (error) {
             //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
             // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
             try {
                 // get page content later
-                const result = yield getPageContent(Object.assign({ localStorage: {
-                        // for blocklet theme
-                        blocklet_theme_prefer: 'light',
-                        // for blocklet domain warning
-                        'domain-warning-skip': Date.now().toString(),
-                    } }, job));
+                const result = yield getPageContent(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = convertJobToSnapshot({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -75,7 +71,7 @@ export function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
@@ -87,9 +83,9 @@ export function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                logger.error(`Failed to crawl ${job.url}`, { error, job });
+                logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -133,7 +129,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -141,13 +137,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
-    if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
-        yield page.setCookie(...cookies);
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
     }
+    // handle localStorage
     if (localStorage) {
         yield page.evaluateOnNewDocument((items) => {
-            Object.entries(items).forEach(([key, value]) => {
-                window.localStorage.setItem(key, value);
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
             });
         }, localStorage);
     }
@@ -245,7 +246,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
         const existsJob = yield Job.isExists(params);
         if (existsJob) {
package/lib/esm/cron.js CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
     if (cron)
         return;
+    if (!config.siteCron)
+        return;
     logger.info('Init cron', { config: config.siteCron });
     cron = Cron.init({
         context: {},
@@ -23,6 +25,9 @@ export function initCron() {
                 time: config.siteCron.time,
                 options: { runOnInit: config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                     for (const site of config.siteCron.sites) {
                         try {
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-type DeepPartial<T> = T extends object ? {
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
 import { createCrawlQueue } from './crawler';
@@ -19,13 +20,14 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         merge(config, params);
         logger.info('Init crawler', { params, config });
         try {
             yield initDatabase();
             yield ensureBrowser();
             yield createCrawlQueue();
-            if (config.siteCron.enabled) {
+            if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield initCron();
             }
         }
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+import cloneDeep from 'lodash/cloneDeep';
 import pick from 'lodash/pick';
 import fs from 'node:fs/promises';
 import path from 'node:path';
@@ -27,7 +28,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
 }
 export function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
+        let data = cloneDeep(snapshot);
         // format screenshot path to full url
         if (data.screenshot) {
             data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +38,12 @@ export function formatSnapshot(snapshot, columns) {
             const html = yield fs.readFile(path.join(config.dataDir, data.html));
             data.html = html.toString();
         }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
         if (columns === null || columns === void 0 ? void 0 : columns.length) {
             data = pick(data, columns);
         }
package/lib/esm/site.js CHANGED
@@ -21,6 +21,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -66,7 +67,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }), { concurrency: config.siteCron.sitemapConcurrency });
+        }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
         logger.info('Enqueued jobs from sitemap finished', {
             url,
             pathname,
@@ -14,7 +14,10 @@ export interface JobState {
     lastModified?: string;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
-    localStorage?: Record<string, string>;
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
         quality?: number;
         fullPage?: boolean;
         headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.5",
+  "version": "1.1.6",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",