@arcblock/crawler 1.1.4 → 1.1.6

package/README.md CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
     immediate: !!env.preferences.cronImmediate,
     sites: env.preferences.cronSites,
     time: env.preferences.cronTime,
-    crawlConcurrency: env.preferences.crawlConcurrency,
-    sitemapConcurrency: env.preferences.sitemapConcurrency,
+    concurrency: env.preferences.concurrency,
   },
 });
 ```
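
The separate `crawlConcurrency` and `sitemapConcurrency` knobs are gone: the crawl queue now reads a top-level `concurrency`, and the sitemap fan-out reads `siteCron.concurrency`. A minimal sketch of the updated call, assuming the same `env.preferences` source of settings used in the README example:

```ts
import { initCrawler } from '@arcblock/crawler';

// `env.preferences` mirrors the README example; its shape is assumed here.
declare const env: { preferences: Record<string, any> };

await initCrawler({
  // replaces `crawlConcurrency`: workers for the crawl queue
  concurrency: env.preferences.concurrency,
  siteCron: {
    enabled: true,
    immediate: !!env.preferences.cronImmediate,
    sites: env.preferences.cronSites,
    time: env.preferences.cronTime,
    // replaces `sitemapConcurrency`: parallelism when expanding sitemaps
    concurrency: env.preferences.concurrency,
  },
});
```
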
package/lib/cjs/config.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-    siteCron: {
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-        crawlConcurrency: number;
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
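
The `Config` type gains optional global `cookies` and `localStorage` fields; as the crawler changes below show, they are merged with any per-job values before each page is rendered. An illustrative sketch of the new fields (values are made up):

```ts
import { initCrawler } from '@arcblock/crawler';
import type { CookieParam } from '@blocklet/puppeteer';

// Example values only; CookieParam follows Puppeteer's cookie shape.
const cookies: CookieParam[] = [
  { name: 'session', value: 'abc123', domain: 'example.com', path: '/' },
];

const localStorage = [
  // 'now()' is a sentinel the crawler replaces with the current ISO
  // timestamp at page load (see the getPageContent change below).
  { key: 'lastVisitedAt', value: 'now()' },
  { key: 'theme', value: 'dark' },
];

await initCrawler({ concurrency: 2, cookies, localStorage });
```
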
package/lib/cjs/config.js CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    // cron
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
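
The built-in `siteCron` defaults (enabled, daily at midnight, sitemap concurrency 30) are removed; only `concurrency: 2` remains, and `config.siteCron` is now `undefined` unless the caller provides it. To keep the old nightly behaviour, a caller must now pass the full object explicitly; a sketch using the removed default values (the site entry is illustrative):

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  concurrency: 2, // crawl queue workers (the one remaining default)
  siteCron: {
    sites: [{ url: 'https://example.com', pathname: '/sitemap.xml' }], // illustrative
    enabled: true,
    time: '0 0 0 * * *', // the old default schedule: daily at midnight
    immediate: false,
    concurrency: 30, // sitemap fan-out; site.js falls back to 30 when siteCron is absent
  },
});
```
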
package/lib/cjs/crawler.d.ts CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/cjs/crawler.js CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency: config_1.config.siteCron.crawlConcurrency,
+        concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -57,13 +57,14 @@ function createCrawlQueue() {
             // } catch (error) {
             //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
             // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(job);
+                const result = yield (0, exports.getPageContent)(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -79,7 +80,7 @@ function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
@@ -91,9 +92,9 @@ function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
+                config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -137,7 +138,7 @@ function saveSnapshotToLocal(_a) {
     };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -145,6 +146,21 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -240,7 +256,6 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
         const existsJob = yield job_1.Job.isExists(params);
         if (existsJob) {
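
Decompiled into plain TypeScript, the new page-setup logic above does two things before navigation: it normalizes cookies (defaulting `domain` to the target hostname and `path` to `/`, and coercing `expires` to a numeric timestamp) before calling `page.setCookie`, and it injects `localStorage` entries via `evaluateOnNewDocument` so they exist before any page script runs, expanding the `'now()'` sentinel to the current ISO timestamp. A sketch, assuming Puppeteer's `Page` and `CookieParam` types (the package itself uses `@blocklet/puppeteer`):

```ts
import type { CookieParam, Page } from 'puppeteer';

type StorageEntry = { key: string; value: string };

// Readable sketch of the compiled logic above, not the package's source.
async function seedPage(page: Page, url: string, cookies?: CookieParam[], storage?: StorageEntry[]) {
  if (cookies) {
    const { hostname } = new URL(url);
    const cookieParams = cookies.map((item) => ({
      ...item,
      // accept date-like values; Puppeteer expects a numeric timestamp
      expires: item.expires ? new Date(item.expires).getTime() : undefined,
      domain: item.domain || hostname, // default to the crawled host
      path: item.path || '/',
    }));
    await page.setCookie(...cookieParams);
  }
  if (storage) {
    // runs in the page context before any of the page's own scripts
    await page.evaluateOnNewDocument((items: StorageEntry[]) => {
      for (const item of items) {
        const value = item.value === 'now()' ? new Date().toISOString() : item.value;
        window.localStorage.setItem(item.key, value);
      }
    }, storage);
  }
}
```

Note also that URL formatting moved out of `crawlUrl` into the queue job (the `formattedJob` object), so the `Job.isExists` duplicate check now sees the URL as submitted rather than the formatted one.
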
package/lib/cjs/cron.js CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
     if (cron)
         return;
+    if (!config_1.config.siteCron)
+        return;
     config_1.logger.info('Init cron', { config: config_1.config.siteCron });
     cron = cron_1.default.init({
         context: {},
@@ -29,6 +31,9 @@ function initCron() {
                 time: config_1.config.siteCron.time,
                 options: { runOnInit: config_1.config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                     for (const site of config_1.config.siteCron.sites) {
                         try {
package/lib/cjs/index.d.ts CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-type DeepPartial<T> = T extends object ? {
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
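
With `DeepPartial` dropped, `initCrawler`'s params are no longer recursively optional: `siteCron` itself stays optional (it is optional on `Config`), but once present it must be complete, and top-level `concurrency` is now part of the accepted params. A type-level sketch, assuming `Config` is importable from the package root:

```ts
import type { Config } from '@arcblock/crawler'; // assumed re-export

type InitParams = Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>;

// Still fine: siteCron may be omitted entirely.
const minimal: InitParams = { concurrency: 2 };

// No longer compiles: a present siteCron must include sites, time,
// enabled, immediate and concurrency (DeepPartial allowed this in 1.1.4).
// const partial: InitParams = { concurrency: 2, siteCron: { time: '0 0 0 * * *' } };
```
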
package/lib/cjs/index.js CHANGED
@@ -50,6 +50,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
@@ -62,13 +63,14 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         (0, merge_1.default)(config_1.config, params);
         config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
             yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
             yield (0, crawler_1.createCrawlQueue)();
-            if (config_1.config.siteCron.enabled) {
+            if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield (0, cron_1.initCron)();
             }
         }
package/lib/cjs/services/snapshot.js CHANGED
@@ -16,6 +16,7 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
+        let data = (0, cloneDeep_1.default)(snapshot);
         // format screenshot path to full url
         if (data.screenshot) {
             data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
             const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
             data.html = html.toString();
         }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
         if (columns === null || columns === void 0 ? void 0 : columns.length) {
             data = (0, pick_1.default)(data, columns);
         }
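
`formatSnapshot` now deep-clones the snapshot before deleting `cookies`, `localStorage`, and `headers` from `options`. With the previous shallow copy, the nested `options` object was shared with the source record, so those deletes would have mutated it; `cloneDeep` keeps the sanitization local to the returned value. A minimal illustration of the difference:

```ts
import cloneDeep from 'lodash/cloneDeep';

type Snap = { jobId: string; options?: { headers?: Record<string, string> } };

const a: Snap = { jobId: '1', options: { headers: { authorization: 'secret' } } };
const shallow = { ...a }; // `options` is still the same object
delete shallow.options?.headers;
console.log(a.options?.headers); // undefined — the source was mutated

const b: Snap = { jobId: '2', options: { headers: { authorization: 'secret' } } };
const deep = cloneDeep(b); // fully independent copy
delete deep.options?.headers;
console.log(b.options?.headers); // { authorization: 'secret' } — intact
```
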
package/lib/cjs/site.js CHANGED
@@ -27,6 +27,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -72,7 +73,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
             includeScreenshot: false,
             includeHtml: true,
         });
-    }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
+    }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     config_1.logger.info('Enqueued jobs from sitemap finished', {
         url,
         pathname,
package/lib/cjs/store/job.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/cjs/store/snapshot.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
         quality?: number;
         fullPage?: boolean;
         headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/cjs/utils.js CHANGED
@@ -134,6 +134,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 const sleep = (ms) => {
     return new Promise((resolve) => {
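
Five more extensions (`ts`, `json`, `md`, `yml`, `yaml`) are now classed as static files, so matching URLs are skipped instead of rendered in the browser. The check itself is outside this diff; a plausible sketch of how such a list typically gates crawling (`isStaticFileUrl` is a hypothetical helper, not this package's API):

```ts
const staticFileExtensions = ['pdf', 'png', 'zip', 'ts', 'json', 'md', 'yml', 'yaml' /* ... */];

// Hypothetical helper illustrating the extension gate.
function isStaticFileUrl(url: string): boolean {
  const { pathname } = new URL(url);
  const ext = pathname.split('.').pop()?.toLowerCase() ?? '';
  return pathname.includes('.') && staticFileExtensions.includes(ext);
}

isStaticFileUrl('https://example.com/guide.md'); // true  — skipped as static
isStaticFileUrl('https://example.com/guide');    // false — rendered and crawled
```
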
package/lib/esm/config.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-    siteCron: {
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-        crawlConcurrency: number;
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    // cron
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/esm/crawler.d.ts CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/esm/crawler.js CHANGED
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
     const db = new BaseState(Job);
     crawlQueue = createQueue({
         store: new SequelizeStore(db, 'crawler'),
-        concurrency: config.siteCron.crawlConcurrency,
+        concurrency: config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,13 +48,14 @@ export function createCrawlQueue() {
             // } catch (error) {
             //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
             // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
             try {
                 // get page content later
-                const result = yield getPageContent(job);
+                const result = yield getPageContent(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = convertJobToSnapshot({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -70,7 +71,7 @@ export function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
@@ -82,9 +83,9 @@ export function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                logger.error(`Failed to crawl ${job.url}`, { error, job });
+                logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -128,7 +129,7 @@ function saveSnapshotToLocal(_a) {
     };
     });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -136,6 +137,21 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -230,7 +246,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
         const existsJob = yield Job.isExists(params);
         if (existsJob) {
package/lib/esm/cron.js CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
     if (cron)
         return;
+    if (!config.siteCron)
+        return;
     logger.info('Init cron', { config: config.siteCron });
     cron = Cron.init({
         context: {},
@@ -23,6 +25,9 @@ export function initCron() {
                 time: config.siteCron.time,
                 options: { runOnInit: config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                     for (const site of config.siteCron.sites) {
                         try {
package/lib/esm/index.d.ts CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-type DeepPartial<T> = T extends object ? {
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
 import { createCrawlQueue } from './crawler';
@@ -19,13 +20,14 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         merge(config, params);
         logger.info('Init crawler', { params, config });
         try {
             yield initDatabase();
             yield ensureBrowser();
             yield createCrawlQueue();
-            if (config.siteCron.enabled) {
+            if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield initCron();
             }
         }
package/lib/esm/services/snapshot.js CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+import cloneDeep from 'lodash/cloneDeep';
 import pick from 'lodash/pick';
 import fs from 'node:fs/promises';
 import path from 'node:path';
@@ -27,7 +28,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
 }
 export function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
+        let data = cloneDeep(snapshot);
         // format screenshot path to full url
         if (data.screenshot) {
             data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +38,12 @@ export function formatSnapshot(snapshot, columns) {
             const html = yield fs.readFile(path.join(config.dataDir, data.html));
             data.html = html.toString();
         }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
         if (columns === null || columns === void 0 ? void 0 : columns.length) {
             data = pick(data, columns);
         }
package/lib/esm/site.js CHANGED
@@ -21,6 +21,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -66,7 +67,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
             includeScreenshot: false,
             includeHtml: true,
         });
-    }), { concurrency: config.siteCron.sitemapConcurrency });
+    }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     logger.info('Enqueued jobs from sitemap finished', {
         url,
         pathname,
package/lib/esm/store/job.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/esm/store/snapshot.d.ts CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
         quality?: number;
         fullPage?: boolean;
         headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/utils.js CHANGED
@@ -123,6 +123,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 export const sleep = (ms) => {
     return new Promise((resolve) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.4",
+  "version": "1.1.6",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.43",
-    "@abtnode/models": "^1.16.43",
-    "@abtnode/queue": "^1.16.43",
-    "@blocklet/logger": "^1.16.43",
+    "@abtnode/cron": "^1.16.44",
+    "@abtnode/models": "^1.16.44",
+    "@abtnode/queue": "^1.16.44",
+    "@blocklet/logger": "^1.16.44",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.43",
+    "@blocklet/sdk": "^1.16.44",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
     "fs-extra": "^11.2.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
+    "p-map": "^7.0.3",
     "robots-parser": "^3.0.1",
     "sitemap": "^7.1.2",
     "sqlite3": "^5.1.7",
-    "ufo": "^1.5.4",
-    "p-map": "^7.0.3"
+    "ufo": "^1.5.4"
   },
   "devDependencies": {
-    "@blocklet/js-sdk": "^1.16.39",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "express": "^4.21.2",
     "bumpp": "^9.11.1",
+    "express": "^4.21.2",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",