@arcblock/crawler 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.d.ts +1 -1
- package/lib/cjs/crawler.js +24 -9
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +1 -4
- package/lib/cjs/index.js +3 -1
- package/lib/cjs/services/snapshot.js +8 -1
- package/lib/cjs/site.js +2 -1
- package/lib/cjs/store/job.d.ts +6 -0
- package/lib/cjs/store/snapshot.d.ts +6 -0
- package/lib/cjs/utils.js +5 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.d.ts +1 -1
- package/lib/esm/crawler.js +24 -9
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +1 -4
- package/lib/esm/index.js +3 -1
- package/lib/esm/services/snapshot.js +8 -1
- package/lib/esm/site.js +2 -1
- package/lib/esm/store/job.d.ts +6 -0
- package/lib/esm/store/snapshot.d.ts +6 -0
- package/lib/esm/utils.js +5 -0
- package/package.json +9 -10
package/README.md
CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
     immediate: !!env.preferences.cronImmediate,
     sites: env.preferences.cronSites,
     time: env.preferences.cronTime,
-
-    sitemapConcurrency: env.preferences.sitemapConcurrency,
+    concurrency: env.preferences.concurrency,
   },
 });
 ```
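The README hunk above reflects the renamed cron option: the per-cron `sitemapConcurrency` preference is replaced by `concurrency`, and a top-level `concurrency` now controls the crawl queue (see config.d.ts below). A minimal sketch of an updated call with illustrative values; only the option names shown in this diff are confirmed:

```ts
import { initCrawler } from '@arcblock/crawler';

// Sketch only: the values are placeholders, not package defaults.
await initCrawler({
  concurrency: 2, // crawl-queue workers (new top-level option in 1.1.6)
  siteCron: {
    enabled: true,
    immediate: false,
    sites: [], // Site entries ({ url, pathname, ... }) to crawl on the schedule
    time: '0 0 0 * * *',
    concurrency: 30, // sitemap fan-out, formerly `sitemapConcurrency`
  },
});
```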
package/lib/cjs/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/cjs/config.js
CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/cjs/crawler.d.ts
CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/cjs/crawler.js
CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency: config_1.config.
+        concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -57,13 +57,14 @@ function createCrawlQueue() {
            // } catch (error) {
            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(
+                const result = yield (0, exports.getPageContent)(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    config_1.logger.error(`failed to crawl ${
+                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -79,7 +80,7 @@ function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
@@ -91,9 +92,9 @@ function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                config_1.logger.error(`Failed to crawl ${
+                config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -137,7 +138,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -145,6 +146,21 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -240,7 +256,6 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
         const existsJob = yield job_1.Job.isExists(params);
         if (existsJob) {
package/lib/cjs/cron.js
CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
     if (cron)
         return;
+    if (!config_1.config.siteCron)
+        return;
     config_1.logger.info('Init cron', { config: config_1.config.siteCron });
     cron = cron_1.default.init({
         context: {},
@@ -29,6 +31,9 @@ function initCron() {
                 time: config_1.config.siteCron.time,
                 options: { runOnInit: config_1.config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                     for (const site of config_1.config.siteCron.sites) {
                         try {
package/lib/cjs/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
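The `initCrawler` signature drops the old `DeepPartial` helper and now accepts the new `cookies`, `localStorage`, and `concurrency` options directly; they are merged into the shared config and, per the crawler.js hunks above, concatenated onto every job. A hedged usage sketch with illustrative values:

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  concurrency: 4, // size of the crawl queue
  cookies: [{ name: 'session', value: 'example-token' }], // applied to every crawled page
  localStorage: [{ key: 'visited_at', value: 'now()' }], // 'now()' expands to the current ISO timestamp
});
```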
package/lib/cjs/index.js
CHANGED
@@ -50,6 +50,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
@@ -62,13 +63,14 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
        (0, merge_1.default)(config_1.config, params);
        config_1.logger.info('Init crawler', { params, config: config_1.config });
        try {
            yield (0, store_1.initDatabase)();
            yield (0, puppeteer_1.ensureBrowser)();
            yield (0, crawler_1.createCrawlQueue)();
-            if (config_1.config.siteCron.enabled) {
+            if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                yield (0, cron_1.initCron)();
            }
        }
package/lib/cjs/services/snapshot.js
CHANGED
@@ -16,6 +16,7 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data =
+        let data = (0, cloneDeep_1.default)(snapshot);
        // format screenshot path to full url
        if (data.screenshot) {
            data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
            const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
            data.html = html.toString();
        }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
        if (columns === null || columns === void 0 ? void 0 : columns.length) {
            data = (0, pick_1.default)(data, columns);
        }
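The snapshot service change has a privacy angle: because jobs can now carry cookies, localStorage seeds, and headers, `formatSnapshot` deep-clones the record and strips those fields from the returned `options` before handing data to callers, without mutating the stored row. A rough TypeScript sketch of that addition (the `scrubSnapshot` helper name is hypothetical; `options` is the field the compiled hunk touches):

```ts
import cloneDeep from 'lodash/cloneDeep';

// Hypothetical helper illustrating the new clone-then-scrub step in formatSnapshot.
function scrubSnapshot<T extends { options?: Record<string, unknown> }>(snapshot: T): T {
  const data = cloneDeep(snapshot); // work on a copy so the persisted snapshot stays untouched
  if (data.options) {
    delete data.options.cookies;
    delete data.options.localStorage;
    delete data.options.headers;
  }
  return data;
}
```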
package/lib/cjs/site.js
CHANGED
@@ -27,6 +27,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -72,7 +73,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
            includeScreenshot: false,
            includeHtml: true,
        });
-    }), { concurrency: config_1.config.siteCron.
+    }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     config_1.logger.info('Enqueued jobs from sitemap finished', {
        url,
        pathname,
package/lib/cjs/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
        quality?: number;
        fullPage?: boolean;
        headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/cjs/utils.js
CHANGED
package/lib/esm/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js
CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/esm/crawler.d.ts
CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/esm/crawler.js
CHANGED
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
     const db = new BaseState(Job);
     crawlQueue = createQueue({
         store: new SequelizeStore(db, 'crawler'),
-        concurrency: config.
+        concurrency: config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,13 +48,14 @@ export function createCrawlQueue() {
            // } catch (error) {
            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
             try {
                 // get page content later
-                const result = yield getPageContent(
+                const result = yield getPageContent(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    logger.error(`failed to crawl ${
+                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = convertJobToSnapshot({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -70,7 +71,7 @@ export function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
@@ -82,9 +83,9 @@ export function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                logger.error(`Failed to crawl ${
+                logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -128,7 +129,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -136,6 +137,21 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -230,7 +246,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
        // skip duplicate job
        const existsJob = yield Job.isExists(params);
        if (existsJob) {
package/lib/esm/cron.js
CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
     if (cron)
         return;
+    if (!config.siteCron)
+        return;
     logger.info('Init cron', { config: config.siteCron });
     cron = Cron.init({
         context: {},
@@ -23,6 +25,9 @@ export function initCron() {
                 time: config.siteCron.time,
                 options: { runOnInit: config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                     for (const site of config.siteCron.sites) {
                         try {
package/lib/esm/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
 import { createCrawlQueue } from './crawler';
@@ -19,13 +20,14 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
        merge(config, params);
        logger.info('Init crawler', { params, config });
        try {
            yield initDatabase();
            yield ensureBrowser();
            yield createCrawlQueue();
-            if (config.siteCron.enabled) {
+            if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                yield initCron();
            }
        }
package/lib/esm/services/snapshot.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+import cloneDeep from 'lodash/cloneDeep';
 import pick from 'lodash/pick';
 import fs from 'node:fs/promises';
 import path from 'node:path';
@@ -27,7 +28,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
 }
 export function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data =
+        let data = cloneDeep(snapshot);
        // format screenshot path to full url
        if (data.screenshot) {
            data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +38,12 @@ export function formatSnapshot(snapshot, columns) {
            const html = yield fs.readFile(path.join(config.dataDir, data.html));
            data.html = html.toString();
        }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
        if (columns === null || columns === void 0 ? void 0 : columns.length) {
            data = pick(data, columns);
        }
package/lib/esm/site.js
CHANGED
@@ -21,6 +21,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -66,7 +67,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
            includeScreenshot: false,
            includeHtml: true,
        });
-    }), { concurrency: config.siteCron.
+    }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     logger.info('Enqueued jobs from sitemap finished', {
        url,
        pathname,
package/lib/esm/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/esm/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
        quality?: number;
        fullPage?: boolean;
        headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/utils.js
CHANGED
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.4",
+  "version": "1.1.6",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.
-    "@abtnode/models": "^1.16.
-    "@abtnode/queue": "^1.16.
-    "@blocklet/logger": "^1.16.
+    "@abtnode/cron": "^1.16.44",
+    "@abtnode/models": "^1.16.44",
+    "@abtnode/queue": "^1.16.44",
+    "@blocklet/logger": "^1.16.44",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.
+    "@blocklet/sdk": "^1.16.44",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
     "fs-extra": "^11.2.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
+    "p-map": "^7.0.3",
     "robots-parser": "^3.0.1",
     "sitemap": "^7.1.2",
     "sqlite3": "^5.1.7",
-    "ufo": "^1.5.4"
-    "p-map": "^7.0.3"
+    "ufo": "^1.5.4"
   },
   "devDependencies": {
-    "@blocklet/js-sdk": "^1.16.39",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "express": "^4.21.2",
     "bumpp": "^9.11.1",
+    "express": "^4.21.2",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",