@arcblock/crawler 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -0
- package/lib/cjs/config.d.ts +4 -2
- package/lib/cjs/config.js +7 -5
- package/lib/cjs/crawler.d.ts +4 -0
- package/lib/cjs/crawler.js +31 -17
- package/lib/cjs/cron.js +1 -1
- package/lib/cjs/index.js +4 -2
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +23 -20
- package/lib/cjs/store/index.js +1 -1
- package/lib/cjs/store/snapshot.d.ts +5 -0
- package/lib/cjs/store/snapshot.js +4 -0
- package/lib/esm/config.d.ts +4 -2
- package/lib/esm/config.js +7 -5
- package/lib/esm/crawler.d.ts +4 -0
- package/lib/esm/crawler.js +31 -17
- package/lib/esm/cron.js +1 -1
- package/lib/esm/index.js +4 -2
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +23 -20
- package/lib/esm/store/index.js +1 -1
- package/lib/esm/store/snapshot.d.ts +5 -0
- package/lib/esm/store/snapshot.js +4 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -1 +1,66 @@
 # @arcblock/crawler
+
+A crawler module designed for Blocklets. It supports batch crawling of HTML, webpage screenshots, titles, descriptions, and more, based on a URL or sitemap.
+
+## Usage
+
+```typescript
+import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';
+
+await initCrawler();
+
+// Asynchronously crawl a page
+const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeScreenshot: true, includeHtml: true });
+
+// Get the crawling result (you need to wait for the crawler to finish)
+const snapshot = await getSnapshot(jobId);
+```
+
+### initCrawler
+
+Initializes the crawler.
+
+### crawlUrl
+
+Crawls the specified page.
+
+### getSnapshot
+
+Gets the crawling result by `jobId`.
+
+### getLatestSnapshot
+
+Gets the latest crawling result by URL.
+
+## Schedule
+
+Passing `siteCron` to `initCrawler` enables a scheduled task that periodically crawls all pages of the specified websites based on their sitemaps.
+
+```typescript
+await initCrawler({
+  siteCron: {
+    enabled: !!env.preferences.cronEnabled,
+    immediate: !!env.preferences.cronImmediate,
+    sites: env.preferences.cronSites,
+    time: env.preferences.cronTime,
+    crawlConcurrency: env.preferences.crawlConcurrency,
+    sitemapConcurrency: env.preferences.sitemapConcurrency,
+  },
+});
+```
+
+## Environment Variables
+
+- `PUPPETEER_EXECUTABLE_PATH`: The executable path for Puppeteer. This variable is not required when running inside the `arcblock/snap-kit` Docker image. For local development, you can set it to the Chrome path: `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`
+
+If the module is not used from a Blocklet, some Blocklet environment variables it depends on must be provided:
+
+- `BLOCKLET_CACHE_DIR`: (Optional) The directory where Puppeteer is installed automatically when `PUPPETEER_EXECUTABLE_PATH` is not provided. Defaults to `process.cwd()`.
+
+- `BLOCKLET_APP_URL`: (Optional) The domain prefix for screenshots. Defaults to `/`.
+
+- `BLOCKLET_DATA_DIR`: (Required) The directory where webpage screenshots and HTML source files captured by the crawler are saved.
+
+## SQLite
+
+When `initCrawler` is called, it attempts to create an SQLite database in `BLOCKLET_DATA_DIR`. The database is used to cache HTML content and screenshots, so make sure the deployment environment supports SQLite.
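`getLatestSnapshot` is documented above but not shown in code. A minimal usage sketch, assuming it accepts the page URL and resolves to the most recent stored snapshot (or a null-ish value when the page has never been crawled); the `meta` fields come from the snapshot changes later in this diff:

```typescript
import { getLatestSnapshot, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Assumption: getLatestSnapshot takes the page URL and returns the latest snapshot record.
const latest = await getLatestSnapshot('https://www.arcblock.io');
if (latest) {
  // meta.title / meta.description are newly persisted in 1.1.2
  console.log(latest.meta?.title, latest.meta?.description);
}
```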
package/lib/cjs/config.d.ts
CHANGED
@@ -14,8 +14,10 @@ export type Config = {
     siteCron: {
         sites: Site[];
         time: string;
-
-
+        enabled: boolean;
+        immediate: boolean;
+        crawlConcurrency: number;
+        sitemapConcurrency: number;
     };
 };
 export declare const logger: any;
package/lib/cjs/config.js
CHANGED
@@ -10,14 +10,16 @@ exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
     appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
-
+    cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    cacheDir: process.env.BLOCKLET_CACHE_DIR,
     // cron
     siteCron: {
         sites: [],
-
-
-
+        enabled: true,
+        time: '0 0 0 * * *',
+        immediate: false,
+        crawlConcurrency: 2,
+        sitemapConcurrency: 30,
     },
 };
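The new defaults above amount to: cron enabled, a daily run at midnight (`'0 0 0 * * *'`), no immediate run on startup, 2 concurrent page crawls, and 30 concurrent sitemap entries. A minimal sketch of overriding them through `initCrawler` (the `sites` entry shape `{ url, pathname, interval }` follows the `Site` type used in `site.d.ts`; the concrete values are illustrative):

```typescript
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  siteCron: {
    enabled: true,
    immediate: true, // also run one crawl pass right after init (wired to the cron's runOnInit)
    sites: [
      // interval is in days: pages crawled within the last `interval` days are skipped
      { url: 'https://www.arcblock.io', pathname: '/blog', interval: 7 },
    ],
    time: '0 0 3 * * *', // 03:00 every day instead of midnight
    crawlConcurrency: 2,
    sitemapConcurrency: 10,
  },
});
```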
package/lib/cjs/crawler.d.ts
CHANGED
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
 }) => Promise<{
     html: string;
     screenshot: Uint8Array<ArrayBufferLike> | null;
+    meta: {
+        title?: string;
+        description?: string;
+    };
 }>;
 /**
  * crawl url and return job id
package/lib/cjs/crawler.js
CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency:
+        concurrency: config_1.config.siteCron.crawlConcurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -84,6 +84,7 @@
                     status: 'success',
                     screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
                     html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                    meta: result.meta,
                 },
             });
             yield snapshot_2.Snapshot.upsert(snapshot);
@@ -150,6 +151,7 @@
     }
     let html = null;
     let screenshot = null;
+    const meta = {};
     try {
         const response = yield page.goto(url, { timeout });
         if (!response) {
@@ -170,7 +172,7 @@
         // Try to find the tallest element and set the browser to the same height
         if (fullPage) {
             const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
-            config_1.logger.
+            config_1.logger.debug('findMaxScrollHeight', { maxScrollHeight });
             if (maxScrollHeight) {
                 yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
                 yield page.evaluate((scrollHeight) => {
@@ -188,22 +190,33 @@
             }
         }
         // get html
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try {
+            const data = yield page.evaluate(() => {
+                var _a;
+                // add meta tag to record crawler
+                const meta = document.createElement('meta');
+                meta.name = 'arcblock-crawler';
+                meta.content = 'true';
+                document.head.appendChild(meta);
+                // get title and meta description
+                const title = document.title || '';
+                const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                return {
+                    html: document.documentElement.outerHTML,
+                    title,
+                    description,
+                };
+            });
+            meta.title = data.title;
+            meta.description = data.description;
+            if (includeHtml) {
+                html = data.html;
             }
         }
+        catch (err) {
+            config_1.logger.error('Failed to get html:', err);
+            throw err;
+        }
     }
     catch (error) {
         config_1.logger.error('Failed to get page content:', error);
@@ -216,6 +229,7 @@
     return {
         html,
         screenshot,
+        meta,
     };
 });
 exports.getPageContent = getPageContent;
@@ -238,7 +252,7 @@ function crawlUrl(params, callback) {
         fullPage: params.fullPage,
     })) || {};
     if (duplicateJob) {
-        config_1.logger.
+        config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
         return duplicateJob.id;
     }
     config_1.logger.info('create crawl job', params);
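The compiled `page.evaluate` block added above is easier to follow in source form. A rough sketch of what it corresponds to (reconstructed from the compiled output; the inner variable name is changed for readability):

```typescript
// Inside getPageContent, after the page has rendered.
const data = await page.evaluate(() => {
  // add meta tag to record crawler: mark the rendered document so it can be
  // recognized later as crawler output
  const marker = document.createElement('meta');
  marker.name = 'arcblock-crawler';
  marker.content = 'true';
  document.head.appendChild(marker);

  // get title and meta description alongside the serialized HTML
  return {
    html: document.documentElement.outerHTML,
    title: document.title || '',
    description: document.querySelector('meta[name="description"]')?.getAttribute('content') || '',
  };
});

meta.title = data.title;
meta.description = data.description;
if (includeHtml) {
  html = data.html;
}
```

If this block throws, the error is logged and re-thrown to the surrounding error handler in `getPageContent`.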
package/lib/cjs/cron.js
CHANGED
@@ -27,7 +27,7 @@ function initCron() {
         {
             name: 'crawl-site',
             time: config_1.config.siteCron.time,
-            options: { runOnInit: config_1.config.siteCron.
+            options: { runOnInit: config_1.config.siteCron.immediate },
             fn: () => __awaiter(this, void 0, void 0, function* () {
                 config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                 for (const site of config_1.config.siteCron.sites) {
package/lib/cjs/index.js
CHANGED
@@ -62,13 +62,15 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
-        config_1.logger.info('Init crawler', { params });
         (0, merge_1.default)(config_1.config, params);
+        config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
             yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
             yield (0, crawler_1.createCrawlQueue)();
-
+            if (config_1.config.siteCron.enabled) {
+                yield (0, cron_1.initCron)();
+            }
         }
         catch (err) {
             config_1.logger.error('Init crawler error', { err });
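With this change the site cron is opt-out: it is only registered when `siteCron.enabled` is truthy (the default shipped in `config.js` is `true`). A minimal sketch of initializing the crawler without the scheduled crawl, assuming `initCrawler` accepts a partial config as the `merge` call suggests:

```typescript
import { initCrawler } from '@arcblock/crawler';

// Database, browser and crawl queue are still initialized; no cron job is registered.
await initCrawler({ siteCron: { enabled: false } });
```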
package/lib/cjs/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/cjs/site.js
CHANGED
@@ -28,6 +28,11 @@ function parseSitemapUrl(sitemapItem) {
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
+    const key = `${url}-${pathname}`;
+    if (crawlBlockletRunningMap.has(key)) {
+        config_1.logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+        return [];
+    }
     const sitemapList = yield (0, utils_1.getSitemapList)(url);
     const pathnameRegex = new RegExp(pathname);
     const sitemapItems = sitemapList
@@ -36,33 +41,31 @@
         return parseSitemapUrl(sitemapItem);
     });
     config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
-
-
-        if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
-            const lastModified = new Date(snapshot.lastModified);
-            // skip if snapshot lastModified is greater than sitemap lastmod
-            if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
-                return null;
-            }
-            // skip if interval time has not been reached
-            if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
-                return null;
-            }
-        }
-        return { url, sitemapItem };
-    })))).filter(Boolean);
-    config_1.logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
-    const key = `${url}-${pathname}`;
-    crawlBlockletRunningMap.set(key, crawlableItems);
+    let processCount = 0;
+    crawlBlockletRunningMap.set(key, true);
     try {
-        const jobIds = yield (0, p_map_1.default)(
+        const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+            processCount++;
+            const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
+            if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+                const lastModified = new Date(snapshot.lastModified);
+                // skip if snapshot lastModified is greater than sitemap lastmod
+                if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+                    return null;
+                }
+                // skip if interval time has not been reached
+                if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+                    return null;
+                }
+            }
+            config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
             return (0, crawler_1.crawlUrl)({
                 url,
                 lastModified: sitemapItem.lastmod,
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }, { concurrency: config_1.config.siteCron.
+        }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
         return jobIds;
     }
     catch (error) {
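`crawlSite` now also guards against overlapping runs: a second call for the same `url`/`pathname` combination returns an empty array while the first is still in progress, and skipped sitemap entries resolve to `null` (hence the widened return type in `site.d.ts`). A usage sketch, assuming `crawlSite` is re-exported from the package entry point:

```typescript
import { crawlSite, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Crawl every sitemap entry whose path matches /blog, skipping pages
// that already have a snapshot newer than 7 days (interval is in days).
const jobIds = await crawlSite({ url: 'https://www.arcblock.io', pathname: '/blog', interval: 7 });

// Drop the null/undefined entries for pages that were skipped.
console.log(jobIds.filter(Boolean));
```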
package/lib/cjs/store/index.js
CHANGED
@@ -45,7 +45,7 @@ function initDatabase() {
             sequelize.query('pragma journal_size_limit = 67108864;'),
         ]);
         yield sequelize.authenticate();
-        yield sequelize.sync();
+        yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
         config_1.logger.info('Successfully connected to database');
     }
     catch (error) {
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -7,6 +7,10 @@ export interface SnapshotModel {
     screenshot?: string | null;
     error?: string;
     lastModified?: string;
+    meta?: {
+        title?: string;
+        description?: string;
+    };
     options?: {
         width?: number;
         height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
     screenshot?: SnapshotModel['screenshot'];
     error?: SnapshotModel['error'];
     lastModified?: SnapshotModel['lastModified'];
+    meta?: SnapshotModel['meta'];
     options: SnapshotModel['options'];
     static initModel(sequelize: Sequelize): typeof Snapshot;
     static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
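Two practical notes on this store change: `sequelize.sync()` does not alter existing tables, so a database created by 1.1.1 only gains the new `meta` column when the process is started with `ALTER_SQLITE=true`; and snapshots read back through the service layer can now carry `meta.title` / `meta.description`. A minimal sketch (the environment variable is exactly the one checked above; the read side assumes `getSnapshot` resolves to a `SnapshotModel`):

```typescript
import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';

// For databases created before 1.1.2, opt in to the schema migration once:
//   ALTER_SQLITE=true node app.js
await initCrawler();

const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeHtml: true, includeScreenshot: false });

// Later, once the crawl job has finished:
if (jobId) {
  const snapshot = await getSnapshot(jobId);
  console.log(snapshot?.meta?.title, snapshot?.meta?.description);
}
```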
package/lib/esm/config.d.ts
CHANGED
@@ -14,8 +14,10 @@ export type Config = {
     siteCron: {
         sites: Site[];
         time: string;
-
-
+        enabled: boolean;
+        immediate: boolean;
+        crawlConcurrency: number;
+        sitemapConcurrency: number;
     };
 };
 export declare const logger: any;
|
package/lib/esm/config.js
CHANGED
|
@@ -4,14 +4,16 @@ export const config = {
|
|
|
4
4
|
isProd: process.env.NODE_ENV === 'production',
|
|
5
5
|
dataDir: process.env.BLOCKLET_DATA_DIR,
|
|
6
6
|
appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
|
|
7
|
-
|
|
7
|
+
cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
|
|
8
|
+
appUrl: process.env.BLOCKLET_APP_URL || '/',
|
|
8
9
|
puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
|
|
9
|
-
cacheDir: process.env.BLOCKLET_CACHE_DIR,
|
|
10
10
|
// cron
|
|
11
11
|
siteCron: {
|
|
12
12
|
sites: [],
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
enabled: true,
|
|
14
|
+
time: '0 0 0 * * *',
|
|
15
|
+
immediate: false,
|
|
16
|
+
crawlConcurrency: 2,
|
|
17
|
+
sitemapConcurrency: 30,
|
|
16
18
|
},
|
|
17
19
|
};
|
package/lib/esm/crawler.d.ts
CHANGED
|
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
|
|
|
17
17
|
}) => Promise<{
|
|
18
18
|
html: string;
|
|
19
19
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
20
|
+
meta: {
|
|
21
|
+
title?: string;
|
|
22
|
+
description?: string;
|
|
23
|
+
};
|
|
20
24
|
}>;
|
|
21
25
|
/**
|
|
22
26
|
* crawl url and return job id
|
package/lib/esm/crawler.js
CHANGED
|
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
|
|
|
24
24
|
const db = new BaseState(Job);
|
|
25
25
|
crawlQueue = createQueue({
|
|
26
26
|
store: new SequelizeStore(db, 'crawler'),
|
|
27
|
-
concurrency:
|
|
27
|
+
concurrency: config.siteCron.crawlConcurrency,
|
|
28
28
|
onJob: (job) => __awaiter(this, void 0, void 0, function* () {
|
|
29
29
|
logger.info('Starting to execute crawl job', job);
|
|
30
30
|
const canCrawl = yield isAcceptCrawler(job.url);
|
|
@@ -75,6 +75,7 @@ export function createCrawlQueue() {
|
|
|
75
75
|
status: 'success',
|
|
76
76
|
screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
|
|
77
77
|
html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
|
|
78
|
+
meta: result.meta,
|
|
78
79
|
},
|
|
79
80
|
});
|
|
80
81
|
yield Snapshot.upsert(snapshot);
|
|
@@ -141,6 +142,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
141
142
|
}
|
|
142
143
|
let html = null;
|
|
143
144
|
let screenshot = null;
|
|
145
|
+
const meta = {};
|
|
144
146
|
try {
|
|
145
147
|
const response = yield page.goto(url, { timeout });
|
|
146
148
|
if (!response) {
|
|
@@ -161,7 +163,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
161
163
|
// Try to find the tallest element and set the browser to the same height
|
|
162
164
|
if (fullPage) {
|
|
163
165
|
const maxScrollHeight = yield findMaxScrollHeight(page);
|
|
164
|
-
logger.
|
|
166
|
+
logger.debug('findMaxScrollHeight', { maxScrollHeight });
|
|
165
167
|
if (maxScrollHeight) {
|
|
166
168
|
yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
|
|
167
169
|
yield page.evaluate((scrollHeight) => {
|
|
@@ -179,22 +181,33 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
179
181
|
}
|
|
180
182
|
}
|
|
181
183
|
// get html
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
184
|
+
try {
|
|
185
|
+
const data = yield page.evaluate(() => {
|
|
186
|
+
var _a;
|
|
187
|
+
// add meta tag to record crawler
|
|
188
|
+
const meta = document.createElement('meta');
|
|
189
|
+
meta.name = 'arcblock-crawler';
|
|
190
|
+
meta.content = 'true';
|
|
191
|
+
document.head.appendChild(meta);
|
|
192
|
+
// get title and meta description
|
|
193
|
+
const title = document.title || '';
|
|
194
|
+
const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
|
|
195
|
+
return {
|
|
196
|
+
html: document.documentElement.outerHTML,
|
|
197
|
+
title,
|
|
198
|
+
description,
|
|
199
|
+
};
|
|
200
|
+
});
|
|
201
|
+
meta.title = data.title;
|
|
202
|
+
meta.description = data.description;
|
|
203
|
+
if (includeHtml) {
|
|
204
|
+
html = data.html;
|
|
196
205
|
}
|
|
197
206
|
}
|
|
207
|
+
catch (err) {
|
|
208
|
+
logger.error('Failed to get html:', err);
|
|
209
|
+
throw err;
|
|
210
|
+
}
|
|
198
211
|
}
|
|
199
212
|
catch (error) {
|
|
200
213
|
logger.error('Failed to get page content:', error);
|
|
@@ -207,6 +220,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
207
220
|
return {
|
|
208
221
|
html,
|
|
209
222
|
screenshot,
|
|
223
|
+
meta,
|
|
210
224
|
};
|
|
211
225
|
});
|
|
212
226
|
/**
|
|
@@ -228,7 +242,7 @@ export function crawlUrl(params, callback) {
|
|
|
228
242
|
fullPage: params.fullPage,
|
|
229
243
|
})) || {};
|
|
230
244
|
if (duplicateJob) {
|
|
231
|
-
logger.
|
|
245
|
+
logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
232
246
|
return duplicateJob.id;
|
|
233
247
|
}
|
|
234
248
|
logger.info('create crawl job', params);
|
package/lib/esm/cron.js
CHANGED
@@ -21,7 +21,7 @@ export function initCron() {
         {
             name: 'crawl-site',
             time: config.siteCron.time,
-            options: { runOnInit: config.siteCron.
+            options: { runOnInit: config.siteCron.immediate },
             fn: () => __awaiter(this, void 0, void 0, function* () {
                 logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                 for (const site of config.siteCron.sites) {
package/lib/esm/index.js
CHANGED
@@ -19,13 +19,15 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
-        logger.info('Init crawler', { params });
         merge(config, params);
+        logger.info('Init crawler', { params, config });
         try {
             yield initDatabase();
             yield ensureBrowser();
             yield createCrawlQueue();
-
+            if (config.siteCron.enabled) {
+                yield initCron();
+            }
         }
         catch (err) {
             logger.error('Init crawler error', { err });
package/lib/esm/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/esm/site.js
CHANGED
@@ -22,6 +22,11 @@ function parseSitemapUrl(sitemapItem) {
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
+    const key = `${url}-${pathname}`;
+    if (crawlBlockletRunningMap.has(key)) {
+        logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+        return [];
+    }
     const sitemapList = yield getSitemapList(url);
     const pathnameRegex = new RegExp(pathname);
     const sitemapItems = sitemapList
@@ -30,33 +35,31 @@
         return parseSitemapUrl(sitemapItem);
     });
     logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
-
-
-        if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
-            const lastModified = new Date(snapshot.lastModified);
-            // skip if snapshot lastModified is greater than sitemap lastmod
-            if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
-                return null;
-            }
-            // skip if interval time has not been reached
-            if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
-                return null;
-            }
-        }
-        return { url, sitemapItem };
-    })))).filter(Boolean);
-    logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
-    const key = `${url}-${pathname}`;
-    crawlBlockletRunningMap.set(key, crawlableItems);
+    let processCount = 0;
+    crawlBlockletRunningMap.set(key, true);
     try {
-        const jobIds = yield pMap(
+        const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+            processCount++;
+            const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
+            if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+                const lastModified = new Date(snapshot.lastModified);
+                // skip if snapshot lastModified is greater than sitemap lastmod
+                if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+                    return null;
+                }
+                // skip if interval time has not been reached
+                if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+                    return null;
+                }
+            }
+            logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
             return crawlUrl({
                 url,
                 lastModified: sitemapItem.lastmod,
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }, { concurrency: config.siteCron.
+        }), { concurrency: config.siteCron.sitemapConcurrency });
        return jobIds;
     }
     catch (error) {
package/lib/esm/store/index.js
CHANGED
@@ -39,7 +39,7 @@ export function initDatabase() {
             sequelize.query('pragma journal_size_limit = 67108864;'),
         ]);
         yield sequelize.authenticate();
-        yield sequelize.sync();
+        yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
         logger.info('Successfully connected to database');
     }
     catch (error) {
package/lib/esm/store/snapshot.d.ts
CHANGED
@@ -7,6 +7,10 @@ export interface SnapshotModel {
     screenshot?: string | null;
     error?: string;
     lastModified?: string;
+    meta?: {
+        title?: string;
+        description?: string;
+    };
     options?: {
         width?: number;
         height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
     screenshot?: SnapshotModel['screenshot'];
     error?: SnapshotModel['error'];
     lastModified?: SnapshotModel['lastModified'];
+    meta?: SnapshotModel['meta'];
     options: SnapshotModel['options'];
     static initModel(sequelize: Sequelize): typeof Snapshot;
     static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;