@arcblock/crawler 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +56 -113
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +88 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +48 -102
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +26 -37
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +84 -71
- package/package.json +22 -33
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -15
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -15
@@ -0,0 +1,68 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Snapshot = void 0;
+const core_1 = require("@sequelize/core");
+class Snapshot extends core_1.Model {
+    static initModel(sequelize) {
+        return Snapshot.init({
+            jobId: {
+                type: core_1.DataTypes.STRING,
+                primaryKey: true,
+                allowNull: false,
+            },
+            url: {
+                type: core_1.DataTypes.STRING,
+                allowNull: false,
+                index: true,
+            },
+            status: {
+                type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+                allowNull: false,
+            },
+            html: {
+                type: core_1.DataTypes.TEXT,
+                allowNull: true,
+            },
+            screenshot: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            error: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            lastModified: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            options: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+        }, {
+            sequelize,
+            modelName: 'snapshot',
+            tableName: 'snap',
+            timestamps: true,
+        });
+    }
+    static findSnapshot(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+                    ['lastModified', 'DESC'],
+                    ['updatedAt', 'DESC'],
+                ] }, condition));
+            return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+        });
+    }
+}
+exports.Snapshot = Snapshot;
@@ -0,0 +1,32 @@
+import { Page } from '@blocklet/puppeteer';
+import { Request } from 'express';
+export declare const axios: import("axios").AxiosInstance;
+export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+export declare const sleep: (ms: number) => Promise<unknown>;
+/**
+ * Check if the request is a arcblock crawler
+ */
+export declare const isSelfCrawler: (req: Request) => boolean;
+/**
+ * Check if the request is a static file
+ */
+export declare function isStaticFile(req: Request): boolean;
+/**
+ * Check if the request is a spider
+ */
+export declare function isSpider(req: Request): boolean;
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+export declare const formatUrl: (url: string) => string;
+export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
@@ -12,10 +12,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.formatUrl = exports.
+exports.formatUrl = exports.getSitemapList = exports.isAcceptCrawler = exports.isSelfCrawler = exports.sleep = exports.CRAWLER_FLAG = exports.axios = void 0;
+exports.isStaticFile = isStaticFile;
+exports.isSpider = isSpider;
 exports.getRobots = getRobots;
 exports.md5 = md5;
-
+exports.findMaxScrollHeight = findMaxScrollHeight;
 const axios_1 = __importDefault(require("axios"));
 const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
 const uniq_1 = __importDefault(require("lodash/uniq"));
@@ -24,24 +26,14 @@ const robots_parser_1 = __importDefault(require("robots-parser"));
 const sitemap_1 = require("sitemap");
 const stream_1 = require("stream");
 const ufo_1 = require("ufo");
-
-
+const config_1 = require("./config");
+exports.axios = axios_1.default.create({
+    timeout: 1000 * 30,
     headers: {
         'Content-Type': 'application/json',
     },
 });
-
-    return new Promise((resolve) => {
-        setTimeout(resolve, ms);
-    });
-};
-exports.sleep = sleep;
-exports.CRAWLER_FLAG = 'x-crawler';
-const isSelfCrawler = (req) => {
-    const ua = req.get('user-agent') || '';
-    return req.get(exports.CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
-};
-exports.isSelfCrawler = isSelfCrawler;
+exports.CRAWLER_FLAG = 'x-arcblock-crawler';
 /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.
@@ -97,12 +89,8 @@ const botUserAgents = [
     /AlibabaGroup/i,
     /adaptive-edge-crawler/i,
 ];
-const isSpider = (ua) => botUserAgents.some((spider) => {
-    return spider.test(ua);
-});
 /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
 const staticFileExtensions = [
     'ai',
@@ -147,89 +135,91 @@ const staticFileExtensions = [
     'xml',
     'zip',
 ];
-const
-
-
+const sleep = (ms) => {
+    return new Promise((resolve) => {
+        setTimeout(resolve, ms);
+    });
 };
-exports.
+exports.sleep = sleep;
+/**
+ * Check if the request is a arcblock crawler
+ */
+const isSelfCrawler = (req) => {
+    const ua = req.get('user-agent') || '';
+    return req.get(exports.CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
+};
+exports.isSelfCrawler = isSelfCrawler;
+/**
+ * Check if the request is a static file
+ */
+function isStaticFile(req) {
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+    return excludeUrlPattern.test(req.path);
+}
+/**
+ * Check if the request is a spider
+ */
+function isSpider(req) {
+    const ua = req.get('user-agent') || '';
+    return botUserAgents.some((spider) => spider.test(ua));
+}
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
 function getRobots(url) {
     return __awaiter(this, void 0, void 0, function* () {
         const { origin } = new URL(url);
         const robotsUrl = (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
-        const { data } = yield exports.
-
-
+        const { data } = yield exports.axios.get(robotsUrl).catch((error) => {
+            config_1.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+            return { data: null };
+        });
         return data ? (0, robots_parser_1.default)(robotsUrl, data) : null;
     });
 }
-
-
-
-};
-exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
 const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
     const robots = yield getRobots(url);
     const isAllowed = robots ? yield robots.isAllowed(url) : true;
     return isAllowed;
 });
 exports.isAcceptCrawler = isAcceptCrawler;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
 const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
-    let sitemapUrlList = [
+    let sitemapUrlList = [];
     const robots = yield getRobots(url);
     if (robots) {
-
-
-
-    }
+        sitemapUrlList = (yield robots.getSitemaps()) || [];
+    }
+    if (!sitemapUrlList.length) {
+        const { origin } = new URL(url);
+        sitemapUrlList.push((0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1'));
     }
     // loop site map url list
     const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
-
-
-
-
-
-
-
-
-
-
+        sitemapUrl = (0, ufo_1.withQuery)(sitemapUrl, { nocache: '1' });
+        try {
+            const { data: sitemapTxt } = yield exports.axios.get(sitemapUrl).catch(() => ({
+                data: '',
+            }));
+            if (sitemapTxt) {
+                const stream = stream_1.Readable.from([sitemapTxt]);
+                const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
+                return sitemapJson;
+            }
+        }
+        catch (error) {
+            config_1.logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
         }
         return [];
     })));
     return (0, uniq_1.default)((0, flattenDeep_1.default)(sitemapList.filter(Boolean)));
 });
 exports.getSitemapList = getSitemapList;
-const isBotUserAgent = (req) => {
-    const ua = req.get('user-agent');
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-    if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-        return false;
-    }
-    return true;
-};
-exports.isBotUserAgent = isBotUserAgent;
-const getComponentInfo = () => {
-    return config_1.components.find((item) => item.did === config_1.env.componentDid) || {};
-};
-exports.getComponentInfo = getComponentInfo;
-const getFullUrl = (req) => {
-    const blockletPathname = req.headers['x-path-prefix']
-        ? (0, ufo_1.joinURL)(req.headers['x-path-prefix'], req.originalUrl)
-        : req.originalUrl;
-    return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
-};
-exports.getFullUrl = getFullUrl;
-const getRelativePath = (url) => {
-    try {
-        return new URL(url).pathname;
-    }
-    catch (error) {
-        // ignore error
-    }
-    return url;
-};
-exports.getRelativePath = getRelativePath;
 const formatUrl = (url) => {
     return url.replace(/\/$/, '').trim();
 };
@@ -237,3 +227,23 @@ exports.formatUrl = formatUrl;
 function md5(content) {
     return (0, node_crypto_1.createHash)('md5').update(content).digest('hex');
 }
+function findMaxScrollHeight(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const maxHeightHandler = yield page.evaluateHandle(() => {
+            const elements = Array.from(document.querySelectorAll('*'));
+            let maxHeight = document.body.scrollHeight;
+            for (const el of elements) {
+                const style = window.getComputedStyle(el);
+                if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+                    if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+                        maxHeight = el.scrollHeight;
+                    }
+                }
+            }
+            return maxHeight;
+        });
+        const maxHeight = yield maxHeightHandler.jsonValue();
+        maxHeightHandler.dispose();
+        return maxHeight;
+    });
+}
@@ -0,0 +1,22 @@
+export type Site = {
+    url: string;
+    pathname: string;
+    /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+    interval?: number;
+};
+export type Config = {
+    isProd: boolean;
+    dataDir: string;
+    appDir: string;
+    appUrl: string;
+    cacheDir: string;
+    puppeteerPath?: string;
+    siteCron: {
+        sites: Site[];
+        time: string;
+        runOnInit: boolean;
+        concurrency: number;
+    };
+};
+export declare const logger: any;
+export declare const config: Config;
@@ -1,11 +1,17 @@
 import createLogger from '@blocklet/logger';
-export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
 export const config = {
-
+    isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
     appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL,
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
     cacheDir: process.env.BLOCKLET_CACHE_DIR,
-
+    // cron
+    siteCron: {
+        sites: [],
+        time: '0 0 */12 * * *',
+        runOnInit: false,
+        concurrency: 5,
+    },
 };
@@ -0,0 +1,26 @@
+import { JobState } from './store/job';
+import { SnapshotModel } from './store/snapshot';
+export declare function createCrawlQueue(): void;
+export declare function getDataDir(): Promise<{
+    htmlDir: string;
+    screenshotDir: string;
+}>;
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+    url: string;
+    includeScreenshot?: boolean;
+    includeHtml?: boolean;
+    width?: number;
+    height?: number;
+    quality?: number;
+    timeout?: number;
+    fullPage?: boolean;
+}) => Promise<{
+    html: string;
+    screenshot: Uint8Array<ArrayBufferLike> | null;
+}>;
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
@@ -9,17 +9,15 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 import createQueue from '@abtnode/queue';
 import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
-import sequelize from '@sequelize/core';
 import { randomUUID } from 'crypto';
 import fs from 'fs-extra';
-import pick from 'lodash/pick';
 import path from 'path';
-import { joinURL } from 'ufo';
 import { config, logger } from './config';
-import { Job } from './db/job';
-import { Snapshot } from './db/snapshot';
 import { initPage } from './puppeteer';
-import {
+import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
+import { Job } from './store/job';
+import { Snapshot } from './store/snapshot';
+import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
 const { BaseState } = require('@abtnode/models');
 let crawlQueue;
 export function createCrawlQueue() {
@@ -28,7 +26,7 @@ export function createCrawlQueue() {
         store: new SequelizeStore(db, 'crawler'),
         concurrency: 1,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-            logger.
+            logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
             if (!canCrawl) {
                 logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
@@ -81,19 +79,9 @@ export function createCrawlQueue() {
                 });
                 yield Snapshot.upsert(snapshot);
                 return snapshot;
-                // save to redis
-                // if (saveToRedis) {
-                // useCache.set(url, {
-                // html: result.html || '',
-                // lastModified,
-                // });
-                // logger.info(`success to crawl ${url}`, job);
-                // return result;
-                // }
             }
             catch (error) {
                 logger.error(`Failed to crawl ${job.url}`, { error, job });
-                console.error(error.stack);
                 const snapshot = convertJobToSnapshot({
                     job,
                     snapshot: {
@@ -145,11 +133,11 @@ function formatHtml(htmlString) {
     }
     return htmlString;
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
     logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
     const page = yield initPage();
     if (width && height) {
-        yield page.setViewport({ width, height });
+        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
     }
     let html = null;
     let screenshot = null;
@@ -164,26 +152,47 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
             throw new Error(`Request failed with status ${statusCode}, in ${url}`);
         }
         // await for networkidle0
-        // https://pptr.dev/api/puppeteer.page.
+        // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
         yield page.waitForNetworkIdle({
-            idleTime:
+            idleTime: 1.5 * 1000,
         });
         // get screenshot
         if (includeScreenshot) {
+            // Try to find the tallest element and set the browser to the same height
+            if (fullPage) {
+                const maxScrollHeight = yield findMaxScrollHeight(page);
+                logger.info('findMaxScrollHeight', { maxScrollHeight });
+                if (maxScrollHeight) {
+                    yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+                    yield page.evaluate((scrollHeight) => {
+                        window.scrollTo(0, scrollHeight || 0);
+                        document.documentElement.scrollTo(0, scrollHeight || 0);
+                    }, maxScrollHeight);
+                }
+            }
             try {
                 screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
             }
             catch (err) {
                 logger.error('Failed to get screenshot:', err);
+                throw err;
             }
         }
         // get html
         if (includeHtml) {
-
-            html = yield
+            try {
+                html = yield page.evaluate(() => {
+                    // add meta tag to record crawler
+                    const meta = document.createElement('meta');
+                    meta.name = 'arcblock-crawler';
+                    meta.content = 'true';
+                    document.head.appendChild(meta);
+                    return document.documentElement.outerHTML;
+                });
             }
-
-
+            catch (err) {
+                logger.error('Failed to get html:', err);
+                throw err;
             }
         }
     }
@@ -200,11 +209,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
         screenshot,
     };
 });
-
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
         params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
-        const
+        const { job: duplicateJob } = (yield Job.findJob({
             url: params.url,
             includeScreenshot: params.includeScreenshot,
             includeHtml: params.includeHtml,
@@ -212,18 +226,18 @@ export function createCrawlJob(params, callback) {
             width: params.width,
             height: params.height,
             fullPage: params.fullPage,
-        });
-
-        if (existsJob) {
+        })) || {};
+        if (duplicateJob) {
             logger.warn(`Crawl job already exists for ${params.url}, skip`);
-            return
+            return duplicateJob.id;
         }
+        logger.info('create crawl job', params);
         const jobId = randomUUID();
         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
-        job.on('finished', ({ result })
+        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
             logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
-            callback === null || callback === void 0 ? void 0 : callback(result);
-        });
+            callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+        }));
         job.on('failed', ({ error }) => {
             logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
             callback === null || callback === void 0 ? void 0 : callback(null);
@@ -231,71 +245,3 @@ export function createCrawlJob(params, callback) {
         return jobId;
     });
 }
-// @ts-ignore
-export function getJob(condition) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const where = Object.keys(condition)
-            .filter((key) => condition[key] !== undefined)
-            .map((key) => {
-            return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
-        });
-        const job = yield crawlQueue.store.db.findOne({
-            where: {
-                [sequelize.Op.and]: where,
-            },
-        });
-        if (job) {
-            return job.job;
-        }
-        return null;
-    });
-}
-function convertJobToSnapshot({ job, snapshot }) {
-    return Object.assign({
-        // @ts-ignore
-        jobId: job.jobId || job.id, url: job.url, options: {
-            width: job.width,
-            height: job.height,
-            includeScreenshot: job.includeScreenshot,
-            includeHtml: job.includeHtml,
-            quality: job.quality,
-            fullPage: job.fullPage,
-        } }, snapshot);
-}
-export function formatSnapshot(snapshot, columns) {
-    return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
-        // format screenshot path to full url
-        if (data.screenshot) {
-            data.screenshot = joinURL(config.appUrl, data.screenshot);
-        }
-        // format html path to string
-        if (data.html) {
-            const html = yield fs.readFile(path.join(config.dataDir, data.html));
-            data.html = html.toString();
-        }
-        if (columns === null || columns === void 0 ? void 0 : columns.length) {
-            data = pick(data, columns);
-        }
-        return data;
-    });
-}
-/**
- * get snapshot from db or crawl queue
- */
-export function getSnapshot(jobId) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const snapshotModel = yield Snapshot.findByPk(jobId);
-        if (snapshotModel) {
-            return snapshotModel.toJSON();
-        }
-        const job = yield getJob({ id: jobId });
-        if (job) {
-            return {
-                jobId,
-                status: 'pending',
-            };
-        }
-        return null;
-    });
-}
@@ -0,0 +1 @@
+export declare function initCron(): any;
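
For reference, here is a minimal usage sketch of the new `crawlUrl` API, inferred only from the `crawler.d.ts` declarations and the compiled `crawler.js` shown in this diff. It assumes `crawlUrl` is re-exported from the package entry point and that the job parameters beyond `url` (`includeHtml`, `includeScreenshot`, `fullPage`) match the fields the queue reads; treat it as illustrative rather than authoritative.

```ts
// Hypothetical sketch based on the type declarations in this diff,
// not taken from the package's own documentation.
import { crawlUrl } from '@arcblock/crawler'; // assumes the root entry re-exports crawlUrl

async function main() {
  // crawlUrl resolves with a job id (or undefined) and reports the finished
  // snapshot through the optional callback (null on failure).
  const jobId = await crawlUrl(
    {
      url: 'https://www.example.com', // field names beyond `url` are inferred
      includeHtml: true,
      includeScreenshot: true,
      fullPage: false,
    },
    (snapshot) => {
      if (snapshot) {
        // `status` is one of 'success' | 'failed' | 'pending' per the Snapshot model
        console.log('crawl finished:', snapshot.status);
      } else {
        console.log('crawl failed');
      }
    }
  );
  console.log('queued crawl job', jobId);
}

main();
```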