@arcblock/crawler 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/dist/blocklet.d.ts +6 -0
  2. package/dist/blocklet.js +199 -0
  3. package/dist/cache.d.ts +10 -0
  4. package/dist/cache.js +119 -0
  5. package/dist/config.d.ts +10 -0
  6. package/dist/config.js +17 -0
  7. package/dist/crawler.d.ts +28 -0
  8. package/dist/crawler.js +314 -0
  9. package/dist/db/index.d.ts +1 -0
  10. package/dist/db/index.js +41 -0
  11. package/dist/db/job.d.ts +33 -0
  12. package/dist/db/job.js +54 -0
  13. package/dist/db/snapshot.d.ts +31 -0
  14. package/dist/db/snapshot.js +52 -0
  15. package/dist/index.d.ts +6 -0
  16. package/dist/index.js +45 -0
  17. package/dist/middleware.d.ts +4 -0
  18. package/dist/middleware.js +44 -0
  19. package/dist/puppeteer.d.ts +16 -0
  20. package/dist/puppeteer.js +318 -0
  21. package/dist/utils.d.ts +15 -0
  22. package/dist/utils.js +239 -0
  23. package/esm/blocklet.d.ts +6 -0
  24. package/esm/blocklet.js +190 -0
  25. package/esm/cache.d.ts +10 -0
  26. package/esm/cache.js +114 -0
  27. package/esm/config.d.ts +10 -0
  28. package/esm/config.js +11 -0
  29. package/esm/crawler.d.ts +28 -0
  30. package/esm/crawler.js +301 -0
  31. package/esm/db/index.d.ts +1 -0
  32. package/esm/db/index.js +35 -0
  33. package/esm/db/job.d.ts +33 -0
  34. package/esm/db/job.js +50 -0
  35. package/esm/db/snapshot.d.ts +31 -0
  36. package/esm/db/snapshot.js +48 -0
  37. package/esm/index.d.ts +6 -0
  38. package/esm/index.js +26 -0
  39. package/esm/middleware.d.ts +4 -0
  40. package/esm/middleware.js +41 -0
  41. package/esm/puppeteer.d.ts +16 -0
  42. package/esm/puppeteer.js +272 -0
  43. package/esm/utils.d.ts +15 -0
  44. package/esm/utils.js +220 -0
  45. package/package.json +10 -3
  46. package/src/blocklet.ts +0 -223
  47. package/src/cache.ts +0 -117
  48. package/src/config.ts +0 -13
  49. package/src/crawler.ts +0 -364
  50. package/src/db/index.ts +0 -27
  51. package/src/db/job.ts +0 -93
  52. package/src/db/snapshot.ts +0 -89
  53. package/src/index.ts +0 -19
  54. package/src/middleware.ts +0 -46
  55. package/src/puppeteer.ts +0 -296
  56. package/src/utils.ts +0 -240
  57. package/third.d.ts +0 -1
  58. package/tsconfig.json +0 -9
package/esm/blocklet.js ADDED
@@ -0,0 +1,190 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import Cron from '@abtnode/cron';
+ import { components } from '@blocklet/sdk/lib/config';
+ import debounce from 'lodash/debounce';
+ import { joinURL } from 'ufo';
+ import { useCache } from './cache';
+ import { config, logger } from './config';
+ import { createCrawlJob } from './crawler';
+ import { closeBrowser, getBrowser } from './puppeteer';
+ import { getComponentInfo, getRelativePath, getSitemapList } from './utils';
+ // record crawl blocklet running
+ const crawlBlockletRunningMap = new Map();
+ // crawl blocklet sitemap urls
+ export const crawlBlocklet = () => __awaiter(void 0, void 0, void 0, function* () {
+ // @ts-ignore
+ const { mountPoint, did } = getComponentInfo();
+ if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+ logger.info(`Crawler blocklet ${did} is running, skip it`);
+ return;
+ }
+ // check has browser can use
+ try {
+ const browser = yield getBrowser();
+ if (!browser) {
+ throw new Error('No Browser can use');
+ }
+ logger.info('Crawler blocklet existing can use browser');
+ }
+ catch (error) {
+ logger.info(`Crawler blocklet abort by error: ${(error === null || error === void 0 ? void 0 : error.message) || (error === null || error === void 0 ? void 0 : error.reason) || error}`);
+ return;
+ }
+ const { appUrl } = config;
+ if (!appUrl) {
+ throw new Error('appUrl not found');
+ }
+ const sitemapList = yield getSitemapList(appUrl);
+ const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
+ const otherMountPointList = components
+ .filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
+ .map((item) => item.mountPoint);
+ // get can use loc
+ const blockletLocList = sitemapList.filter((item) => {
+ var _a;
+ if (mountPoint !== '/') {
+ return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(matchMountPoint)) > -1;
+ }
+ // if mountPoint is /, skip other mountPoint
+ return otherMountPointList.every((mountPoint) => { var _a; return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(mountPoint)) === -1; });
+ });
+ const canUseBlockletLocList = [];
+ const lastmodMap = new Map();
+ let skipBlockletLocTotal = 0;
+ let blockletLocTotal = 0;
+ yield Promise.all(blockletLocList.map((item) => __awaiter(void 0, void 0, void 0, function* () {
+ var _a;
+ let tempLocList = [];
+ if (item.url) {
+ tempLocList.push(item.url);
+ }
+ if (((_a = item === null || item === void 0 ? void 0 : item.links) === null || _a === void 0 ? void 0 : _a.length) > 0) {
+ tempLocList.push(...item.links.map((ytem) => ytem.url));
+ }
+ blockletLocTotal += tempLocList.length;
+ // @ts-ignore
+ tempLocList = (yield Promise.all(tempLocList.map((loc) => __awaiter(void 0, void 0, void 0, function* () {
+ try {
+ const { lastModified: cacheLastModified } = yield useCache.get(getRelativePath(loc));
+ // sitemap item lastmod is same as cache lastModified, skip it
+ if (item.lastmod &&
+ cacheLastModified &&
+ new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()) {
+ skipBlockletLocTotal++;
+ return false;
+ }
+ return loc;
+ }
+ catch (error) {
+ // ignore error
+ }
+ // if can not get cache, return loc
+ return loc;
+ })))).filter(Boolean);
+ tempLocList.forEach((loc) => {
+ if (item.lastmod)
+ lastmodMap.set(loc, item.lastmod);
+ });
+ canUseBlockletLocList.push(...tempLocList);
+ })));
+ const crawlerLogText = (step = '') => [
+ `Crawler sitemap.xml about ${did} ${step}: `,
+ {
+ blockletLocTotal,
+ canUseBlockletLocTotal: canUseBlockletLocList.length,
+ skipBlockletLocTotal,
+ lastmodMapTotal: lastmodMap.size,
+ },
+ ];
+ logger.info(...crawlerLogText('start'));
+ try {
+ // record crawl blocklet running
+ crawlBlockletRunningMap.set(did, true);
+ yield createCrawlJob({
+ // @ts-ignore
+ urls: canUseBlockletLocList,
+ saveToRedis: true,
+ lastmodMap,
+ // formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
+ // const pageContent = await page.evaluate(() => {
+ // const removeElements = (tagName: string) => {
+ // const elements = document.querySelectorAll(tagName);
+ // for (let i = elements.length - 1; i >= 0; i--) {
+ // try {
+ // elements[i]?.parentNode?.removeChild(elements[i] as Node);
+ // } catch (error) {
+ // // do noting
+ // }
+ // }
+ // };
+ // // remove script, style, link, noscript
+ // // removeElements('script');
+ // // removeElements('style');
+ // // removeElements('link');
+ // // removeElements('noscript');
+ // // remove uploader
+ // removeElements('[id="uploader-container"]');
+ // removeElements('[class^="uppy-"]');
+ // // remove point up component
+ // removeElements('[id="point-up-component"]');
+ // // add meta tag to record crawler
+ // const meta = document.createElement('meta');
+ // meta.name = 'blocklet-crawler';
+ // meta.content = 'true';
+ // document.head.appendChild(meta);
+ // return document.documentElement.outerHTML;
+ // });
+ // return pageContent;
+ // },
+ });
+ logger.info(...crawlerLogText('success'));
+ yield closeBrowser({
+ trimCache: true,
+ });
+ }
+ catch (error) {
+ logger.info('Crawler blocklet abort by error', error);
+ }
+ finally {
+ // delete crawl blocklet running
+ crawlBlockletRunningMap.delete(did);
+ }
+ });
+ const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
+ let cronCrawlBlockletJob = null;
+ // init cron crawl blocklet
+ export const initCronCrawlBlocklet = ({ time = '0 0 */12 * * *', // every 12 hours
+ options, } = {}) => {
+ if (!cronCrawlBlockletJob) {
+ cronCrawlBlockletJob = Cron.init({
+ context: {},
+ jobs: [
+ {
+ name: CRON_CRAWL_BLOCKLET_KEY,
+ time,
+ fn: debounce(crawlBlocklet),
+ options: Object.assign({ runOnInit: false }, options),
+ },
+ ],
+ onError: (err) => {
+ console.error('run job failed', err);
+ },
+ });
+ }
+ return cronCrawlBlockletJob;
+ };
+ export const cancelCronCrawlBlocklet = () => {
+ if (cronCrawlBlockletJob) {
+ cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
+ cronCrawlBlockletJob = null;
+ logger.info('Cron crawl blocklet stop, clear crawl queue');
+ }
+ };
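For context, a minimal sketch of how the cron exports above might be wired into a blocklet server. It assumes initCronCrawlBlocklet and cancelCronCrawlBlocklet are re-exported from the package entry point; the import path and shutdown hook are illustrative, not taken from the diff.

import { initCronCrawlBlocklet, cancelCronCrawlBlocklet } from '@arcblock/crawler';

// start the crawl cron; omitting `time` uses the default '0 0 */12 * * *' (every 12 hours)
initCronCrawlBlocklet();

// stop the job on shutdown
process.on('SIGTERM', () => cancelCronCrawlBlocklet());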
package/esm/cache.d.ts ADDED
@@ -0,0 +1,10 @@
+ export declare const cachePool: import("generic-pool").Pool<any>;
+ export declare const memoryPool: import("generic-pool").Pool<Map<string, any>>;
+ export declare const withCache: (cb: Function) => Promise<any>;
+ export declare const formatKey: (key: string) => string;
+ export declare const useCache: {
+ get: (key: string) => Promise<any>;
+ set: (key: string, value: any, options?: any) => Promise<any>;
+ remove: (key: string) => Promise<any>;
+ list: (key?: string) => Promise<any>;
+ };
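A rough usage sketch for the useCache API declared above, assuming it is re-exported from the package root; the key names are placeholders. Per cache.js below, values are JSON-stringified on set and parsed on get, and entries expire after 7 days when Redis is configured (the in-memory fallback ignores the TTL option).

import { useCache } from '@arcblock/crawler';

await useCache.set('snapshot:/about', { html: '<html></html>', lastModified: new Date().toISOString() });
const cached = await useCache.get('snapshot:/about');
const keys = await useCache.list('snapshot:*');
await useCache.remove('snapshot:/about');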
package/esm/cache.js ADDED
@@ -0,0 +1,114 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var _a;
+ import { createPool } from 'generic-pool';
+ import { createClient } from 'redis';
+ import { config, logger } from './config';
+ const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
+ const MAX_REDIS_RETRY = 3;
+ const ttl = 1000 * 60 * 60 * 24 * 7;
+ export const cachePool = createPool({
+ create: () => __awaiter(void 0, void 0, void 0, function* () {
+ try {
+ const { redisUrl } = config;
+ const redisClient = createClient({
+ url: redisUrl,
+ socket: {
+ // @ts-ignore
+ reconnectStrategy: (retries) => {
+ if (retries >= MAX_REDIS_RETRY) {
+ return new Error('Retry Time Exhausted');
+ }
+ return Math.min(retries * 500, 1000 * 3);
+ },
+ },
+ });
+ redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
+ yield redisClient.connect();
+ logger.info(`Successfully connected to Redis: ${redisUrl}`);
+ return redisClient;
+ }
+ catch (error) {
+ logger.warn('Redis connection failed', error);
+ return null;
+ }
+ }),
+ destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
+ // if is redis client
+ if (client.isReady) {
+ yield client.quit();
+ }
+ }),
+ }, {
+ max: 2, // 2 clients
+ min: 0,
+ // evictionRunIntervalMillis: 0,
+ });
+ export const memoryPool = createPool({
+ create: () => {
+ const map = new Map();
+ // @ts-ignore
+ map.del = map.delete;
+ return Promise.resolve(map);
+ },
+ destroy: (client) => {
+ client.clear();
+ return Promise.resolve();
+ },
+ }, {
+ max: 10,
+ min: 0,
+ });
+ export const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
+ const pool = config.redisUrl ? cachePool : memoryPool;
+ const client = yield pool.acquire();
+ if (client) {
+ try {
+ return cb(client);
+ }
+ finally {
+ // release client to pool, let other use
+ yield pool.release(client);
+ }
+ }
+ });
+ export const formatKey = (key) => {
+ return `${cacheKeyPrefix}${key}`;
+ };
+ export const useCache = {
+ get: (key) => {
+ return withCache((client) => __awaiter(void 0, void 0, void 0, function* () {
+ const value = yield client.get(formatKey(key));
+ try {
+ return JSON.parse(value);
+ }
+ catch (error) {
+ // ignore error
+ }
+ return value;
+ }));
+ },
+ set: (key, value, options) => {
+ return withCache((client) => {
+ const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
+ return client.set(formatKey(key), formatValue, Object.assign({ PX: ttl }, options));
+ });
+ },
+ remove: (key) => {
+ return withCache((client) => {
+ return client.del(formatKey(key));
+ });
+ },
+ list: (key = '*') => {
+ return withCache((client) => {
+ return client.keys(formatKey(key));
+ });
+ },
+ };
package/esm/config.d.ts ADDED
@@ -0,0 +1,10 @@
+ export declare const logger: any;
+ export declare const config: {
+ redisUrl: string;
+ dataDir: string;
+ appDir: string;
+ appUrl: string;
+ puppeteerPath: string;
+ cacheDir: string;
+ testOnInitialize: boolean;
+ };
package/esm/config.js ADDED
@@ -0,0 +1,11 @@
+ import createLogger from '@blocklet/logger';
+ export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+ export const config = {
+ redisUrl: process.env.REDIS_URL,
+ dataDir: process.env.BLOCKLET_DATA_DIR,
+ appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
+ appUrl: process.env.BLOCKLET_APP_URL,
+ puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
+ cacheDir: process.env.BLOCKLET_CACHE_DIR,
+ testOnInitialize: process.env.NODE_ENV === 'production',
+ };
package/esm/crawler.d.ts ADDED
@@ -0,0 +1,28 @@
+ import { JobState } from './db/job';
+ import { SnapshotModel } from './db/snapshot';
+ export declare function createCrawlQueue(): void;
+ export declare function getDataDir(): Promise<{
+ htmlDir: string;
+ screenshotDir: string;
+ }>;
+ export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+ url: string;
+ formatPageContent?: Function;
+ includeScreenshot?: boolean;
+ includeHtml?: boolean;
+ width?: number;
+ height?: number;
+ quality?: number;
+ timeout?: number;
+ fullPage?: boolean;
+ }) => Promise<{
+ html: string;
+ screenshot: Uint8Array<ArrayBufferLike> | null;
+ }>;
+ export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
+ export declare function getJob(condition: Partial<JobState>): Promise<any>;
+ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
+ /**
+ * get snapshot from db or crawl queue
+ */
+ export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
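Read together with crawler.js below, these declarations suggest the following usage. This is a sketch only: it assumes the functions are re-exported from the package root, and the option fields mirror those that getJob() filters on (the full JobState shape lives in db/job.d.ts); the URL is a placeholder.

import { createCrawlQueue, createCrawlJob, getSnapshot } from '@arcblock/crawler';

// the queue must be created before jobs are pushed
createCrawlQueue();

const jobId = await createCrawlJob({ url: 'https://example.com/', includeScreenshot: true, includeHtml: true });

// resolves to the stored snapshot, a { jobId, status: 'pending' } stub while queued, or null if unknown
const snapshot = await getSnapshot(jobId);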
package/esm/crawler.js ADDED
@@ -0,0 +1,301 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import createQueue from '@abtnode/queue';
+ import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
+ import sequelize from '@sequelize/core';
+ import { randomUUID } from 'crypto';
+ import fs from 'fs-extra';
+ import pick from 'lodash/pick';
+ import path from 'path';
+ import { joinURL } from 'ufo';
+ import { config, logger } from './config';
+ import { Job } from './db/job';
+ import { Snapshot } from './db/snapshot';
+ import { initPage } from './puppeteer';
+ import { formatUrl, isAcceptCrawler, md5 } from './utils';
+ const { BaseState } = require('@abtnode/models');
+ let crawlQueue;
+ export function createCrawlQueue() {
+ const db = new BaseState(Job);
+ crawlQueue = createQueue({
+ store: new SequelizeStore(db, 'crawler'),
+ concurrency: 1,
+ onJob: (job) => __awaiter(this, void 0, void 0, function* () {
+ logger.debug('job start:', job);
+ const canCrawl = yield isAcceptCrawler(job.url);
+ if (!canCrawl) {
+ logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+ const snapshot = convertJobToSnapshot({
+ job,
+ snapshot: {
+ status: 'failed',
+ error: 'Denied by robots.txt',
+ },
+ });
+ yield Snapshot.upsert(snapshot);
+ return snapshot;
+ }
+ // if index reach autoCloseBrowserCount, close browser
+ // try {
+ // if (index >= autoCloseBrowserCount) {
+ // await closeBrowser({ trimCache: false });
+ // }
+ // } catch (error) {
+ // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
+ // }
+ try {
+ // get page content later
+ const result = yield getPageContent(job);
+ if (!result || (!result.html && !result.screenshot)) {
+ logger.error(`failed to crawl ${job.url}, empty content`, job);
+ const snapshot = convertJobToSnapshot({
+ job,
+ snapshot: {
+ status: 'failed',
+ error: 'Failed to crawl content',
+ },
+ });
+ yield Snapshot.upsert(snapshot);
+ return snapshot;
+ }
+ // save html and screenshot to data dir
+ const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+ screenshot: result.screenshot,
+ html: result.html,
+ });
+ // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
+ const snapshot = convertJobToSnapshot({
+ job,
+ snapshot: {
+ status: 'success',
+ screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
+ html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
+ },
+ });
+ yield Snapshot.upsert(snapshot);
+ return snapshot;
+ // save to redis
+ // if (saveToRedis) {
+ // useCache.set(url, {
+ // html: result.html || '',
+ // lastModified,
+ // });
+ // logger.info(`success to crawl ${url}`, job);
+ // return result;
+ // }
+ }
+ catch (error) {
+ logger.error(`Failed to crawl ${job.url}`, { error, job });
+ console.error(error.stack);
+ const snapshot = convertJobToSnapshot({
+ job,
+ snapshot: {
+ status: 'failed',
+ error: 'Internal error',
+ },
+ });
+ yield Snapshot.upsert(snapshot);
+ return snapshot;
+ }
+ }),
+ });
+ }
+ export function getDataDir() {
+ return __awaiter(this, void 0, void 0, function* () {
+ const htmlDir = path.join(config.dataDir, 'data', 'html');
+ const screenshotDir = path.join(config.dataDir, 'data', 'screenshot');
+ yield fs.ensureDir(htmlDir);
+ yield fs.ensureDir(screenshotDir);
+ return { htmlDir, screenshotDir };
+ });
+ }
+ function saveSnapshotToLocal(_a) {
+ return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+ const { htmlDir, screenshotDir } = yield getDataDir();
+ let screenshotPath = null;
+ let htmlPath = null;
+ if (screenshot) {
+ const hash = md5(screenshot);
+ screenshotPath = path.join(screenshotDir, `${hash}.webp`);
+ logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
+ yield fs.writeFile(screenshotPath, screenshot);
+ }
+ if (html) {
+ const hash = md5(html);
+ htmlPath = path.join(htmlDir, `${hash}.html`);
+ logger.debug('saveSnapshotToLocal.html', { htmlPath });
+ yield fs.writeFile(htmlPath, html);
+ }
+ return {
+ screenshotPath,
+ htmlPath,
+ };
+ });
+ }
+ function formatHtml(htmlString) {
+ if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
+ return '';
+ }
+ return htmlString;
+ }
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+ logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
+ const page = yield initPage();
+ if (width && height) {
+ yield page.setViewport({ width, height });
+ }
+ let html = null;
+ let screenshot = null;
+ try {
+ const response = yield page.goto(url, { timeout });
+ if (!response) {
+ throw new Error(`Failed to load page: response is null for ${url}`);
+ }
+ const statusCode = response.status();
+ logger.debug('getPageContent.response', { response, statusCode });
+ if (![200, 304].includes(statusCode)) {
+ throw new Error(`Request failed with status ${statusCode}, in ${url}`);
+ }
+ // await for networkidle0
+ // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+ yield page.waitForNetworkIdle({
+ idleTime: 2 * 1000,
+ });
+ // get screenshot
+ if (includeScreenshot) {
+ try {
+ screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+ }
+ catch (err) {
+ logger.error('Failed to get screenshot:', err);
+ }
+ }
+ // get html
+ if (includeHtml) {
+ if (formatPageContent) {
+ html = yield formatPageContent({ page, url });
+ }
+ else {
+ html = yield page.content();
+ }
+ }
+ }
+ catch (error) {
+ logger.error('Failed to get page content:', error);
+ throw error;
+ }
+ finally {
+ yield page.close();
+ }
+ html = formatHtml(html || '');
+ return {
+ html,
+ screenshot,
+ };
+ });
+ export function createCrawlJob(params, callback) {
+ return __awaiter(this, void 0, void 0, function* () {
+ params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
+ // skip duplicate job
+ const existsJob = yield getJob({
+ url: params.url,
+ includeScreenshot: params.includeScreenshot,
+ includeHtml: params.includeHtml,
+ quality: params.quality,
+ width: params.width,
+ height: params.height,
+ fullPage: params.fullPage,
+ });
+ logger.info('create crawl job', params);
+ if (existsJob) {
+ logger.warn(`Crawl job already exists for ${params.url}, skip`);
+ return existsJob.id;
+ }
+ const jobId = randomUUID();
+ const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+ job.on('finished', ({ result }) => {
+ logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+ callback === null || callback === void 0 ? void 0 : callback(result);
+ });
+ job.on('failed', ({ error }) => {
+ logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
+ callback === null || callback === void 0 ? void 0 : callback(null);
+ });
+ return jobId;
+ });
+ }
+ // @ts-ignore
+ export function getJob(condition) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const where = Object.keys(condition)
+ .filter((key) => condition[key] !== undefined)
+ .map((key) => {
+ return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
+ });
+ const job = yield crawlQueue.store.db.findOne({
+ where: {
+ [sequelize.Op.and]: where,
+ },
+ });
+ if (job) {
+ return job.job;
+ }
+ return null;
+ });
+ }
+ function convertJobToSnapshot({ job, snapshot }) {
+ return Object.assign({
+ // @ts-ignore
+ jobId: job.jobId || job.id, url: job.url, options: {
+ width: job.width,
+ height: job.height,
+ includeScreenshot: job.includeScreenshot,
+ includeHtml: job.includeHtml,
+ quality: job.quality,
+ fullPage: job.fullPage,
+ } }, snapshot);
+ }
+ export function formatSnapshot(snapshot, columns) {
+ return __awaiter(this, void 0, void 0, function* () {
+ let data = Object.assign({}, snapshot);
+ // format screenshot path to full url
+ if (data.screenshot) {
+ data.screenshot = joinURL(config.appUrl, data.screenshot);
+ }
+ // format html path to string
+ if (data.html) {
+ const html = yield fs.readFile(path.join(config.dataDir, data.html));
+ data.html = html.toString();
+ }
+ if (columns === null || columns === void 0 ? void 0 : columns.length) {
+ data = pick(data, columns);
+ }
+ return data;
+ });
+ }
+ /**
+ * get snapshot from db or crawl queue
+ */
+ export function getSnapshot(jobId) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshotModel = yield Snapshot.findByPk(jobId);
+ if (snapshotModel) {
+ return snapshotModel.toJSON();
+ }
+ const job = yield getJob({ id: jobId });
+ if (job) {
+ return {
+ jobId,
+ status: 'pending',
+ };
+ }
+ return null;
+ });
+ }
package/esm/db/index.d.ts ADDED
@@ -0,0 +1 @@
+ export declare function ensureDatabase(): Promise<void>;