@arcblock/crawler 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
package/dist/db/job.js DELETED
@@ -1,54 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.Job = void 0;
4
- exports.initJobModel = initJobModel;
5
- const core_1 = require("@sequelize/core");
6
- class Job extends core_1.Model {
7
- }
8
- exports.Job = Job;
9
- function initJobModel(sequelize) {
10
- Job.init({
11
- id: {
12
- type: core_1.DataTypes.STRING(40),
13
- primaryKey: true,
14
- },
15
- queue: {
16
- type: core_1.DataTypes.STRING(32),
17
- allowNull: false,
18
- },
19
- job: {
20
- type: core_1.DataTypes.JSON,
21
- allowNull: false,
22
- },
23
- retryCount: {
24
- type: core_1.DataTypes.INTEGER,
25
- },
26
- delay: {
27
- type: core_1.DataTypes.INTEGER,
28
- },
29
- willRunAt: {
30
- type: core_1.DataTypes.INTEGER,
31
- },
32
- cancelled: {
33
- type: core_1.DataTypes.BOOLEAN,
34
- defaultValue: false,
35
- },
36
- createdAt: {
37
- type: core_1.DataTypes.DATE,
38
- defaultValue: core_1.DataTypes.NOW,
39
- index: true,
40
- },
41
- updatedAt: {
42
- type: core_1.DataTypes.DATE,
43
- defaultValue: core_1.DataTypes.NOW,
44
- index: true,
45
- },
46
- }, {
47
- sequelize,
48
- indexes: [{ fields: ['queue'] }],
49
- modelName: 'job',
50
- tableName: 'jobs',
51
- timestamps: true,
52
- });
53
- return Job;
54
- }
package/dist/db/snapshot.js DELETED
@@ -1,52 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.Snapshot = void 0;
4
- exports.initSnapshotModel = initSnapshotModel;
5
- const core_1 = require("@sequelize/core");
6
- class Snapshot extends core_1.Model {
7
- }
8
- exports.Snapshot = Snapshot;
9
- function initSnapshotModel(sequelize) {
10
- Snapshot.init({
11
- jobId: {
12
- type: core_1.DataTypes.STRING,
13
- primaryKey: true,
14
- allowNull: false,
15
- },
16
- url: {
17
- type: core_1.DataTypes.STRING,
18
- allowNull: false,
19
- index: true,
20
- },
21
- status: {
22
- type: core_1.DataTypes.ENUM('success', 'failed'),
23
- allowNull: false,
24
- },
25
- html: {
26
- type: core_1.DataTypes.TEXT,
27
- allowNull: true,
28
- },
29
- screenshot: {
30
- type: core_1.DataTypes.STRING,
31
- allowNull: true,
32
- },
33
- error: {
34
- type: core_1.DataTypes.STRING,
35
- allowNull: true,
36
- },
37
- lastModified: {
38
- type: core_1.DataTypes.STRING,
39
- allowNull: true,
40
- },
41
- options: {
42
- type: core_1.DataTypes.JSON,
43
- allowNull: true,
44
- },
45
- }, {
46
- sequelize,
47
- modelName: 'snapshot',
48
- tableName: 'snap',
49
- timestamps: true,
50
- });
51
- return Snapshot;
52
- }
package/dist/index.d.ts DELETED
@@ -1,6 +0,0 @@
1
- import { config } from './config';
2
- export * from './blocklet';
3
- export * from './crawler';
4
- export * from './middleware';
5
- export { Snapshot } from './db/snapshot';
6
- export declare function initCrawler(_config: Partial<typeof config>): Promise<void>;
package/dist/index.js DELETED
@@ -1,45 +0,0 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
- for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
- };
16
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
17
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
18
- return new (P || (P = Promise))(function (resolve, reject) {
19
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
20
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
21
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
22
- step((generator = generator.apply(thisArg, _arguments || [])).next());
23
- });
24
- };
25
- Object.defineProperty(exports, "__esModule", { value: true });
26
- exports.Snapshot = void 0;
27
- exports.initCrawler = initCrawler;
28
- const config_1 = require("./config");
29
- const crawler_1 = require("./crawler");
30
- const db_1 = require("./db");
31
- const puppeteer_1 = require("./puppeteer");
32
- __exportStar(require("./blocklet"), exports);
33
- __exportStar(require("./crawler"), exports);
34
- __exportStar(require("./middleware"), exports);
35
- var snapshot_1 = require("./db/snapshot");
36
- Object.defineProperty(exports, "Snapshot", { enumerable: true, get: function () { return snapshot_1.Snapshot; } });
37
- function initCrawler(_config) {
38
- return __awaiter(this, void 0, void 0, function* () {
39
- Object.assign(config_1.config, _config);
40
- config_1.logger.debug('init crawler', config_1.config);
41
- yield (0, db_1.ensureDatabase)();
42
- yield (0, crawler_1.createCrawlQueue)();
43
- yield (0, puppeteer_1.ensureBrowser)();
44
- });
45
- }
package/dist/middleware.d.ts DELETED
@@ -1,4 +0,0 @@
1
- export declare function initSEOMiddleware({ autoReturnHtml, allowCrawler, }: {
2
- autoReturnHtml?: Boolean;
3
- allowCrawler?: Boolean | Function;
4
- }): (req: any, res: any, next: Function) => Promise<any>;
package/dist/middleware.js DELETED
@@ -1,44 +0,0 @@
1
- "use strict";
2
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
10
- };
11
- Object.defineProperty(exports, "__esModule", { value: true });
12
- exports.initSEOMiddleware = initSEOMiddleware;
13
- const cache_1 = require("./cache");
14
- const utils_1 = require("./utils");
15
- function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
16
- return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
17
- const isBot = (0, utils_1.isBotUserAgent)(req);
18
- const isSelf = (0, utils_1.isSelfCrawler)(req);
19
- if (!isBot || isSelf) {
20
- return next();
21
- }
22
- const fullUrl = (0, utils_1.getFullUrl)(req);
23
- const canCrawl = yield (0, utils_1.isAcceptCrawler)(fullUrl);
24
- const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
25
- // can not crawl, skip
26
- if (!canCrawl || !allowCrawlerResult) {
27
- return next();
28
- }
29
- const cacheData = yield cache_1.useCache.get(fullUrl);
30
- // add cached html to req
31
- req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
32
- // add cached lastModified to req, ISO string to GMT string
33
- req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
34
- if (req.cachedLastmod) {
35
- res.setHeader('Last-Modified', req.cachedLastmod);
36
- }
37
- if (autoReturnHtml && req.cachedHtml) {
38
- res.send(req.cachedHtml);
39
- return;
40
- }
41
- // missing cache
42
- next();
43
- });
44
- }
package/dist/utils.d.ts DELETED
@@ -1,17 +0,0 @@
1
- import { Page } from '@blocklet/puppeteer';
2
- export declare const api: import("axios").AxiosInstance;
3
- export declare const sleep: (ms: number) => Promise<unknown>;
4
- export declare const CRAWLER_FLAG = "x-crawler";
5
- export declare const isSelfCrawler: (req: any) => boolean;
6
- export declare const getDefaultRobotsUrl: (url: string) => string;
7
- export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
8
- export declare const getDefaultSitemapUrl: (url: string) => string;
9
- export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
10
- export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
11
- export declare const isBotUserAgent: (req: any) => boolean;
12
- export declare const getComponentInfo: () => {};
13
- export declare const getFullUrl: (req: any) => string;
14
- export declare const getRelativePath: (url: string) => string;
15
- export declare const formatUrl: (url: string) => string;
16
- export declare function md5(content: string | Uint8Array): string;
17
- export declare function findMaxScrollHeight(page: Page): Promise<number>;
package/esm/blocklet.d.ts DELETED
@@ -1,6 +0,0 @@
1
- export declare const crawlBlocklet: () => Promise<void>;
2
- export declare const initCronCrawlBlocklet: ({ time, options, }?: {
3
- time: string;
4
- options: any;
5
- }) => any;
6
- export declare const cancelCronCrawlBlocklet: () => void;
package/esm/blocklet.js DELETED
@@ -1,190 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- import Cron from '@abtnode/cron';
11
- import { components } from '@blocklet/sdk/lib/config';
12
- import debounce from 'lodash/debounce';
13
- import { joinURL } from 'ufo';
14
- import { useCache } from './cache';
15
- import { config, logger } from './config';
16
- import { createCrawlJob } from './crawler';
17
- import { closeBrowser, getBrowser } from './puppeteer';
18
- import { getComponentInfo, getRelativePath, getSitemapList } from './utils';
19
- // record crawl blocklet running
20
- const crawlBlockletRunningMap = new Map();
21
- // crawl blocklet sitemap urls
22
- export const crawlBlocklet = () => __awaiter(void 0, void 0, void 0, function* () {
23
- // @ts-ignore
24
- const { mountPoint, did } = getComponentInfo();
25
- if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
26
- logger.info(`Crawler blocklet ${did} is running, skip it`);
27
- return;
28
- }
29
- // check has browser can use
30
- try {
31
- const browser = yield getBrowser();
32
- if (!browser) {
33
- throw new Error('No Browser can use');
34
- }
35
- logger.info('Crawler blocklet existing can use browser');
36
- }
37
- catch (error) {
38
- logger.info(`Crawler blocklet abort by error: ${(error === null || error === void 0 ? void 0 : error.message) || (error === null || error === void 0 ? void 0 : error.reason) || error}`);
39
- return;
40
- }
41
- const { appUrl } = config;
42
- if (!appUrl) {
43
- throw new Error('appUrl not found');
44
- }
45
- const sitemapList = yield getSitemapList(appUrl);
46
- const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
47
- const otherMountPointList = components
48
- .filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
49
- .map((item) => item.mountPoint);
50
- // get can use loc
51
- const blockletLocList = sitemapList.filter((item) => {
52
- var _a;
53
- if (mountPoint !== '/') {
54
- return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(matchMountPoint)) > -1;
55
- }
56
- // if mountPoint is /, skip other mountPoint
57
- return otherMountPointList.every((mountPoint) => { var _a; return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(mountPoint)) === -1; });
58
- });
59
- const canUseBlockletLocList = [];
60
- const lastmodMap = new Map();
61
- let skipBlockletLocTotal = 0;
62
- let blockletLocTotal = 0;
63
- yield Promise.all(blockletLocList.map((item) => __awaiter(void 0, void 0, void 0, function* () {
64
- var _a;
65
- let tempLocList = [];
66
- if (item.url) {
67
- tempLocList.push(item.url);
68
- }
69
- if (((_a = item === null || item === void 0 ? void 0 : item.links) === null || _a === void 0 ? void 0 : _a.length) > 0) {
70
- tempLocList.push(...item.links.map((ytem) => ytem.url));
71
- }
72
- blockletLocTotal += tempLocList.length;
73
- // @ts-ignore
74
- tempLocList = (yield Promise.all(tempLocList.map((loc) => __awaiter(void 0, void 0, void 0, function* () {
75
- try {
76
- const { lastModified: cacheLastModified } = yield useCache.get(getRelativePath(loc));
77
- // sitemap item lastmod is same as cache lastModified, skip it
78
- if (item.lastmod &&
79
- cacheLastModified &&
80
- new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()) {
81
- skipBlockletLocTotal++;
82
- return false;
83
- }
84
- return loc;
85
- }
86
- catch (error) {
87
- // ignore error
88
- }
89
- // if can not get cache, return loc
90
- return loc;
91
- })))).filter(Boolean);
92
- tempLocList.forEach((loc) => {
93
- if (item.lastmod)
94
- lastmodMap.set(loc, item.lastmod);
95
- });
96
- canUseBlockletLocList.push(...tempLocList);
97
- })));
98
- const crawlerLogText = (step = '') => [
99
- `Crawler sitemap.xml about ${did} ${step}: `,
100
- {
101
- blockletLocTotal,
102
- canUseBlockletLocTotal: canUseBlockletLocList.length,
103
- skipBlockletLocTotal,
104
- lastmodMapTotal: lastmodMap.size,
105
- },
106
- ];
107
- logger.info(...crawlerLogText('start'));
108
- try {
109
- // record crawl blocklet running
110
- crawlBlockletRunningMap.set(did, true);
111
- yield createCrawlJob({
112
- // @ts-ignore
113
- urls: canUseBlockletLocList,
114
- saveToRedis: true,
115
- lastmodMap,
116
- // formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
117
- // const pageContent = await page.evaluate(() => {
118
- // const removeElements = (tagName: string) => {
119
- // const elements = document.querySelectorAll(tagName);
120
- // for (let i = elements.length - 1; i >= 0; i--) {
121
- // try {
122
- // elements[i]?.parentNode?.removeChild(elements[i] as Node);
123
- // } catch (error) {
124
- // // do noting
125
- // }
126
- // }
127
- // };
128
- // // remove script, style, link, noscript
129
- // // removeElements('script');
130
- // // removeElements('style');
131
- // // removeElements('link');
132
- // // removeElements('noscript');
133
- // // remove uploader
134
- // removeElements('[id="uploader-container"]');
135
- // removeElements('[class^="uppy-"]');
136
- // // remove point up component
137
- // removeElements('[id="point-up-component"]');
138
- // // add meta tag to record crawler
139
- // const meta = document.createElement('meta');
140
- // meta.name = 'blocklet-crawler';
141
- // meta.content = 'true';
142
- // document.head.appendChild(meta);
143
- // return document.documentElement.outerHTML;
144
- // });
145
- // return pageContent;
146
- // },
147
- });
148
- logger.info(...crawlerLogText('success'));
149
- yield closeBrowser({
150
- trimCache: true,
151
- });
152
- }
153
- catch (error) {
154
- logger.info('Crawler blocklet abort by error', error);
155
- }
156
- finally {
157
- // delete crawl blocklet running
158
- crawlBlockletRunningMap.delete(did);
159
- }
160
- });
161
- const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
162
- let cronCrawlBlockletJob = null;
163
- // init cron crawl blocklet
164
- export const initCronCrawlBlocklet = ({ time = '0 0 */12 * * *', // every 12 hours
165
- options, } = {}) => {
166
- if (!cronCrawlBlockletJob) {
167
- cronCrawlBlockletJob = Cron.init({
168
- context: {},
169
- jobs: [
170
- {
171
- name: CRON_CRAWL_BLOCKLET_KEY,
172
- time,
173
- fn: debounce(crawlBlocklet),
174
- options: Object.assign({ runOnInit: false }, options),
175
- },
176
- ],
177
- onError: (err) => {
178
- console.error('run job failed', err);
179
- },
180
- });
181
- }
182
- return cronCrawlBlockletJob;
183
- };
184
- export const cancelCronCrawlBlocklet = () => {
185
- if (cronCrawlBlockletJob) {
186
- cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
187
- cronCrawlBlockletJob = null;
188
- logger.info('Cron crawl blocklet stop, clear crawl queue');
189
- }
190
- };
package/esm/cache.d.ts DELETED
@@ -1,10 +0,0 @@
1
- export declare const cachePool: import("generic-pool").Pool<any>;
2
- export declare const memoryPool: import("generic-pool").Pool<Map<string, any>>;
3
- export declare const withCache: (cb: Function) => Promise<any>;
4
- export declare const formatKey: (key: string) => string;
5
- export declare const useCache: {
6
- get: (key: string) => Promise<any>;
7
- set: (key: string, value: any, options?: any) => Promise<any>;
8
- remove: (key: string) => Promise<any>;
9
- list: (key?: string) => Promise<any>;
10
- };
package/esm/cache.js DELETED
@@ -1,114 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- var _a;
11
- import { createPool } from 'generic-pool';
12
- import { createClient } from 'redis';
13
- import { config, logger } from './config';
14
- const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
15
- const MAX_REDIS_RETRY = 3;
16
- const ttl = 1000 * 60 * 60 * 24 * 7;
17
- export const cachePool = createPool({
18
- create: () => __awaiter(void 0, void 0, void 0, function* () {
19
- try {
20
- const { redisUrl } = config;
21
- const redisClient = createClient({
22
- url: redisUrl,
23
- socket: {
24
- // @ts-ignore
25
- reconnectStrategy: (retries) => {
26
- if (retries >= MAX_REDIS_RETRY) {
27
- return new Error('Retry Time Exhausted');
28
- }
29
- return Math.min(retries * 500, 1000 * 3);
30
- },
31
- },
32
- });
33
- redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
34
- yield redisClient.connect();
35
- logger.info(`Successfully connected to Redis: ${redisUrl}`);
36
- return redisClient;
37
- }
38
- catch (error) {
39
- logger.warn('Redis connection failed', error);
40
- return null;
41
- }
42
- }),
43
- destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
44
- // if is redis client
45
- if (client.isReady) {
46
- yield client.quit();
47
- }
48
- }),
49
- }, {
50
- max: 2, // 2 clients
51
- min: 0,
52
- // evictionRunIntervalMillis: 0,
53
- });
54
- export const memoryPool = createPool({
55
- create: () => {
56
- const map = new Map();
57
- // @ts-ignore
58
- map.del = map.delete;
59
- return Promise.resolve(map);
60
- },
61
- destroy: (client) => {
62
- client.clear();
63
- return Promise.resolve();
64
- },
65
- }, {
66
- max: 10,
67
- min: 0,
68
- });
69
- export const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
70
- const pool = config.redisUrl ? cachePool : memoryPool;
71
- const client = yield pool.acquire();
72
- if (client) {
73
- try {
74
- return cb(client);
75
- }
76
- finally {
77
- // release client to pool, let other use
78
- yield pool.release(client);
79
- }
80
- }
81
- });
82
- export const formatKey = (key) => {
83
- return `${cacheKeyPrefix}${key}`;
84
- };
85
- export const useCache = {
86
- get: (key) => {
87
- return withCache((client) => __awaiter(void 0, void 0, void 0, function* () {
88
- const value = yield client.get(formatKey(key));
89
- try {
90
- return JSON.parse(value);
91
- }
92
- catch (error) {
93
- // ignore error
94
- }
95
- return value;
96
- }));
97
- },
98
- set: (key, value, options) => {
99
- return withCache((client) => {
100
- const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
101
- return client.set(formatKey(key), formatValue, Object.assign({ PX: ttl }, options));
102
- });
103
- },
104
- remove: (key) => {
105
- return withCache((client) => {
106
- return client.del(formatKey(key));
107
- });
108
- },
109
- list: (key = '*') => {
110
- return withCache((client) => {
111
- return client.keys(formatKey(key));
112
- });
113
- },
114
- };
package/esm/config.d.ts DELETED
@@ -1,10 +0,0 @@
1
- export declare const logger: any;
2
- export declare const config: {
3
- redisUrl: string;
4
- dataDir: string;
5
- appDir: string;
6
- appUrl: string;
7
- puppeteerPath: string;
8
- cacheDir: string;
9
- testOnInitialize: boolean;
10
- };
package/esm/crawler.d.ts DELETED
@@ -1,28 +0,0 @@
1
- import { JobState } from './db/job';
2
- import { SnapshotModel } from './db/snapshot';
3
- export declare function createCrawlQueue(): void;
4
- export declare function getDataDir(): Promise<{
5
- htmlDir: string;
6
- screenshotDir: string;
7
- }>;
8
- export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- formatPageContent?: Function;
11
- includeScreenshot?: boolean;
12
- includeHtml?: boolean;
13
- width?: number;
14
- height?: number;
15
- quality?: number;
16
- timeout?: number;
17
- fullPage?: boolean;
18
- }) => Promise<{
19
- html: string;
20
- screenshot: Uint8Array<ArrayBufferLike> | null;
21
- }>;
22
- export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
23
- export declare function getJob(condition: Partial<JobState>): Promise<any>;
24
- export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
25
- /**
26
- * get snapshot from db or crawl queue
27
- */
28
- export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
package/esm/db/index.d.ts DELETED
@@ -1 +0,0 @@
1
- export declare function ensureDatabase(): Promise<void>;
package/esm/db/job.js DELETED
@@ -1,50 +0,0 @@
1
- import { DataTypes, Model } from '@sequelize/core';
2
- class Job extends Model {
3
- }
4
- export { Job };
5
- export function initJobModel(sequelize) {
6
- Job.init({
7
- id: {
8
- type: DataTypes.STRING(40),
9
- primaryKey: true,
10
- },
11
- queue: {
12
- type: DataTypes.STRING(32),
13
- allowNull: false,
14
- },
15
- job: {
16
- type: DataTypes.JSON,
17
- allowNull: false,
18
- },
19
- retryCount: {
20
- type: DataTypes.INTEGER,
21
- },
22
- delay: {
23
- type: DataTypes.INTEGER,
24
- },
25
- willRunAt: {
26
- type: DataTypes.INTEGER,
27
- },
28
- cancelled: {
29
- type: DataTypes.BOOLEAN,
30
- defaultValue: false,
31
- },
32
- createdAt: {
33
- type: DataTypes.DATE,
34
- defaultValue: DataTypes.NOW,
35
- index: true,
36
- },
37
- updatedAt: {
38
- type: DataTypes.DATE,
39
- defaultValue: DataTypes.NOW,
40
- index: true,
41
- },
42
- }, {
43
- sequelize,
44
- indexes: [{ fields: ['queue'] }],
45
- modelName: 'job',
46
- tableName: 'jobs',
47
- timestamps: true,
48
- });
49
- return Job;
50
- }