@arcblock/crawler 1.0.6 → 1.1.1

Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
package/{esm → lib/esm}/utils.js CHANGED
@@ -7,31 +7,22 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
          step((generator = generator.apply(thisArg, _arguments || [])).next());
      });
  };
- import { components, env } from '@blocklet/sdk/lib/config';
- import axios from 'axios';
+ import Axios from 'axios';
  import flattenDeep from 'lodash/flattenDeep';
  import uniq from 'lodash/uniq';
  import { createHash } from 'node:crypto';
  import robotsParser from 'robots-parser';
  import { parseSitemap } from 'sitemap';
  import { Readable } from 'stream';
- import { joinURL } from 'ufo';
- export const api = axios.create({
-     timeout: 1000 * 10,
+ import { joinURL, withQuery } from 'ufo';
+ import { logger } from './config';
+ export const axios = Axios.create({
+     timeout: 1000 * 30,
      headers: {
          'Content-Type': 'application/json',
      },
  });
- export const sleep = (ms) => {
-     return new Promise((resolve) => {
-         setTimeout(resolve, ms);
-     });
- };
- export const CRAWLER_FLAG = 'x-crawler';
- export const isSelfCrawler = (req) => {
-     const ua = req.get('user-agent') || '';
-     return req.get(CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
- };
+ export const CRAWLER_FLAG = 'x-arcblock-crawler';
  /**
   * A default set of user agent patterns for bots/crawlers that do not perform
   * well with pages that require JavaScript.
@@ -87,12 +78,8 @@ const botUserAgents = [
      /AlibabaGroup/i,
      /adaptive-edge-crawler/i,
  ];
- const isSpider = (ua) => botUserAgents.some((spider) => {
-     return spider.test(ua);
- });
  /**
-  * A default set of file extensions for static assets that do not need to be
-  * proxied.
+  * A default set of file extensions for static assets that do not need to be proxied.
   */
  const staticFileExtensions = [
      'ai',
@@ -137,81 +124,87 @@ const staticFileExtensions = [
      'xml',
      'zip',
  ];
- export const getDefaultRobotsUrl = (url) => {
-     const { origin } = new URL(url);
-     return joinURL(origin, 'robots.txt?nocache=1');
+ export const sleep = (ms) => {
+     return new Promise((resolve) => {
+         setTimeout(resolve, ms);
+     });
+ };
+ /**
+  * Check if the request is a arcblock crawler
+  */
+ export const isSelfCrawler = (req) => {
+     const ua = req.get('user-agent') || '';
+     return req.get(CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
  };
+ /**
+  * Check if the request is a static file
+  */
+ export function isStaticFile(req) {
+     const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+     return excludeUrlPattern.test(req.path);
+ }
+ /**
+  * Check if the request is a spider
+  */
+ export function isSpider(req) {
+     const ua = req.get('user-agent') || '';
+     return botUserAgents.some((spider) => spider.test(ua));
+ }
+ /**
+  * Get and parse the robots.txt by `robots-parser`
+  */
  export function getRobots(url) {
      return __awaiter(this, void 0, void 0, function* () {
          const { origin } = new URL(url);
          const robotsUrl = joinURL(origin, 'robots.txt?nocache=1');
-         const { data } = yield api.get(robotsUrl).catch(() => ({
-             data: '',
-         }));
+         const { data } = yield axios.get(robotsUrl).catch((error) => {
+             logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+             return { data: null };
+         });
          return data ? robotsParser(robotsUrl, data) : null;
      });
  }
- export const getDefaultSitemapUrl = (url) => {
-     const { origin } = new URL(url);
-     return joinURL(origin, 'sitemap.xml?nocache=1');
- };
+ /**
+  * Check if the url is allowed to crawl from robots.txt
+  */
  export const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
      const robots = yield getRobots(url);
      const isAllowed = robots ? yield robots.isAllowed(url) : true;
      return isAllowed;
  });
+ /**
+  * Get and parse the sitemap.xml by `sitemap` package
+  */
  export const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
-     let sitemapUrlList = [getDefaultSitemapUrl(url)];
+     let sitemapUrlList = [];
      const robots = yield getRobots(url);
      if (robots) {
-         const robotsTxtSitemapUrlList = (yield robots.getSitemaps()) || [];
-         if (robotsTxtSitemapUrlList.length > 0) {
-             sitemapUrlList = robotsTxtSitemapUrlList;
-         }
+         sitemapUrlList = (yield robots.getSitemaps()) || [];
+     }
+     if (!sitemapUrlList.length) {
+         const { origin } = new URL(url);
+         sitemapUrlList.push(joinURL(origin, 'sitemap.xml?nocache=1'));
      }
      // loop site map url list
      const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
-         const newUrl = new URL(sitemapUrl);
-         newUrl.searchParams.set('nocache', '1');
-         sitemapUrl = newUrl.toString();
-         const { data: sitemapTxt } = yield api.get(sitemapUrl).catch(() => ({
-             data: '',
-         }));
-         if (sitemapTxt) {
-             const stream = Readable.from([sitemapTxt]);
-             const sitemapJson = yield parseSitemap(stream);
-             return sitemapJson;
+         sitemapUrl = withQuery(sitemapUrl, { nocache: '1' });
+         try {
+             const { data: sitemapTxt } = yield axios.get(sitemapUrl).catch(() => ({
+                 data: '',
+             }));
+             if (sitemapTxt) {
+                 const stream = Readable.from([sitemapTxt]);
+                 const sitemapJson = yield parseSitemap(stream);
+                 return sitemapJson;
+             }
+         }
+         catch (error) {
+             logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
          }
          return [];
      })));
      return uniq(flattenDeep(sitemapList.filter(Boolean)));
  });
- export const isBotUserAgent = (req) => {
-     const ua = req.get('user-agent');
-     const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-     if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-         return false;
-     }
-     return true;
- };
- export const getComponentInfo = () => {
-     return components.find((item) => item.did === env.componentDid) || {};
- };
- export const getFullUrl = (req) => {
-     const blockletPathname = req.headers['x-path-prefix']
-         ? joinURL(req.headers['x-path-prefix'], req.originalUrl)
-         : req.originalUrl;
-     return joinURL(env.appUrl, blockletPathname);
- };
- export const getRelativePath = (url) => {
-     try {
-         return new URL(url).pathname;
-     }
-     catch (error) {
-         // ignore error
-     }
-     return url;
- };
  export const formatUrl = (url) => {
      return url.replace(/\/$/, '').trim();
  };
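Note on the utils changes above: the removed `isBotUserAgent(req)` is effectively split into the two new exports `isSpider(req)` and `isStaticFile(req)` (both now take the Express request rather than a raw user-agent string), `api` is renamed to the exported `axios` instance with the timeout raised from 10s to 30s, and `CRAWLER_FLAG` changes from `x-crawler` to `x-arcblock-crawler`, so clients still sending the old header are no longer detected as the package's own crawler. A minimal consumer sketch of the new helpers; the middleware itself is illustrative, not part of the package:

import type { NextFunction, Request, Response } from 'express';
import { isSelfCrawler, isSpider, isStaticFile } from '@arcblock/crawler/utils';

// Rough equivalent of the removed isBotUserAgent(req), plus a guard so the
// package's own headless crawler is never served a prerendered page again.
export function shouldPrerender(req: Request): boolean {
  return isSpider(req) && !isStaticFile(req) && !isSelfCrawler(req);
}

export function prerenderGate(req: Request, res: Response, next: NextFunction) {
  if (shouldPrerender(req)) {
    res.setHeader('x-prerender-candidate', 'true'); // hypothetical marker header
  }
  next();
}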
package/package.json CHANGED
@@ -1,16 +1,28 @@
  {
    "name": "@arcblock/crawler",
-   "version": "1.0.6",
-   "main": "dist/index.js",
-   "module": "esm/index.js",
-   "types": "dist/index.d.ts",
+   "version": "1.1.1",
+   "main": "lib/cjs/index.js",
+   "module": "lib/esm/index.js",
+   "types": "lib/cjs/index.d.ts",
    "publishConfig": {
      "access": "public"
    },
    "files": [
-     "dist",
-     "esm"
+     "lib",
+     "*.d.ts"
    ],
+   "exports": {
+     ".": {
+       "require": "./lib/cjs/index.js",
+       "import": "./lib/esm/index.js",
+       "types": "./lib/cjs/index.d.ts"
+     },
+     "./utils": {
+       "require": "./lib/cjs/utils.js",
+       "import": "./lib/esm/utils.js",
+       "types": "./lib/cjs/utils.d.ts"
+     }
+   },
    "lint-staged": {
      "*.{mjs,js,jsx,ts,tsx}": [
        "prettier --write",
@@ -36,61 +48,37 @@
      "@abtnode/cron": "^1.16.43",
      "@abtnode/models": "^1.16.43",
      "@abtnode/queue": "^1.16.43",
-     "@arcblock/did-auth": "^1.19.15",
-     "@arcblock/did-auth-storage-nedb": "^1.7.1",
      "@blocklet/logger": "^1.16.43",
      "@blocklet/puppeteer": "^22.11.3",
      "@blocklet/sdk": "^1.16.43",
-     "@ocap/client": "^1.19.15",
-     "@ocap/mcrypto": "^1.19.15",
-     "@ocap/util": "^1.20.11",
-     "@ocap/wallet": "^1.19.15",
      "@sequelize/core": "7.0.0-alpha.46",
      "@sequelize/sqlite3": "7.0.0-alpha.46",
      "axios": "^1.7.9",
-     "cookie-parser": "^1.4.7",
-     "cors": "^2.8.5",
-     "dotenv-flow": "^4.1.0",
-     "express": "^4.21.2",
-     "express-async-errors": "^3.1.1",
      "fs-extra": "^11.2.0",
      "generic-pool": "^3.9.0",
      "lodash": "^4.17.21",
      "lru-cache": "^10.4.3",
-     "p-queue": "6.6.2",
-     "p-wait-for": "^5.0.2",
      "redis": "^4.7.0",
      "robots-parser": "^3.0.1",
      "sequelize": "^6.37.7",
      "sitemap": "^7.1.2",
      "sqlite3": "^5.1.7",
      "ufo": "^1.5.4",
-     "url-join": "^4.0.1"
+     "p-map": "^7.0.3"
    },
    "devDependencies": {
      "@blocklet/js-sdk": "^1.16.39",
-     "@types/cookie-parser": "^1.4.8",
-     "@types/cors": "^2.8.17",
      "@types/dotenv-flow": "^3.3.3",
      "@types/express": "^4.17.21",
      "@types/fs-extra": "^11.0.4",
      "@types/lodash": "^4.17.16",
      "@types/node": "^20.17.19",
-     "@types/react": "^18.3.18",
-     "@types/react-dom": "^18.3.5",
-     "@vitejs/plugin-react": "^4.3.4",
+     "express": "^4.21.2",
      "bumpp": "^9.11.1",
      "nodemon": "^3.1.9",
      "npm-run-all": "^4.1.5",
      "puppeteer": "^24.8.2",
-     "react": "~18.2.0",
-     "react-dom": "~18.2.0",
-     "react-router-dom": "^6.29.0",
-     "rimraf": "^5.0.10",
      "tsx": "^4.19.3",
-     "vite": "^5.4.14",
-     "vite-plugin-blocklet": "^0.9.32",
-     "vite-plugin-svgr": "^4.3.0",
      "zx": "^8.3.2"
    },
    "importSort": {
package/third.d.ts ADDED
File without changes
package/dist/blocklet.d.ts DELETED
@@ -1,6 +0,0 @@
- export declare const crawlBlocklet: () => Promise<void>;
- export declare const initCronCrawlBlocklet: ({ time, options, }?: {
-     time: string;
-     options: any;
- }) => any;
- export declare const cancelCronCrawlBlocklet: () => void;
package/dist/blocklet.js DELETED
@@ -1,199 +0,0 @@
- "use strict";
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-     return new (P || (P = Promise))(function (resolve, reject) {
-         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-         step((generator = generator.apply(thisArg, _arguments || [])).next());
-     });
- };
- var __importDefault = (this && this.__importDefault) || function (mod) {
-     return (mod && mod.__esModule) ? mod : { "default": mod };
- };
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.cancelCronCrawlBlocklet = exports.initCronCrawlBlocklet = exports.crawlBlocklet = void 0;
- const cron_1 = __importDefault(require("@abtnode/cron"));
- const config_1 = require("@blocklet/sdk/lib/config");
- const debounce_1 = __importDefault(require("lodash/debounce"));
- const ufo_1 = require("ufo");
- const cache_1 = require("./cache");
- const config_2 = require("./config");
- const crawler_1 = require("./crawler");
- const puppeteer_1 = require("./puppeteer");
- const utils_1 = require("./utils");
- // record crawl blocklet running
- const crawlBlockletRunningMap = new Map();
- // crawl blocklet sitemap urls
- const crawlBlocklet = () => __awaiter(void 0, void 0, void 0, function* () {
-     // @ts-ignore
-     const { mountPoint, did } = (0, utils_1.getComponentInfo)();
-     if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
-         config_2.logger.info(`Crawler blocklet ${did} is running, skip it`);
-         return;
-     }
-     // check has browser can use
-     try {
-         const browser = yield (0, puppeteer_1.getBrowser)();
-         if (!browser) {
-             throw new Error('No Browser can use');
-         }
-         config_2.logger.info('Crawler blocklet existing can use browser');
-     }
-     catch (error) {
-         config_2.logger.info(`Crawler blocklet abort by error: ${(error === null || error === void 0 ? void 0 : error.message) || (error === null || error === void 0 ? void 0 : error.reason) || error}`);
-         return;
-     }
-     const { appUrl } = config_2.config;
-     if (!appUrl) {
-         throw new Error('appUrl not found');
-     }
-     const sitemapList = yield (0, utils_1.getSitemapList)(appUrl);
-     const matchMountPoint = (0, ufo_1.joinURL)(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
-     const otherMountPointList = config_1.components
-         .filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
-         .map((item) => item.mountPoint);
-     // get can use loc
-     const blockletLocList = sitemapList.filter((item) => {
-         var _a;
-         if (mountPoint !== '/') {
-             return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(matchMountPoint)) > -1;
-         }
-         // if mountPoint is /, skip other mountPoint
-         return otherMountPointList.every((mountPoint) => { var _a; return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(mountPoint)) === -1; });
-     });
-     const canUseBlockletLocList = [];
-     const lastmodMap = new Map();
-     let skipBlockletLocTotal = 0;
-     let blockletLocTotal = 0;
-     yield Promise.all(blockletLocList.map((item) => __awaiter(void 0, void 0, void 0, function* () {
-         var _a;
-         let tempLocList = [];
-         if (item.url) {
-             tempLocList.push(item.url);
-         }
-         if (((_a = item === null || item === void 0 ? void 0 : item.links) === null || _a === void 0 ? void 0 : _a.length) > 0) {
-             tempLocList.push(...item.links.map((ytem) => ytem.url));
-         }
-         blockletLocTotal += tempLocList.length;
-         // @ts-ignore
-         tempLocList = (yield Promise.all(tempLocList.map((loc) => __awaiter(void 0, void 0, void 0, function* () {
-             try {
-                 const { lastModified: cacheLastModified } = yield cache_1.useCache.get((0, utils_1.getRelativePath)(loc));
-                 // sitemap item lastmod is same as cache lastModified, skip it
-                 if (item.lastmod &&
-                     cacheLastModified &&
-                     new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()) {
-                     skipBlockletLocTotal++;
-                     return false;
-                 }
-                 return loc;
-             }
-             catch (error) {
-                 // ignore error
-             }
-             // if can not get cache, return loc
-             return loc;
-         })))).filter(Boolean);
-         tempLocList.forEach((loc) => {
-             if (item.lastmod)
-                 lastmodMap.set(loc, item.lastmod);
-         });
-         canUseBlockletLocList.push(...tempLocList);
-     })));
-     const crawlerLogText = (step = '') => [
-         `Crawler sitemap.xml about ${did} ${step}: `,
-         {
-             blockletLocTotal,
-             canUseBlockletLocTotal: canUseBlockletLocList.length,
-             skipBlockletLocTotal,
-             lastmodMapTotal: lastmodMap.size,
-         },
-     ];
-     config_2.logger.info(...crawlerLogText('start'));
-     try {
-         // record crawl blocklet running
-         crawlBlockletRunningMap.set(did, true);
-         yield (0, crawler_1.createCrawlJob)({
-             // @ts-ignore
-             urls: canUseBlockletLocList,
-             saveToRedis: true,
-             lastmodMap,
-             // formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
-             // const pageContent = await page.evaluate(() => {
-             // const removeElements = (tagName: string) => {
-             // const elements = document.querySelectorAll(tagName);
-             // for (let i = elements.length - 1; i >= 0; i--) {
-             // try {
-             // elements[i]?.parentNode?.removeChild(elements[i] as Node);
-             // } catch (error) {
-             // // do noting
-             // }
-             // }
-             // };
-             // // remove script, style, link, noscript
-             // // removeElements('script');
-             // // removeElements('style');
-             // // removeElements('link');
-             // // removeElements('noscript');
-             // // remove uploader
-             // removeElements('[id="uploader-container"]');
-             // removeElements('[class^="uppy-"]');
-             // // remove point up component
-             // removeElements('[id="point-up-component"]');
-             // // add meta tag to record crawler
-             // const meta = document.createElement('meta');
-             // meta.name = 'blocklet-crawler';
-             // meta.content = 'true';
-             // document.head.appendChild(meta);
-             // return document.documentElement.outerHTML;
-             // });
-             // return pageContent;
-             // },
-         });
-         config_2.logger.info(...crawlerLogText('success'));
-         yield (0, puppeteer_1.closeBrowser)({
-             trimCache: true,
-         });
-     }
-     catch (error) {
-         config_2.logger.info('Crawler blocklet abort by error', error);
-     }
-     finally {
-         // delete crawl blocklet running
-         crawlBlockletRunningMap.delete(did);
-     }
- });
- exports.crawlBlocklet = crawlBlocklet;
- const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
- let cronCrawlBlockletJob = null;
- // init cron crawl blocklet
- const initCronCrawlBlocklet = ({ time = '0 0 */12 * * *', // every 12 hours
- options, } = {}) => {
-     if (!cronCrawlBlockletJob) {
-         cronCrawlBlockletJob = cron_1.default.init({
-             context: {},
-             jobs: [
-                 {
-                     name: CRON_CRAWL_BLOCKLET_KEY,
-                     time,
-                     fn: (0, debounce_1.default)(exports.crawlBlocklet),
-                     options: Object.assign({ runOnInit: false }, options),
-                 },
-             ],
-             onError: (err) => {
-                 console.error('run job failed', err);
-             },
-         });
-     }
-     return cronCrawlBlockletJob;
- };
- exports.initCronCrawlBlocklet = initCronCrawlBlocklet;
- const cancelCronCrawlBlocklet = () => {
-     if (cronCrawlBlockletJob) {
-         cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
-         cronCrawlBlockletJob = null;
-         config_2.logger.info('Cron crawl blocklet stop, clear crawl queue');
-     }
- };
- exports.cancelCronCrawlBlocklet = cancelCronCrawlBlocklet;
package/dist/cache.d.ts DELETED
@@ -1,10 +0,0 @@
- export declare const cachePool: import("generic-pool").Pool<any>;
- export declare const memoryPool: import("generic-pool").Pool<Map<string, any>>;
- export declare const withCache: (cb: Function) => Promise<any>;
- export declare const formatKey: (key: string) => string;
- export declare const useCache: {
-     get: (key: string) => Promise<any>;
-     set: (key: string, value: any, options?: any) => Promise<any>;
-     remove: (key: string) => Promise<any>;
-     list: (key?: string) => Promise<any>;
- };
package/dist/cache.js DELETED
@@ -1,119 +0,0 @@
- "use strict";
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-     return new (P || (P = Promise))(function (resolve, reject) {
-         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-         step((generator = generator.apply(thisArg, _arguments || [])).next());
-     });
- };
- var _a;
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.useCache = exports.formatKey = exports.withCache = exports.memoryPool = exports.cachePool = void 0;
- const generic_pool_1 = require("generic-pool");
- const redis_1 = require("redis");
- const config_1 = require("./config");
- const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
- const MAX_REDIS_RETRY = 3;
- const ttl = 1000 * 60 * 60 * 24 * 7;
- exports.cachePool = (0, generic_pool_1.createPool)({
-     create: () => __awaiter(void 0, void 0, void 0, function* () {
-         try {
-             const { redisUrl } = config_1.config;
-             const redisClient = (0, redis_1.createClient)({
-                 url: redisUrl,
-                 socket: {
-                     // @ts-ignore
-                     reconnectStrategy: (retries) => {
-                         if (retries >= MAX_REDIS_RETRY) {
-                             return new Error('Retry Time Exhausted');
-                         }
-                         return Math.min(retries * 500, 1000 * 3);
-                     },
-                 },
-             });
-             redisClient.on('error', (err) => config_1.logger.warn('Redis Client Error:', err));
-             yield redisClient.connect();
-             config_1.logger.info(`Successfully connected to Redis: ${redisUrl}`);
-             return redisClient;
-         }
-         catch (error) {
-             config_1.logger.warn('Redis connection failed', error);
-             return null;
-         }
-     }),
-     destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
-         // if is redis client
-         if (client.isReady) {
-             yield client.quit();
-         }
-     }),
- }, {
-     max: 2, // 2 clients
-     min: 0,
-     // evictionRunIntervalMillis: 0,
- });
- exports.memoryPool = (0, generic_pool_1.createPool)({
-     create: () => {
-         const map = new Map();
-         // @ts-ignore
-         map.del = map.delete;
-         return Promise.resolve(map);
-     },
-     destroy: (client) => {
-         client.clear();
-         return Promise.resolve();
-     },
- }, {
-     max: 10,
-     min: 0,
- });
- const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
-     const pool = config_1.config.redisUrl ? exports.cachePool : exports.memoryPool;
-     const client = yield pool.acquire();
-     if (client) {
-         try {
-             return cb(client);
-         }
-         finally {
-             // release client to pool, let other use
-             yield pool.release(client);
-         }
-     }
- });
- exports.withCache = withCache;
- const formatKey = (key) => {
-     return `${cacheKeyPrefix}${key}`;
- };
- exports.formatKey = formatKey;
- exports.useCache = {
-     get: (key) => {
-         return (0, exports.withCache)((client) => __awaiter(void 0, void 0, void 0, function* () {
-             const value = yield client.get((0, exports.formatKey)(key));
-             try {
-                 return JSON.parse(value);
-             }
-             catch (error) {
-                 // ignore error
-             }
-             return value;
-         }));
-     },
-     set: (key, value, options) => {
-         return (0, exports.withCache)((client) => {
-             const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
-             return client.set((0, exports.formatKey)(key), formatValue, Object.assign({ PX: ttl }, options));
-         });
-     },
-     remove: (key) => {
-         return (0, exports.withCache)((client) => {
-             return client.del((0, exports.formatKey)(key));
-         });
-     },
-     list: (key = '*') => {
-         return (0, exports.withCache)((client) => {
-             return client.keys((0, exports.formatKey)(key));
-         });
-     },
- };
package/dist/config.d.ts DELETED
@@ -1,10 +0,0 @@
- export declare const logger: any;
- export declare const config: {
-     redisUrl: string;
-     dataDir: string;
-     appDir: string;
-     appUrl: string;
-     puppeteerPath: string;
-     cacheDir: string;
-     testOnInitialize: boolean;
- };
package/dist/crawler.d.ts DELETED
@@ -1,28 +0,0 @@
- import { JobState } from './db/job';
- import { SnapshotModel } from './db/snapshot';
- export declare function createCrawlQueue(): void;
- export declare function getDataDir(): Promise<{
-     htmlDir: string;
-     screenshotDir: string;
- }>;
- export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
-     url: string;
-     formatPageContent?: Function;
-     includeScreenshot?: boolean;
-     includeHtml?: boolean;
-     width?: number;
-     height?: number;
-     quality?: number;
-     timeout?: number;
-     fullPage?: boolean;
- }) => Promise<{
-     html: string;
-     screenshot: Uint8Array<ArrayBufferLike> | null;
- }>;
- export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
- export declare function getJob(condition: Partial<JobState>): Promise<any>;
- export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
- /**
-  * get snapshot from db or crawl queue
-  */
- export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
package/dist/db/index.d.ts DELETED
@@ -1 +0,0 @@
- export declare function ensureDatabase(): Promise<void>;