@arcblock/crawler 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/dist/blocklet.d.ts +6 -0
  2. package/dist/blocklet.js +199 -0
  3. package/dist/cache.d.ts +10 -0
  4. package/dist/cache.js +119 -0
  5. package/dist/config.d.ts +10 -0
  6. package/dist/config.js +17 -0
  7. package/dist/crawler.d.ts +28 -0
  8. package/dist/crawler.js +314 -0
  9. package/dist/db/index.d.ts +1 -0
  10. package/dist/db/index.js +41 -0
  11. package/dist/db/job.d.ts +33 -0
  12. package/dist/db/job.js +54 -0
  13. package/dist/db/snapshot.d.ts +31 -0
  14. package/dist/db/snapshot.js +52 -0
  15. package/dist/index.d.ts +6 -0
  16. package/dist/index.js +45 -0
  17. package/dist/middleware.d.ts +4 -0
  18. package/dist/middleware.js +44 -0
  19. package/dist/puppeteer.d.ts +16 -0
  20. package/dist/puppeteer.js +318 -0
  21. package/dist/utils.d.ts +15 -0
  22. package/dist/utils.js +239 -0
  23. package/esm/blocklet.d.ts +6 -0
  24. package/esm/blocklet.js +190 -0
  25. package/esm/cache.d.ts +10 -0
  26. package/esm/cache.js +114 -0
  27. package/esm/config.d.ts +10 -0
  28. package/esm/config.js +11 -0
  29. package/esm/crawler.d.ts +28 -0
  30. package/esm/crawler.js +301 -0
  31. package/esm/db/index.d.ts +1 -0
  32. package/esm/db/index.js +35 -0
  33. package/esm/db/job.d.ts +33 -0
  34. package/esm/db/job.js +50 -0
  35. package/esm/db/snapshot.d.ts +31 -0
  36. package/esm/db/snapshot.js +48 -0
  37. package/esm/index.d.ts +6 -0
  38. package/esm/index.js +26 -0
  39. package/esm/middleware.d.ts +4 -0
  40. package/esm/middleware.js +41 -0
  41. package/esm/puppeteer.d.ts +16 -0
  42. package/esm/puppeteer.js +272 -0
  43. package/esm/utils.d.ts +15 -0
  44. package/esm/utils.js +220 -0
  45. package/package.json +10 -3
  46. package/src/blocklet.ts +0 -223
  47. package/src/cache.ts +0 -117
  48. package/src/config.ts +0 -13
  49. package/src/crawler.ts +0 -364
  50. package/src/db/index.ts +0 -27
  51. package/src/db/job.ts +0 -93
  52. package/src/db/snapshot.ts +0 -89
  53. package/src/index.ts +0 -19
  54. package/src/middleware.ts +0 -46
  55. package/src/puppeteer.ts +0 -296
  56. package/src/utils.ts +0 -240
  57. package/third.d.ts +0 -1
  58. package/tsconfig.json +0 -9
package/src/puppeteer.ts DELETED
@@ -1,296 +0,0 @@
- // import fs from 'fs-extra';
- // import path from 'path';
- import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
- import { env } from '@blocklet/sdk/lib/config';
- import fs from 'fs-extra';
- import path from 'path';
- import { clearInterval, setInterval } from 'timers';
-
- import { useCache } from './cache';
- import { config, logger } from './config';
- import { CRAWLER_FLAG, sleep } from './utils';
-
- // let puppeteerConfig: {
- //   cacheDirectory: string;
- //   temporaryDirectory: string;
- // };
-
- const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${env.appId || 'unknown'}`;
-
- const BrowserStatus = {
-   Launching: 'Launching',
-   Ready: 'Ready',
- };
-
- let browser: Browser | null;
- let browserActivatedTimer: NodeJS.Timeout | null;
-
- export { puppeteer };
-
- export async function ensurePuppeteerrc() {
-   const cacheDirectory = path.join(config.cacheDir, 'puppeteer', 'cache');
-   const temporaryDirectory = path.join(config.cacheDir, 'puppeteer', 'tmp');
-   const puppeteerrcPath = path.join(config.appDir, '.puppeteerrc.js');
-
-   // ensure directory exists
-   await Promise.all([fs.ensureDir(cacheDirectory), fs.ensureDir(temporaryDirectory), fs.ensureFile(puppeteerrcPath)]);
-
-   const puppeteerConfig = {
-     cacheDirectory,
-     temporaryDirectory,
-   };
-
-   const fileContent = `module.exports = ${JSON.stringify(puppeteerConfig, null, 2)}`;
-   await fs.writeFile(puppeteerrcPath, fileContent);
-
-   logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
-
-   return puppeteerConfig;
- }
-
- export async function ensureBrowser() {
-   const puppeteerConfig = await ensurePuppeteerrc();
-
-   const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
-
-   logger.info('executablePath', executablePath);
-
-   if (!fs.existsSync(executablePath)) {
-     logger.info('start download browser', puppeteerConfig);
-     const { downloadBrowser } = await (async () => {
-       try {
-         // @ts-ignore
-         // eslint-disable-next-line import/extensions
-         return await import('@blocklet/puppeteer/internal/node/install.js');
-       } catch (err) {
-         logger.warn(
-           'Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.',
-         );
-       }
-     })();
-
-     if (downloadBrowser) {
-       await downloadBrowser();
-       logger.info('Browser download completed successfully');
-     }
-   }
-
-   // try to launch browser
-   if (config.testOnInitialize) {
-     const browser = await launchBrowser();
-     if (!browser) {
-       throw new Error('Failed to launch browser');
-     }
-     await closeBrowser();
-   }
-
-   logger.info('Puppeteer is ready');
- }
-
- export async function connectBrowser() {
-   const browserWSEndpoint = await useCache.get(BROWSER_WS_ENDPOINT_KEY);
-
-   if (!browserWSEndpoint) {
-     return null;
-   }
-
-   // retry if browser is launching
-   if (browserWSEndpoint.status === BrowserStatus.Launching) {
-     await sleep(Math.floor(Math.random() * 1000));
-     return connectBrowser();
-   }
-
-   try {
-     browser = await puppeteer.connect({
-       browserWSEndpoint: browserWSEndpoint.endpoint,
-     });
-     logger.info('Connect browser success');
-   } catch (err) {
-     logger.warn('Connect browser failed, clear endpoint', err);
-     await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
-     return null;
-   }
-
-   return browser;
- }
-
- export async function launchBrowser() {
-   await useCache.set(BROWSER_WS_ENDPOINT_KEY, {
-     endpoint: null,
-     status: BrowserStatus.Launching,
-   });
-
-   try {
-     // @ts-ignore
-     browser = await puppeteer.launch({
-       headless: true,
-       args: [
-         // docs: https://peter.sh/experiments/chromium-command-line-switches/
-         '--no-first-run',
-         '--hide-scrollbars',
-         '--no-sandbox',
-         '--no-zygote',
-         '--disable-setuid-sandbox',
-         '--disable-gpu',
-         '--disable-dev-shm-usage',
-         '--disable-site-isolation-trials',
-         '--disable-accelerated-2d-canvas',
-         '--disable-extensions',
-         '--js-flags=--max_old_space_size=512', // cap V8 memory
-         '--disable-background-networking',
-         '--disable-default-apps',
-         // '--disable-web-security', // allow cross-origin requests
-         '--disable-software-rasterizer',
-         '--disable-crash-reporter',
-         '--disable-service-workers',
-         '--disable-notifications',
-         '--disable-infobars',
-         '--font-render-hinting=none',
-       ],
-     });
-     logger.info('Launch browser success');
-   } catch (error) {
-     logger.error('launch browser failed: ', error);
-     // cleanup browser endpoint
-     await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
-     throw error;
-   }
-
-   // save browserWSEndpoint to cache
-   const endpoint = await browser!.wsEndpoint();
-   await useCache.set(BROWSER_WS_ENDPOINT_KEY, {
-     endpoint,
-     status: BrowserStatus.Ready,
-   });
-
-   return browser;
- }
-
- function checkBrowserActivated() {
-   clearBrowserActivatedTimer();
-
-   let count = 0;
-
-   browserActivatedTimer = setInterval(async () => {
-     if (browser) {
-       const pages = await browser.pages().catch(() => [] as Page[]);
-       if (pages.length === 1 && pages[0]?.url() === 'about:blank') {
-         count++;
-         logger.debug(`Browser inactive count: ${count}/3`);
-       } else {
-         count = 0; // reset the counter!
-       }
-       if (count >= 3) {
-         logger.info('Browser inactive for 3 minutes, closing...');
-         await closeBrowser({
-           trimCache: true,
-         });
-       }
-     }
-   }, 1000 * 60);
- }
-
- function clearBrowserActivatedTimer() {
-   if (browserActivatedTimer) {
-     clearInterval(browserActivatedTimer);
-     browserActivatedTimer = null;
-   }
- }
-
- export const getBrowser = async () => {
-   if (browser) return browser;
-
-   // sleep for a random time (0 ~ 5s) to avoid concurrent blocklet launches
-   await sleep(Math.floor(Math.random() * 1000 * 5));
-
-   // try to connect browser
-   const connectedBrowser = await connectBrowser();
-   if (connectedBrowser) {
-     logger.debug('getBrowser.connectedBrowser');
-     browser = connectedBrowser;
-     return browser;
-   }
-
-   // try to launch browser
-   const launchedBrowser = await launchBrowser();
-   if (launchedBrowser) {
-     logger.debug('getBrowser.launchedBrowser');
-     browser = launchedBrowser;
-     checkBrowserActivated();
-     return browser;
-   }
-
-   throw new Error('No browser to use, should install redis or browser');
- };
-
- export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean } = {}) => {
-   if (!browser) return;
-
-   // close all pages
-   try {
-     const pages = await browser.pages();
-     await Promise.all(pages.map((page) => page.close()));
-   } catch (err) {
-     logger.error('Failed to close all pages:', err);
-   }
-
-   // close browser
-   try {
-     await browser.close();
-   } catch (err) {
-     logger.error('Failed to close browser:', err);
-   }
-
-   // clear cache
-   try {
-     if (trimCache) {
-       await puppeteer.trimCache();
-       logger.info('Trim cache success');
-     }
-
-     // try to clear temporary directory
-     // if (puppeteerConfig) {
-     //   await fs.emptyDir(puppeteerConfig.temporaryDirectory);
-     // }
-
-     if (global.gc) {
-       global.gc();
-     }
-   } catch (err) {
-     logger.error('Failed to clear browser cache:', err);
-   }
-
-   browser = null;
-
-   clearBrowserActivatedTimer();
-   await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
-
-   logger.info('Close browser success');
- };
-
- export async function initPage({ abortResourceTypes = [] } = {}) {
-   const browser = await getBrowser();
-   const page = await browser.newPage();
-   await page.setViewport({ width: 1440, height: 900 });
-
-   // page setting
-   // add custom headers
-   await page.setExtraHTTPHeaders({
-     [CRAWLER_FLAG]: 'true',
-   });
-
-   // abort resource types
-   if (abortResourceTypes.length > 0) {
-     await page.setRequestInterception(true);
-
-     page.on('request', (req: any) => {
-       // @ts-ignore
-       if (abortResourceTypes.includes(req.resourceType())) {
-         return req.abort();
-       }
-       return req.continue();
-     });
-   }
-
-   return page;
- }
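
Taken together, the deleted module above implements a shared-browser lifecycle: getBrowser() first tries to connect to a cached WS endpoint, falls back to launching a new browser, and an idle timer closes it after three quiet minutes. A minimal consumer sketch, assuming these helpers remain exported from the package's built entry point (the import path, resource types, and goto options below are illustrative, not confirmed by this diff):

  import { initPage } from '@arcblock/crawler';

  async function renderHtml(url: string) {
    // initPage() resolves the shared browser (connect-or-launch) and tags the page
    // with the x-crawler header so the app can recognize its own traffic
    const page = await initPage({ abortResourceTypes: ['image', 'media', 'font'] });
    try {
      await page.goto(url, { waitUntil: 'networkidle0' });
      return await page.content();
    } finally {
      // close only the page; the shared browser is reaped by the idle timer or closeBrowser()
      await page.close();
    }
  }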
package/src/utils.ts DELETED
@@ -1,240 +0,0 @@
- import { components, env } from '@blocklet/sdk/lib/config';
- import axios from 'axios';
- import flattenDeep from 'lodash/flattenDeep';
- import uniq from 'lodash/uniq';
- import { createHash } from 'node:crypto';
- import robotsParser from 'robots-parser';
- import { parseSitemap } from 'sitemap';
- import { Readable } from 'stream';
- import { joinURL } from 'ufo';
-
- export const api = axios.create({
-   timeout: 1000 * 10,
-   headers: {
-     'Content-Type': 'application/json',
-   },
- });
-
- export const sleep = (ms: number) => {
-   return new Promise((resolve) => {
-     setTimeout(resolve, ms);
-   });
- };
-
- export const CRAWLER_FLAG = 'x-crawler';
-
- export const isSelfCrawler = (req: any) => {
-   const ua = req.get('user-agent') || '';
-   return req.get(CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
- };
-
- /**
-  * A default set of user agent patterns for bots/crawlers that do not perform
-  * well with pages that require JavaScript.
-  */
- const botUserAgents = [
-   /bot/i,
-   /spider/i,
-   /facebookexternalhit/i,
-   /simplepie/i,
-   /yahooseeker/i,
-   /embedly/i,
-   /quora link preview/i,
-   /outbrain/i,
-   /vkshare/i,
-   /monit/i,
-   /Pingability/i,
-   /Monitoring/i,
-   /WinHttpRequest/i,
-   /Apache-HttpClient/i,
-   /getprismatic.com/i,
-   /python-requests/i,
-   /Twurly/i,
-   /yandex/i,
-   /browserproxy/i,
-   /crawler/i,
-   /Qwantify/i,
-   /Yahoo/i,
-   /pinterest/i,
-   /Tumblr/i,
-   /Tumblr Agent/i,
-   /WhatsApp/i,
-   /Google-Structured-Data-Testing-Tool/i,
-   /Google-InspectionTool/i,
-   /Googlebot/i,
-   /GPTBot/i,
-   /Applebot/i,
-
-   // AI bots
-   /Anthropic-ai/i,
-   /Claude-Web/i,
-   /anthropic-ai-scraper/i,
-   /Google-Extended/i,
-   /GoogleOther/i,
-   /CCBot\/\d/i,
-   /Bytespider/i,
-   /BingBot/i,
-   /Baiduspider/i,
-   /Sogou/i,
-   /Perplexity/i,
-   /Cohere-ai/i,
-   /xlts-bot/i,
-   /THAAS/i,
-   /YisouSpider/i,
-   /AlibabaGroup/i,
-   /adaptive-edge-crawler/i,
- ];
-
- const isSpider = (ua: string) =>
-   botUserAgents.some((spider) => {
-     return spider.test(ua);
-   });
-
- /**
-  * A default set of file extensions for static assets that do not need to be
-  * proxied.
-  */
- const staticFileExtensions = [
-   'ai',
-   'avi',
-   'css',
-   'dat',
-   'dmg',
-   'doc',
-   'doc',
-   'exe',
-   'flv',
-   'gif',
-   'ico',
-   'iso',
-   'jpeg',
-   'jpg',
-   'js',
-   'less',
-   'm4a',
-   'm4v',
-   'mov',
-   'mp3',
-   'mp4',
-   'mpeg',
-   'mpg',
-   'pdf',
-   'png',
-   'ppt',
-   'psd',
-   'rar',
-   'rss',
-   'svg',
-   'swf',
-   'tif',
-   'torrent',
-   'ttf',
-   'txt',
-   'wav',
-   'wmv',
-   'woff',
-   'xls',
-   'xml',
-   'zip',
- ];
-
- export const getDefaultRobotsUrl = (url: string) => {
-   const { origin } = new URL(url);
-   return joinURL(origin, 'robots.txt?nocache=1');
- };
-
- export async function getRobots(url: string) {
-   const { origin } = new URL(url);
-   const robotsUrl = joinURL(origin, 'robots.txt?nocache=1');
-   const { data } = await api.get(robotsUrl).catch(() => ({
-     data: '',
-   }));
-
-   return data ? robotsParser(robotsUrl, data) : null;
- }
-
- export const getDefaultSitemapUrl = (url: string) => {
-   const { origin } = new URL(url);
-   return joinURL(origin, 'sitemap.xml?nocache=1');
- };
-
- export const isAcceptCrawler = async (url: string) => {
-   const robots = await getRobots(url);
-   const isAllowed = robots ? await robots.isAllowed(url) : true;
-   return isAllowed;
- };
-
- export const getSitemapList = async (url: string) => {
-   let sitemapUrlList = [getDefaultSitemapUrl(url)];
-   const robots = await getRobots(url);
-
-   if (robots) {
-     const robotsTxtSitemapUrlList = (await robots.getSitemaps()) || [];
-     if (robotsTxtSitemapUrlList.length > 0) {
-       sitemapUrlList = robotsTxtSitemapUrlList;
-     }
-   }
-
-   // loop sitemap url list
-   const sitemapList = await Promise.all(
-     sitemapUrlList.map(async (sitemapUrl) => {
-       const newUrl = new URL(sitemapUrl);
-       newUrl.searchParams.set('nocache', '1');
-       sitemapUrl = newUrl.toString();
-
-       const { data: sitemapTxt } = await api.get(sitemapUrl).catch(() => ({
-         data: '',
-       }));
-
-       if (sitemapTxt) {
-         const stream = Readable.from([sitemapTxt]);
-         const sitemapJson = await parseSitemap(stream);
-         return sitemapJson;
-       }
-
-       return [];
-     }),
-   );
-
-   return uniq(flattenDeep(sitemapList.filter(Boolean)));
- };
-
- export const isBotUserAgent = (req: any) => {
-   const ua = req.get('user-agent');
-   const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-
-   if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-     return false;
-   }
-
-   return true;
- };
-
- export const getComponentInfo = () => {
-   return components.find((item) => item.did === env.componentDid) || {};
- };
-
- export const getFullUrl = (req) => {
-   const blockletPathname = req.headers['x-path-prefix']
-     ? joinURL(req.headers['x-path-prefix'], req.originalUrl)
-     : req.originalUrl;
-
-   return joinURL(env.appUrl, blockletPathname);
- };
-
- export const getRelativePath = (url: string) => {
-   try {
-     return new URL(url).pathname;
-   } catch (error) {
-     // ignore error
-   }
-   return url;
- };
-
- export const formatUrl = (url: string) => {
-   return url.replace(/\/$/, '').trim();
- };
-
- export function md5(content: string | Uint8Array) {
-   return createHash('md5').update(content).digest('hex');
- }
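
The helpers above form the package's bot-detection layer: isBotUserAgent() matches the user-agent patterns while skipping static-asset paths, and isSelfCrawler() recognizes the package's own headless browser (which sends the x-crawler header) so it is never served a snapshot recursively. A minimal Express-style sketch, assuming both helpers are re-exported from the package root; serveSnapshot is a hypothetical stand-in for whatever serves the pre-rendered HTML:

  import express from 'express';
  import { isBotUserAgent, isSelfCrawler } from '@arcblock/crawler';

  const app = express();

  // hypothetical stand-in: look up or render a cached snapshot for this request
  declare function serveSnapshot(req: express.Request, res: express.Response): void;

  app.use((req, res, next) => {
    // hand bots a pre-rendered snapshot, but never recurse on our own headless crawler
    if (isBotUserAgent(req) && !isSelfCrawler(req)) {
      return serveSnapshot(req, res);
    }
    return next();
  });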
package/third.d.ts DELETED
@@ -1 +0,0 @@
- declare module '@blocklet/logger';
package/tsconfig.json DELETED
@@ -1,9 +0,0 @@
- {
-   "extends": "../../tsconfig",
-   "compilerOptions": {
-     "outDir": "dist",
-     "noEmit": false,
-     "noEmitOnError": true
-   },
-   "include": ["src", "third.d.ts"]
- }