@arcblock/crawler 1.3.1 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/cjs/crawler.d.ts CHANGED
@@ -1,5 +1,11 @@
  import { Page } from '@blocklet/puppeteer';
  import { JobState, SnapshotModel } from './store';
+ declare let crawlQueue: any;
+ declare let syncQueue: any;
+ declare let codeQueue: any;
+ declare let cronQueue: any;
+ export { crawlQueue, syncQueue, codeQueue, cronQueue };
+ export declare function initQueue(): void;
  type PageHandler = {
      handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
      handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
@@ -25,4 +31,3 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
- export {};
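The typings now expose four mutable queue bindings plus an explicit `initQueue()`; the queues stay `undefined` until that runs (in 1.3.3, `initCrawler` calls it, per the `index.js` changes below). A minimal consumer-side sketch, assuming a `push()`-style queue API as used in `site.js`:

```ts
// Hypothetical consumer of the new declarations; the job shape is illustrative.
import { cronQueue, initQueue } from '@arcblock/crawler';

function ensureQueues(): void {
  // The queue exports are `let` bindings and stay undefined until initQueue()
  // runs; normally initCrawler() does this for you.
  if (!cronQueue) {
    initQueue();
  }
}

ensureQueues();
cronQueue.push({ id: 'job-1', url: 'https://example.com' });
```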
package/lib/cjs/crawler.js CHANGED
@@ -12,12 +12,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
      return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.getPageContent = void 0;
+ exports.getPageContent = exports.cronQueue = exports.codeQueue = exports.syncQueue = exports.crawlQueue = void 0;
+ exports.initQueue = initQueue;
  exports.createCrawlQueue = createCrawlQueue;
  exports.getDataDir = getDataDir;
  exports.enqueue = enqueue;
  exports.crawlUrl = crawlUrl;
  exports.crawlCode = crawlCode;
+ /* eslint-disable import/no-mutable-exports */
  const queue_1 = __importDefault(require("@abtnode/queue"));
  const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
  const crypto_1 = require("crypto");
@@ -30,12 +32,18 @@ const snapshot_1 = require("./services/snapshot");
  const store_1 = require("./store");
  const utils_1 = require("./utils");
  const { BaseState } = require('@abtnode/models');
- // eslint-disable-next-line import/no-mutable-exports
- const crawlQueue = createCrawlQueue('urlCrawler');
- const syncQueue = createCrawlQueue('syncCrawler');
- const codeQueue = createCrawlQueue('codeCrawler', {
-     handleScreenshot: carbon_1.createCarbonImage,
- });
+ let crawlQueue;
+ let syncQueue;
+ let codeQueue;
+ let cronQueue;
+ function initQueue() {
+     exports.crawlQueue = crawlQueue = createCrawlQueue('urlCrawler');
+     exports.syncQueue = syncQueue = createCrawlQueue('syncCrawler');
+     exports.codeQueue = codeQueue = createCrawlQueue('codeCrawler', {
+         handleScreenshot: carbon_1.createCarbonImage,
+     });
+     exports.cronQueue = cronQueue = createCrawlQueue('cronJobs');
+ }
  function createCrawlQueue(queue, handler) {
      const db = new BaseState(store_1.Job);
      return (0, queue_1.default)({
@@ -234,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
      // get html
      try {
          const data = yield page.evaluate(() => {
-             var _a;
+             var _a, _b;
              // add meta tag to record crawler
              const meta = document.createElement('meta');
              meta.name = 'arcblock-crawler';
@@ -243,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
              // get title and meta description
              const title = document.title || '';
              const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+             // remove all <noscript> tags from the document
+             (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                 if (el === null || el === void 0 ? void 0 : el.remove) {
+                     el.remove();
+                 }
+             });
              return {
                  html: document.documentElement.outerHTML,
                  title,
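The queue construction moves out of module scope into `initQueue()`. In the CJS output the double assignment (`exports.crawlQueue = crawlQueue = ...`) is what keeps `require()` consumers in sync: CommonJS exports are plain object properties, not live bindings, so assigning only the local `let` would leave `exports.crawlQueue` stuck at `undefined`. A stripped-down sketch of the pattern (illustrative module, not package code):

```ts
// queues.ts - a minimal sketch of the mutable-export pattern, ESM flavor.
/* eslint-disable import/no-mutable-exports */
export let queue: { push(job: unknown): void } | undefined;

export function init(): void {
  // ESM named exports are live bindings, so importers observe this reassignment.
  // The compiled CJS build cannot rely on that, hence its dual assignment:
  //   exports.queue = queue = createQueue(...)
  queue = { push: (job) => console.log('enqueued', job) };
}
```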
package/lib/cjs/index.d.ts CHANGED
@@ -1,7 +1,5 @@
  import { Config } from './config';
  export * from './crawler';
- export * from './site';
  export * from './services/snapshot';
  export * as utils from './utils';
- export { migrate } from './store/migrate';
  export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/cjs/index.js CHANGED
@@ -48,25 +48,26 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
      return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.migrate = exports.utils = void 0;
+ exports.utils = void 0;
  exports.initCrawler = initCrawler;
  /* eslint-disable @typescript-eslint/indent */
  const merge_1 = __importDefault(require("lodash/merge"));
  const config_1 = require("./config");
+ const crawler_1 = require("./crawler");
  const cron_1 = require("./cron");
  const puppeteer_1 = require("./puppeteer");
+ const migrate_1 = require("./store/migrate");
  __exportStar(require("./crawler"), exports);
- __exportStar(require("./site"), exports);
  __exportStar(require("./services/snapshot"), exports);
  exports.utils = __importStar(require("./utils"));
- var migrate_1 = require("./store/migrate");
- Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
  function initCrawler(params) {
      return __awaiter(this, void 0, void 0, function* () {
          var _a;
          (0, merge_1.default)(config_1.config, params);
          config_1.logger.info('Init crawler', { params, config: config_1.config });
          try {
+             yield (0, migrate_1.migrate)();
+             yield (0, crawler_1.initQueue)();
              yield (0, puppeteer_1.ensureBrowser)();
              if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                  yield (0, cron_1.initCron)();
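With the migration and queue setup folded into `initCrawler`, startup is strictly ordered: `migrate()` prepares the store before `initQueue()` builds queues on top of it, and both precede the browser check. A hedged usage sketch (any `JobState` fields beyond `url` are assumptions):

```ts
import { crawlUrl, initCrawler } from '@arcblock/crawler';

async function main(): Promise<void> {
  // Runs migrate() -> initQueue() -> ensureBrowser() -> optional initCron().
  await initCrawler({ concurrency: 2 });

  // The queues exist only after initCrawler resolves, so enqueueing is now safe.
  const jobId = await crawlUrl({ url: 'https://example.com' }, (snapshot) => {
    console.log('snapshot ready', snapshot);
  });
  console.log('queued job', jobId);
}

main().catch(console.error);
```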
package/lib/cjs/puppeteer.js CHANGED
@@ -1,37 +1,4 @@
  "use strict";
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-     if (k2 === undefined) k2 = k;
-     var desc = Object.getOwnPropertyDescriptor(m, k);
-     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-         desc = { enumerable: true, get: function() { return m[k]; } };
-     }
-     Object.defineProperty(o, k2, desc);
- }) : (function(o, m, k, k2) {
-     if (k2 === undefined) k2 = k;
-     o[k2] = m[k];
- }));
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-     Object.defineProperty(o, "default", { enumerable: true, value: v });
- }) : function(o, v) {
-     o["default"] = v;
- });
- var __importStar = (this && this.__importStar) || (function () {
-     var ownKeys = function(o) {
-         ownKeys = Object.getOwnPropertyNames || function (o) {
-             var ar = [];
-             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-             return ar;
-         };
-         return ownKeys(o);
-     };
-     return function (mod) {
-         if (mod && mod.__esModule) return mod;
-         var result = {};
-         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-         __setModuleDefault(result, mod);
-         return result;
-     };
- })();
  var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
      function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
      return new (P || (P = Promise))(function (resolve, reject) {
@@ -92,11 +59,12 @@ function ensureBrowser() {
      config_1.logger.debug('executablePath', executablePath);
      if (!executablePath || !fs_extra_1.default.existsSync(executablePath)) {
          config_1.logger.info('start download browser', puppeteerConfig);
+         // @ts-ignore
          const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
              try {
                  // @ts-ignore
                  // eslint-disable-next-line import/extensions
-                 return yield Promise.resolve().then(() => __importStar(require('@blocklet/puppeteer/internal/node/install.js')));
+                 return yield import('@blocklet/puppeteer/internal/node/install.js');
              }
              catch (err) {
                  config_1.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
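Previously tsc had downleveled the dynamic import for CommonJS into `Promise.resolve().then(() => __importStar(require(...)))`, and `require()` cannot load an ESM-only file. Keeping a genuine `import()` in the CJS output lets CommonJS code pull in the ESM installer. The shape of the fix as a standalone sketch, assuming a TS `module` setting (such as `node16`) that preserves `import()` expressions:

```ts
// Lazily load Puppeteer's installer; mirrors the skip-on-failure branch above.
async function loadDownloadBrowser(): Promise<(() => Promise<unknown>) | undefined> {
  try {
    // Native import() works from CJS and can resolve ESM targets; the transpiled
    // require() form only worked while the target file was itself CommonJS.
    const mod: any = await import('@blocklet/puppeteer/internal/node/install.js');
    return mod.downloadBrowser;
  } catch {
    // Puppeteer build not available; the caller logs a warning and skips install.
    return undefined;
  }
}
```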
package/lib/cjs/site.js CHANGED
@@ -15,13 +15,11 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.crawlSite = void 0;
  const uniq_1 = __importDefault(require("lodash/uniq"));
  const node_crypto_1 = require("node:crypto");
- const p_map_1 = __importDefault(require("p-map"));
  const config_1 = require("./config");
  const crawler_1 = require("./crawler");
  const store_1 = require("./store");
  const utils_1 = require("./utils");
  const crawlBlockletRunningMap = new Map();
- const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
  function parseSitemapUrl(sitemapItem) {
      var _a;
      const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -30,6 +28,7 @@ function parseSitemapUrl(sitemapItem) {
  }
  const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
      var _b;
+     const { default: pMap } = yield import('p-map');
      config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
      const key = `${url}-${pathname}`;
      if (crawlBlockletRunningMap.has(key)) {
@@ -48,7 +47,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
      let crawlCount = 0;
      crawlBlockletRunningMap.set(key, true);
      try {
-         const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+         const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
              processCount++;
              const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
              if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
@@ -70,7 +69,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
              });
              crawlCount++;
              const jobId = (0, node_crypto_1.randomUUID)();
-             crawlQueue.push({
+             crawler_1.cronQueue.push({
                  id: jobId,
                  url,
                  lastModified: sitemapItem.lastmod,
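`p-map` has shipped as ESM-only since v5, so the CJS build's top-level `require('p-map')` would throw `ERR_REQUIRE_ESM` on modern versions; both builds now defer it to a dynamic `import()` inside `crawlSite`. The workaround in isolation, as a sketch:

```ts
// Map items with bounded concurrency while deferring the ESM-only dependency.
async function mapLimit<T, R>(items: readonly T[], mapper: (item: T) => Promise<R>): Promise<R[]> {
  // import() is valid inside CommonJS and resolves p-map's ESM entry point.
  const { default: pMap } = await import('p-map');
  return pMap(items, mapper, { concurrency: 5 });
}
```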
package/lib/cjs/utils.d.ts CHANGED
@@ -1,6 +1,7 @@
  import { Page } from '@blocklet/puppeteer';
+ import Axios from 'axios';
  import { Request } from 'express';
- export declare const axios: import("axios").AxiosInstance;
+ export declare const axios: Axios.AxiosInstance;
  export declare const CRAWLER_FLAG = "x-arcblock-crawler";
  export declare const sleep: (ms: number) => Promise<unknown>;
  /**
package/lib/cjs/utils.js CHANGED
@@ -70,24 +70,36 @@ const botUserAgents = [
      /Googlebot/i,
      /GPTBot/i,
      /Applebot/i,
-     // AI bots
-     /Anthropic-ai/i,
-     /Claude-Web/i,
-     /anthropic-ai-scraper/i,
-     /Google-Extended/i,
-     /GoogleOther/i,
-     /CCBot\/\d/i,
-     /Bytespider/i,
-     /BingBot/i,
-     /Baiduspider/i,
-     /Sogou/i,
-     /Perplexity/i,
-     /Cohere-ai/i,
-     /xlts-bot/i,
-     /THAAS/i,
-     /YisouSpider/i,
-     /AlibabaGroup/i,
-     /adaptive-edge-crawler/i,
+     // AI bots - condensed patterns
+     /-AI\b/i, // matches tokens ending in "-AI"
+     /-Bot\b/i, // matches tokens ending in "-Bot"
+     /-Agent\b/i, // matches tokens ending in "-Agent"
+     /-User\b/i, // matches tokens ending in "-User"
+     /\bAI\b/i, // matches standalone "AI" as a word
+     /\bGPT/i, // GPT variants
+     /\bClaude/i, // Claude variants
+     /\bBard\b/i, // Google Bard
+     /\bGemini\b/i, // Google Gemini
+     /\bLlama\b/i, // Meta Llama
+     /\bChatGPT/i, // ChatGPT variants
+     /\bOpenAI/i, // OpenAI
+     /\bAnthropic/i, // Anthropic
+     /\bPerplexity/i, // Perplexity
+     /\bCohere/i, // Cohere
+     /\bHuggingFace/i, // Hugging Face
+     /\bStability/i, // Stability AI
+     /\bMidjourney/i, // Midjourney
+     /\bDALL-E/i, // DALL-E
+     /\bMeta-External/i, // Meta external agents
+     /\bGoogle-/i, // Google agents
+     /\bLLM/i, // LLM
+     /\bBytespider/i, // ByteDance spider
+     /\bBaiduspider/i, // Baidu spider
+     /\bYandexBot/i, // Yandex bot
+     /\bDuckDuckBot/i, // DuckDuckGo bot
+     /\bLinkedInBot/i, // LinkedIn bot
+     /\bTwitterbot/i, // Twitter bot
+     /\bCCBot/i, // Common Crawl bot
  ];
  /**
   * A default set of file extensions for static assets that do not need to be proxied.
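The rewrite trades an explicit inventory of crawlers for broad word-boundary patterns. Coverage of new AI agents improves, but the net is much wider: `/\bAI\b/i`, for example, flags any user agent containing "AI" as a standalone word. A quick harness for sanity-checking the list (sample user agents are illustrative):

```ts
// A subset of the condensed patterns above, tested against sample user agents.
const patterns: RegExp[] = [/-AI\b/i, /\bAI\b/i, /\bGPT/i, /\bClaude/i, /\bCCBot/i];

const samples: string[] = [
  'Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)',
  'CCBot/2.0 (https://commoncrawl.org/faq/)',
  'SomeCorp-AI/1.0 (+https://example.com/bot)',
  'Mozilla/5.0 (Windows NT 10.0) Chrome/120.0 Safari/537.36', // regular browser
];

for (const ua of samples) {
  console.log(patterns.some((re) => re.test(ua)), ua);
}
```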
package/lib/esm/crawler.d.ts CHANGED
@@ -1,5 +1,11 @@
  import { Page } from '@blocklet/puppeteer';
  import { JobState, SnapshotModel } from './store';
+ declare let crawlQueue: any;
+ declare let syncQueue: any;
+ declare let codeQueue: any;
+ declare let cronQueue: any;
+ export { crawlQueue, syncQueue, codeQueue, cronQueue };
+ export declare function initQueue(): void;
  type PageHandler = {
      handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
      handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
@@ -25,4 +31,3 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
- export {};
package/lib/esm/crawler.js CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
          step((generator = generator.apply(thisArg, _arguments || [])).next());
      });
  };
+ /* eslint-disable import/no-mutable-exports */
  import createQueue from '@abtnode/queue';
  import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
  import { randomUUID } from 'crypto';
@@ -19,12 +20,19 @@ import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './service
  import { Job, Snapshot, sequelize } from './store';
  import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
  const { BaseState } = require('@abtnode/models');
- // eslint-disable-next-line import/no-mutable-exports
- const crawlQueue = createCrawlQueue('urlCrawler');
- const syncQueue = createCrawlQueue('syncCrawler');
- const codeQueue = createCrawlQueue('codeCrawler', {
-     handleScreenshot: createCarbonImage,
- });
+ let crawlQueue;
+ let syncQueue;
+ let codeQueue;
+ let cronQueue;
+ export { crawlQueue, syncQueue, codeQueue, cronQueue };
+ export function initQueue() {
+     crawlQueue = createCrawlQueue('urlCrawler');
+     syncQueue = createCrawlQueue('syncCrawler');
+     codeQueue = createCrawlQueue('codeCrawler', {
+         handleScreenshot: createCarbonImage,
+     });
+     cronQueue = createCrawlQueue('cronJobs');
+ }
  export function createCrawlQueue(queue, handler) {
      const db = new BaseState(Job);
      return createQueue({
@@ -223,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
      // get html
      try {
          const data = yield page.evaluate(() => {
-             var _a;
+             var _a, _b;
              // add meta tag to record crawler
              const meta = document.createElement('meta');
              meta.name = 'arcblock-crawler';
@@ -232,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
              // get title and meta description
              const title = document.title || '';
              const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+             // remove all <noscript> tags from the document
+             (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                 if (el === null || el === void 0 ? void 0 : el.remove) {
+                     el.remove();
+                 }
+             });
              return {
                  html: document.documentElement.outerHTML,
                  title,
package/lib/esm/index.d.ts CHANGED
@@ -1,7 +1,5 @@
  import { Config } from './config';
  export * from './crawler';
- export * from './site';
  export * from './services/snapshot';
  export * as utils from './utils';
- export { migrate } from './store/migrate';
  export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js CHANGED
@@ -10,19 +10,21 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  /* eslint-disable @typescript-eslint/indent */
  import merge from 'lodash/merge';
  import { config, logger } from './config';
+ import { initQueue } from './crawler';
  import { initCron } from './cron';
  import { ensureBrowser } from './puppeteer';
+ import { migrate } from './store/migrate';
  export * from './crawler';
- export * from './site';
  export * from './services/snapshot';
  export * as utils from './utils';
- export { migrate } from './store/migrate';
  export function initCrawler(params) {
      return __awaiter(this, void 0, void 0, function* () {
          var _a;
          merge(config, params);
          logger.info('Init crawler', { params, config });
          try {
+             yield migrate();
+             yield initQueue();
              yield ensureBrowser();
              if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                  yield initCron();
package/lib/esm/puppeteer.js CHANGED
@@ -48,6 +48,7 @@ export function ensureBrowser() {
      logger.debug('executablePath', executablePath);
      if (!executablePath || !fs.existsSync(executablePath)) {
          logger.info('start download browser', puppeteerConfig);
+         // @ts-ignore
          const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
              try {
                  // @ts-ignore
package/lib/esm/site.js CHANGED
@@ -9,13 +9,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  };
  import uniq from 'lodash/uniq';
  import { randomUUID } from 'node:crypto';
- import pMap from 'p-map';
  import { config, logger } from './config';
- import { createCrawlQueue } from './crawler';
+ import { cronQueue } from './crawler';
  import { Snapshot } from './store';
  import { formatUrl, getSitemapList } from './utils';
  const crawlBlockletRunningMap = new Map();
- const crawlQueue = createCrawlQueue('cronJobs');
  function parseSitemapUrl(sitemapItem) {
      var _a;
      const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -24,6 +22,7 @@ function parseSitemapUrl(sitemapItem) {
  }
  export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
      var _b;
+     const { default: pMap } = yield import('p-map');
      logger.info(`Start crawl from sitemap ${url}`, { pathname });
      const key = `${url}-${pathname}`;
      if (crawlBlockletRunningMap.has(key)) {
@@ -64,7 +63,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
              });
              crawlCount++;
              const jobId = randomUUID();
-             crawlQueue.push({
+             cronQueue.push({
                  id: jobId,
                  url,
                  lastModified: sitemapItem.lastmod,
package/lib/esm/utils.js CHANGED
@@ -59,24 +59,36 @@ const botUserAgents = [
      /Googlebot/i,
      /GPTBot/i,
      /Applebot/i,
-     // AI bots
-     /Anthropic-ai/i,
-     /Claude-Web/i,
-     /anthropic-ai-scraper/i,
-     /Google-Extended/i,
-     /GoogleOther/i,
-     /CCBot\/\d/i,
-     /Bytespider/i,
-     /BingBot/i,
-     /Baiduspider/i,
-     /Sogou/i,
-     /Perplexity/i,
-     /Cohere-ai/i,
-     /xlts-bot/i,
-     /THAAS/i,
-     /YisouSpider/i,
-     /AlibabaGroup/i,
-     /adaptive-edge-crawler/i,
+     // AI bots - condensed patterns
+     /-AI\b/i, // matches tokens ending in "-AI"
+     /-Bot\b/i, // matches tokens ending in "-Bot"
+     /-Agent\b/i, // matches tokens ending in "-Agent"
+     /-User\b/i, // matches tokens ending in "-User"
+     /\bAI\b/i, // matches standalone "AI" as a word
+     /\bGPT/i, // GPT variants
+     /\bClaude/i, // Claude variants
+     /\bBard\b/i, // Google Bard
+     /\bGemini\b/i, // Google Gemini
+     /\bLlama\b/i, // Meta Llama
+     /\bChatGPT/i, // ChatGPT variants
+     /\bOpenAI/i, // OpenAI
+     /\bAnthropic/i, // Anthropic
+     /\bPerplexity/i, // Perplexity
+     /\bCohere/i, // Cohere
+     /\bHuggingFace/i, // Hugging Face
+     /\bStability/i, // Stability AI
+     /\bMidjourney/i, // Midjourney
+     /\bDALL-E/i, // DALL-E
+     /\bMeta-External/i, // Meta external agents
+     /\bGoogle-/i, // Google agents
+     /\bLLM/i, // LLM
+     /\bBytespider/i, // ByteDance spider
+     /\bBaiduspider/i, // Baidu spider
+     /\bYandexBot/i, // Yandex bot
+     /\bDuckDuckBot/i, // DuckDuckGo bot
+     /\bLinkedInBot/i, // LinkedIn bot
+     /\bTwitterbot/i, // Twitter bot
+     /\bCCBot/i, // Common Crawl bot
  ];
  /**
   * A default set of file extensions for static assets that do not need to be proxied.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
      "name": "@arcblock/crawler",
-     "version": "1.3.1",
+     "version": "1.3.3",
      "main": "lib/cjs/index.js",
      "module": "lib/esm/index.js",
      "types": "lib/cjs/index.d.ts",
@@ -45,12 +45,12 @@
          ]
      },
      "dependencies": {
-         "@abtnode/cron": "^1.16.44",
-         "@abtnode/models": "^1.16.44",
-         "@abtnode/queue": "^1.16.44",
-         "@blocklet/logger": "^1.16.44",
+         "@abtnode/cron": "^1.16.46",
+         "@abtnode/models": "^1.16.46",
+         "@abtnode/queue": "^1.16.46",
+         "@blocklet/logger": "^1.16.46",
          "@blocklet/puppeteer": "^22.11.3",
-         "@blocklet/sdk": "^1.16.44",
+         "@blocklet/sdk": "^1.16.46",
          "@sequelize/core": "7.0.0-alpha.46",
          "@sequelize/sqlite3": "7.0.0-alpha.46",
          "axios": "^1.7.9",