@arcblock/crawler 1.3.1 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +6 -1
- package/lib/cjs/crawler.js +22 -8
- package/lib/cjs/index.d.ts +0 -2
- package/lib/cjs/index.js +5 -4
- package/lib/cjs/puppeteer.js +2 -34
- package/lib/cjs/site.js +3 -4
- package/lib/cjs/utils.d.ts +2 -1
- package/lib/cjs/utils.js +30 -18
- package/lib/esm/crawler.d.ts +6 -1
- package/lib/esm/crawler.js +21 -7
- package/lib/esm/index.d.ts +0 -2
- package/lib/esm/index.js +4 -2
- package/lib/esm/puppeteer.js +1 -0
- package/lib/esm/site.js +3 -4
- package/lib/esm/utils.js +30 -18
- package/package.json +6 -6
package/lib/cjs/crawler.d.ts
CHANGED
@@ -1,5 +1,11 @@
 import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
+declare let crawlQueue: any;
+declare let syncQueue: any;
+declare let codeQueue: any;
+declare let cronQueue: any;
+export { crawlQueue, syncQueue, codeQueue, cronQueue };
+export declare function initQueue(): void;
 type PageHandler = {
     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
     handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
@@ -25,4 +31,3 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
 export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
-export {};
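The four crawl queues are now declared as mutable module-level bindings and populated by the new initQueue() export, instead of being created as a side effect of importing the module. A minimal consumer sketch (hypothetical usage, assuming JobState accepts a url field -- the site.js diff below pushes jobs with id, url, and lastModified; initCrawler() also calls initQueue() for you, as the index.js diff shows):

import { initQueue, crawlUrl } from '@arcblock/crawler';

initQueue(); // must run before the queues (and crawlUrl, which enqueues onto them) are usable
const jobId = await crawlUrl({ url: 'https://example.com' }, (snapshot) => {
  console.log('crawl finished', snapshot);
});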
package/lib/cjs/crawler.js
CHANGED
@@ -12,12 +12,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.getPageContent = void 0;
+exports.getPageContent = exports.cronQueue = exports.codeQueue = exports.syncQueue = exports.crawlQueue = void 0;
+exports.initQueue = initQueue;
 exports.createCrawlQueue = createCrawlQueue;
 exports.getDataDir = getDataDir;
 exports.enqueue = enqueue;
 exports.crawlUrl = crawlUrl;
 exports.crawlCode = crawlCode;
+/* eslint-disable import/no-mutable-exports */
 const queue_1 = __importDefault(require("@abtnode/queue"));
 const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
 const crypto_1 = require("crypto");
@@ -30,12 +32,18 @@ const snapshot_1 = require("./services/snapshot");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
-
-
-
-
-
-
+let crawlQueue;
+let syncQueue;
+let codeQueue;
+let cronQueue;
+function initQueue() {
+    exports.crawlQueue = crawlQueue = createCrawlQueue('urlCrawler');
+    exports.syncQueue = syncQueue = createCrawlQueue('syncCrawler');
+    exports.codeQueue = codeQueue = createCrawlQueue('codeCrawler', {
+        handleScreenshot: carbon_1.createCarbonImage,
+    });
+    exports.cronQueue = cronQueue = createCrawlQueue('cronJobs');
+}
 function createCrawlQueue(queue, handler) {
     const db = new BaseState(store_1.Job);
     return (0, queue_1.default)({
@@ -234,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
         // get html
         try {
             const data = yield page.evaluate(() => {
-                var _a;
+                var _a, _b;
                 // add meta tag to record crawler
                 const meta = document.createElement('meta');
                 meta.name = 'arcblock-crawler';
@@ -243,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
                 // get title and meta description
                 const title = document.title || '';
                 const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                // remove document all <noscript> tags
+                (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                    if (el === null || el === void 0 ? void 0 : el.remove) {
+                        el.remove();
+                    }
+                });
                 return {
                     html: document.documentElement.outerHTML,
                     title,
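Functionally, the only change inside getPageContent is the new <noscript> cleanup that runs in the page context before the HTML is captured. The down-leveled output above corresponds to roughly this source (a reconstruction mirroring the compiled guard, not the shipped TypeScript):

// Drop <noscript> fallback markup so it does not end up in the crawled snapshot.
document.querySelectorAll('noscript')?.forEach((el) => {
  if (el?.remove) {
    el.remove();
  }
});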
package/lib/cjs/index.d.ts
CHANGED
@@ -1,7 +1,5 @@
 import { Config } from './config';
 export * from './crawler';
-export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-export { migrate } from './store/migrate';
 export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/cjs/index.js
CHANGED
@@ -48,25 +48,26 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.
+exports.utils = void 0;
 exports.initCrawler = initCrawler;
 /* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
+const crawler_1 = require("./crawler");
 const cron_1 = require("./cron");
 const puppeteer_1 = require("./puppeteer");
+const migrate_1 = require("./store/migrate");
 __exportStar(require("./crawler"), exports);
-__exportStar(require("./site"), exports);
 __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
-var migrate_1 = require("./store/migrate");
-Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
         var _a;
         (0, merge_1.default)(config_1.config, params);
         config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
+            yield (0, migrate_1.migrate)();
+            yield (0, crawler_1.initQueue)();
             yield (0, puppeteer_1.ensureBrowser)();
             if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield (0, cron_1.initCron)();
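initCrawler() now runs the store migration and queue creation itself, in a fixed order, before launching the browser; migrate is also no longer re-exported from the package root. De-compiled back to source, the sequence is roughly (a reconstruction based on the hunk above; the catch body is not visible in this diff):

import merge from 'lodash/merge';
import { Config, config, logger } from './config';
import { initQueue } from './crawler';
import { initCron } from './cron';
import { ensureBrowser } from './puppeteer';
import { migrate } from './store/migrate';

export async function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>) {
  merge(config, params);
  logger.info('Init crawler', { params, config });
  try {
    await migrate();       // 1. apply store migrations (previously only re-exported)
    await initQueue();     // 2. create the four crawl queues (new in this release)
    await ensureBrowser(); // 3. make sure a Chromium executable is available
    if (config.siteCron?.enabled) {
      await initCron();    // 4. optionally start the sitemap cron
    }
  } catch (err) {
    // error handling elided -- not part of this hunk
  }
}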
package/lib/cjs/puppeteer.js
CHANGED
@@ -1,37 +1,4 @@
 "use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-        desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
@@ -92,11 +59,12 @@ function ensureBrowser() {
         config_1.logger.debug('executablePath', executablePath);
         if (!executablePath || !fs_extra_1.default.existsSync(executablePath)) {
             config_1.logger.info('start download browser', puppeteerConfig);
+            // @ts-ignore
             const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
                 try {
                     // @ts-ignore
                     // eslint-disable-next-line import/extensions
-                    return yield
+                    return yield import('@blocklet/puppeteer/internal/node/install.js');
                 }
                 catch (err) {
                     config_1.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
package/lib/cjs/site.js
CHANGED
@@ -15,13 +15,11 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.crawlSite = void 0;
 const uniq_1 = __importDefault(require("lodash/uniq"));
 const node_crypto_1 = require("node:crypto");
-const p_map_1 = __importDefault(require("p-map"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const crawlBlockletRunningMap = new Map();
-const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
 function parseSitemapUrl(sitemapItem) {
     var _a;
     const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -30,6 +28,7 @@ function parseSitemapUrl(sitemapItem) {
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     var _b;
+    const { default: pMap } = yield import('p-map');
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -48,7 +47,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
     let crawlCount = 0;
     crawlBlockletRunningMap.set(key, true);
     try {
-        const jobIds = yield (
+        const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
             processCount++;
             const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
             if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
@@ -70,7 +69,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
             });
             crawlCount++;
             const jobId = (0, node_crypto_1.randomUUID)();
-
+            crawler_1.cronQueue.push({
                 id: jobId,
                 url,
                 lastModified: sitemapItem.lastmod,
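Two things changed here: the file now reuses the shared cronQueue from crawler.js instead of creating its own queue, and the top-level require of p-map was replaced with an in-function await import('p-map'). The latter is the standard workaround for consuming an ESM-only dependency (p-map v5+) from CommonJS, where require() would throw ERR_REQUIRE_ESM. The general pattern (illustrative, not the package's code):

// CJS-safe use of an ESM-only module: defer loading into an async function.
async function mapWithLimit<T, R>(items: readonly T[], mapper: (item: T) => Promise<R>): Promise<R[]> {
  const { default: pMap } = await import('p-map'); // Node caches the module after the first load
  return pMap(items, mapper, { concurrency: 5 });  // concurrency value is illustrative
}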
package/lib/cjs/utils.d.ts
CHANGED
@@ -1,6 +1,7 @@
 import { Page } from '@blocklet/puppeteer';
+import Axios from 'axios';
 import { Request } from 'express';
-export declare const axios:
+export declare const axios: Axios.AxiosInstance;
 export declare const CRAWLER_FLAG = "x-arcblock-crawler";
 export declare const sleep: (ms: number) => Promise<unknown>;
 /**
package/lib/cjs/utils.js
CHANGED
|
@@ -70,24 +70,36 @@ const botUserAgents = [
|
|
|
70
70
|
/Googlebot/i,
|
|
71
71
|
/GPTBot/i,
|
|
72
72
|
/Applebot/i,
|
|
73
|
-
// AI bots
|
|
74
|
-
/
|
|
75
|
-
/
|
|
76
|
-
/
|
|
77
|
-
/
|
|
78
|
-
/
|
|
79
|
-
/
|
|
80
|
-
/
|
|
81
|
-
/
|
|
82
|
-
/
|
|
83
|
-
/
|
|
84
|
-
/
|
|
85
|
-
/
|
|
86
|
-
/
|
|
87
|
-
/
|
|
88
|
-
/
|
|
89
|
-
/
|
|
90
|
-
/
|
|
73
|
+
// AI bots - condensed patterns
|
|
74
|
+
/-AI\b/i, // Matches any string ending with "-AI"
|
|
75
|
+
/-Bot\b/i, // Matches any string ending with "-Bot"
|
|
76
|
+
/-Agent\b/i, // Matches any string ending with "-Agent"
|
|
77
|
+
/-User\b/i, // Matches any string ending with "-User"
|
|
78
|
+
/\bAI\b/i, // Matches standalone "AI" word
|
|
79
|
+
/\bGPT/i, // GPT variants
|
|
80
|
+
/\bClaude/i, // Claude variants
|
|
81
|
+
/\bBard\b/i, // Google Bard
|
|
82
|
+
/\bGemini\b/i, // Google Gemini
|
|
83
|
+
/\bLlama\b/i, // Meta Llama
|
|
84
|
+
/\bChatGPT/i, // ChatGPT variants
|
|
85
|
+
/\bOpenAI/i, // OpenAI
|
|
86
|
+
/\bAnthropic/i, // Anthropic
|
|
87
|
+
/\bPerplexity/i, // Perplexity
|
|
88
|
+
/\bCohere/i, // Cohere
|
|
89
|
+
/\bHuggingFace/i, // Hugging Face
|
|
90
|
+
/\bStability/i, // Stability AI
|
|
91
|
+
/\bMidjourney/i, // Midjourney
|
|
92
|
+
/\bDALL-E/i, // DALL-E
|
|
93
|
+
/\bMeta-External/i, // Meta external agents
|
|
94
|
+
/\bGoogle-/i, // Google agents
|
|
95
|
+
/\bLLM/i, // LLM
|
|
96
|
+
/\bBytespider/i, // ByteDance spider
|
|
97
|
+
/\bBaiduspider/i, // Baidu spider
|
|
98
|
+
/\bYandexBot/i, // Yandex bot
|
|
99
|
+
/\bDuckDuckBot/i, // DuckDuckGo bot
|
|
100
|
+
/\bLinkedInBot/i, // LinkedIn bot
|
|
101
|
+
/\bTwitterbot/i, // Twitter bot
|
|
102
|
+
/\bCCBot/i, // Common Crawl bot
|
|
91
103
|
];
|
|
92
104
|
/**
|
|
93
105
|
* A default set of file extensions for static assets that do not need to be proxied.
|
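The seventeen truncated per-vendor patterns are replaced by word-boundary regexes covering AI crawlers plus several search and social bots. This diff doesn't show how botUserAgents is consulted, but a matcher over such a list typically looks like this (hypothetical helper, not from the package):

const isBotUserAgent = (ua: string): boolean => botUserAgents.some((re) => re.test(ua));

isBotUserAgent('Mozilla/5.0 (compatible; GPTBot/1.0)');       // true, via /GPTBot/i
isBotUserAgent('Mozilla/5.0 (Windows NT 10.0) Chrome/120.0'); // false

Note that the broadest patterns (/\bAI\b/i, /-Bot\b/i) trade precision for coverage and can match unusual but legitimate user agents.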
package/lib/esm/crawler.d.ts
CHANGED
@@ -1,5 +1,11 @@
 import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
+declare let crawlQueue: any;
+declare let syncQueue: any;
+declare let codeQueue: any;
+declare let cronQueue: any;
+export { crawlQueue, syncQueue, codeQueue, cronQueue };
+export declare function initQueue(): void;
 type PageHandler = {
     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
     handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
@@ -25,4 +31,3 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
 export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
-export {};
package/lib/esm/crawler.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+/* eslint-disable import/no-mutable-exports */
 import createQueue from '@abtnode/queue';
 import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
 import { randomUUID } from 'crypto';
@@ -19,12 +20,19 @@ import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './service
 import { Job, Snapshot, sequelize } from './store';
 import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
 const { BaseState } = require('@abtnode/models');
-
-
-
-
-
-
+let crawlQueue;
+let syncQueue;
+let codeQueue;
+let cronQueue;
+export { crawlQueue, syncQueue, codeQueue, cronQueue };
+export function initQueue() {
+    crawlQueue = createCrawlQueue('urlCrawler');
+    syncQueue = createCrawlQueue('syncCrawler');
+    codeQueue = createCrawlQueue('codeCrawler', {
+        handleScreenshot: createCarbonImage,
+    });
+    cronQueue = createCrawlQueue('cronJobs');
+}
 export function createCrawlQueue(queue, handler) {
     const db = new BaseState(Job);
     return createQueue({
@@ -223,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
         // get html
         try {
             const data = yield page.evaluate(() => {
-                var _a;
+                var _a, _b;
                 // add meta tag to record crawler
                 const meta = document.createElement('meta');
                 meta.name = 'arcblock-crawler';
@@ -232,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
                 // get title and meta description
                 const title = document.title || '';
                 const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                // remove document all <noscript> tags
+                (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                    if (el === null || el === void 0 ? void 0 : el.remove) {
+                        el.remove();
+                    }
+                });
                 return {
                     html: document.documentElement.outerHTML,
                     title,
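Note the difference from the CJS build: ESM exports are live bindings, so export { crawlQueue, ... } followed by a plain reassignment inside initQueue() is enough for importers to observe the new value, whereas the compiled CJS version must also write exports.crawlQueue explicitly. A generic illustration of the semantics (not package code):

// counter.ts
export let current: number | undefined;   // starts undefined, like crawlQueue
export function init() { current = 42; }  // reassignment is visible to importers

// app.ts
import { current, init } from './counter.js';
init();
console.log(current); // 42 -- the import is a live binding, not a copy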
package/lib/esm/index.d.ts
CHANGED
@@ -1,7 +1,5 @@
 import { Config } from './config';
 export * from './crawler';
-export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-export { migrate } from './store/migrate';
 export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js
CHANGED
@@ -10,19 +10,21 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 /* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
+import { initQueue } from './crawler';
 import { initCron } from './cron';
 import { ensureBrowser } from './puppeteer';
+import { migrate } from './store/migrate';
 export * from './crawler';
-export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-export { migrate } from './store/migrate';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
         var _a;
         merge(config, params);
         logger.info('Init crawler', { params, config });
         try {
+            yield migrate();
+            yield initQueue();
             yield ensureBrowser();
             if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield initCron();
package/lib/esm/puppeteer.js
CHANGED
@@ -48,6 +48,7 @@ export function ensureBrowser() {
         logger.debug('executablePath', executablePath);
         if (!executablePath || !fs.existsSync(executablePath)) {
             logger.info('start download browser', puppeteerConfig);
+            // @ts-ignore
             const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
                 try {
                     // @ts-ignore
package/lib/esm/site.js
CHANGED
@@ -9,13 +9,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 import uniq from 'lodash/uniq';
 import { randomUUID } from 'node:crypto';
-import pMap from 'p-map';
 import { config, logger } from './config';
-import {
+import { cronQueue } from './crawler';
 import { Snapshot } from './store';
 import { formatUrl, getSitemapList } from './utils';
 const crawlBlockletRunningMap = new Map();
-const crawlQueue = createCrawlQueue('cronJobs');
 function parseSitemapUrl(sitemapItem) {
     var _a;
     const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -24,6 +22,7 @@ function parseSitemapUrl(sitemapItem) {
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     var _b;
+    const { default: pMap } = yield import('p-map');
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -64,7 +63,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
             });
             crawlCount++;
             const jobId = randomUUID();
-
+            cronQueue.push({
                 id: jobId,
                 url,
                 lastModified: sitemapItem.lastmod,
package/lib/esm/utils.js
CHANGED
@@ -59,24 +59,36 @@ const botUserAgents = [
     /Googlebot/i,
     /GPTBot/i,
     /Applebot/i,
-    // AI bots
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
-    /
+    // AI bots - condensed patterns
+    /-AI\b/i, // Matches any string ending with "-AI"
+    /-Bot\b/i, // Matches any string ending with "-Bot"
+    /-Agent\b/i, // Matches any string ending with "-Agent"
+    /-User\b/i, // Matches any string ending with "-User"
+    /\bAI\b/i, // Matches standalone "AI" word
+    /\bGPT/i, // GPT variants
+    /\bClaude/i, // Claude variants
+    /\bBard\b/i, // Google Bard
+    /\bGemini\b/i, // Google Gemini
+    /\bLlama\b/i, // Meta Llama
+    /\bChatGPT/i, // ChatGPT variants
+    /\bOpenAI/i, // OpenAI
+    /\bAnthropic/i, // Anthropic
+    /\bPerplexity/i, // Perplexity
+    /\bCohere/i, // Cohere
+    /\bHuggingFace/i, // Hugging Face
+    /\bStability/i, // Stability AI
+    /\bMidjourney/i, // Midjourney
+    /\bDALL-E/i, // DALL-E
+    /\bMeta-External/i, // Meta external agents
+    /\bGoogle-/i, // Google agents
+    /\bLLM/i, // LLM
+    /\bBytespider/i, // ByteDance spider
+    /\bBaiduspider/i, // Baidu spider
+    /\bYandexBot/i, // Yandex bot
+    /\bDuckDuckBot/i, // DuckDuckGo bot
+    /\bLinkedInBot/i, // LinkedIn bot
+    /\bTwitterbot/i, // Twitter bot
+    /\bCCBot/i, // Common Crawl bot
 ];
 /**
  * A default set of file extensions for static assets that do not need to be proxied.
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.3.1",
+  "version": "1.3.3",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,12 +45,12 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.
-    "@abtnode/models": "^1.16.
-    "@abtnode/queue": "^1.16.
-    "@blocklet/logger": "^1.16.
+    "@abtnode/cron": "^1.16.46",
+    "@abtnode/models": "^1.16.46",
+    "@abtnode/queue": "^1.16.46",
+    "@blocklet/logger": "^1.16.46",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.
+    "@blocklet/sdk": "^1.16.46",
     "@sequelize/core": "7.0.0-alpha.46",
    "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",