@lobehub/chat 1.66.5 → 1.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/CHANGELOG.md +50 -0
  2. package/README.md +1 -1
  3. package/README.zh-CN.md +1 -1
  4. package/changelog/v1.json +18 -0
  5. package/locales/ar/models.json +9 -3
  6. package/locales/ar/plugin.json +12 -0
  7. package/locales/bg-BG/models.json +9 -3
  8. package/locales/bg-BG/plugin.json +12 -0
  9. package/locales/de-DE/models.json +9 -3
  10. package/locales/de-DE/plugin.json +12 -0
  11. package/locales/en-US/models.json +9 -3
  12. package/locales/en-US/plugin.json +12 -0
  13. package/locales/es-ES/models.json +9 -3
  14. package/locales/es-ES/plugin.json +12 -0
  15. package/locales/fa-IR/models.json +9 -3
  16. package/locales/fa-IR/plugin.json +12 -0
  17. package/locales/fr-FR/models.json +9 -3
  18. package/locales/fr-FR/plugin.json +12 -0
  19. package/locales/it-IT/models.json +9 -3
  20. package/locales/it-IT/plugin.json +12 -0
  21. package/locales/ja-JP/models.json +9 -3
  22. package/locales/ja-JP/plugin.json +12 -0
  23. package/locales/ko-KR/models.json +9 -3
  24. package/locales/ko-KR/plugin.json +12 -0
  25. package/locales/nl-NL/models.json +9 -3
  26. package/locales/nl-NL/plugin.json +12 -0
  27. package/locales/pl-PL/models.json +9 -3
  28. package/locales/pl-PL/plugin.json +12 -0
  29. package/locales/pt-BR/models.json +9 -3
  30. package/locales/pt-BR/plugin.json +12 -0
  31. package/locales/ru-RU/models.json +9 -3
  32. package/locales/ru-RU/plugin.json +12 -0
  33. package/locales/tr-TR/models.json +9 -3
  34. package/locales/tr-TR/plugin.json +12 -0
  35. package/locales/vi-VN/models.json +9 -3
  36. package/locales/vi-VN/plugin.json +12 -0
  37. package/locales/zh-CN/models.json +9 -3
  38. package/locales/zh-CN/plugin.json +12 -0
  39. package/locales/zh-TW/models.json +9 -3
  40. package/locales/zh-TW/plugin.json +12 -0
  41. package/package.json +10 -6
  42. package/packages/web-crawler/README.md +34 -0
  43. package/packages/web-crawler/package.json +13 -0
  44. package/packages/web-crawler/src/crawImpl/browserless.ts +62 -0
  45. package/packages/web-crawler/src/crawImpl/index.ts +11 -0
  46. package/packages/web-crawler/src/crawImpl/jina.ts +37 -0
  47. package/packages/web-crawler/src/crawImpl/naive.ts +84 -0
  48. package/packages/web-crawler/src/crawler.ts +66 -0
  49. package/packages/web-crawler/src/index.ts +2 -0
  50. package/packages/web-crawler/src/type.ts +42 -0
  51. package/packages/web-crawler/src/urlRules.ts +34 -0
  52. package/packages/web-crawler/src/utils/__snapshots__/htmlToMarkdown.test.ts.snap +638 -0
  53. package/packages/web-crawler/src/utils/appUrlRules.test.ts +26 -0
  54. package/packages/web-crawler/src/utils/appUrlRules.ts +40 -0
  55. package/packages/web-crawler/src/utils/errorType.ts +12 -0
  56. package/packages/web-crawler/src/utils/html/terms.html +1222 -0
  57. package/packages/web-crawler/src/utils/html/yingchao.html +1001 -0
  58. package/packages/web-crawler/src/utils/htmlToMarkdown.test.ts +35 -0
  59. package/packages/web-crawler/src/utils/htmlToMarkdown.ts +45 -0
  60. package/packages/web-crawler/tsconfig.json +20 -0
  61. package/pnpm-workspace.yaml +3 -0
  62. package/src/config/aiModels/openai.ts +29 -5
  63. package/src/database/server/models/__tests__/message.test.ts +2 -2
  64. package/src/features/Conversation/Messages/Assistant/Tool/Render/CustomRender.tsx +4 -35
  65. package/src/features/Conversation/Messages/Assistant/Tool/Render/index.tsx +1 -1
  66. package/src/features/PluginsUI/Render/BuiltinType/index.tsx +3 -0
  67. package/src/features/PluginsUI/Render/index.tsx +1 -0
  68. package/src/features/Portal/Plugins/Body/ToolRender.tsx +1 -0
  69. package/src/locales/default/plugin.ts +12 -0
  70. package/src/server/routers/tools/search.ts +23 -0
  71. package/src/services/search.ts +8 -0
  72. package/src/store/chat/slices/builtinTool/actions/searXNG.ts +50 -0
  73. package/src/store/chat/slices/builtinTool/initialState.ts +1 -0
  74. package/src/tools/web-browsing/Portal/PageContent/index.tsx +190 -0
  75. package/src/tools/web-browsing/Portal/PageContents/index.tsx +23 -0
  76. package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/Video.tsx +1 -1
  77. package/src/tools/web-browsing/Portal/Search/index.tsx +69 -0
  78. package/src/tools/web-browsing/Portal/index.tsx +28 -64
  79. package/src/tools/web-browsing/Render/PageContent/Loading.tsx +57 -0
  80. package/src/tools/web-browsing/Render/PageContent/Result.tsx +142 -0
  81. package/src/tools/web-browsing/Render/PageContent/index.tsx +41 -0
  82. package/src/tools/web-browsing/Render/{SearchQuery → Search/SearchQuery}/SearchView.tsx +1 -1
  83. package/src/tools/web-browsing/Render/{SearchQuery → Search/SearchQuery}/index.tsx +1 -1
  84. package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/ShowMore.tsx +1 -1
  85. package/src/tools/web-browsing/Render/Search/index.tsx +62 -0
  86. package/src/tools/web-browsing/Render/index.tsx +35 -44
  87. package/src/tools/web-browsing/index.ts +43 -47
  88. package/src/tools/web-browsing/systemRole.ts +109 -0
  89. package/src/types/tool/builtin.ts +2 -0
  90. package/src/types/tool/crawler.ts +19 -0
  91. package/src/types/tool/search.ts +1 -0
  92. /package/src/tools/web-browsing/Portal/{Footer.tsx → Search/Footer.tsx} +0 -0
  93. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/CategoryAvatar.tsx +0 -0
  94. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/TitleExtra.tsx +0 -0
  95. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/index.tsx +0 -0
  96. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/index.tsx +0 -0
  97. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/Form.tsx +0 -0
  98. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/SearchXNGIcon.tsx +0 -0
  99. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/index.tsx +0 -0
  100. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/style.tsx +0 -0
  101. /package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/SearchResultItem.tsx +0 -0
  102. /package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/index.tsx +0 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lobehub/chat",
3
- "version": "1.66.5",
3
+ "version": "1.67.0",
4
4
  "description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
5
5
  "keywords": [
6
6
  "framework",
@@ -25,6 +25,9 @@
25
25
  "license": "MIT",
26
26
  "author": "LobeHub <i@lobehub.com>",
27
27
  "sideEffects": false,
28
+ "workspaces": [
29
+ "packages/*"
30
+ ],
28
31
  "scripts": {
29
32
  "build": "next build",
30
33
  "build:analyze": "ANALYZE=true next build",
@@ -105,7 +108,7 @@
105
108
  "@ant-design/icons": "^5.5.2",
106
109
  "@ant-design/pro-components": "^2.8.3",
107
110
  "@anthropic-ai/sdk": "^0.37.0",
108
- "@auth/core": "^0.37.4",
111
+ "@auth/core": "^0.38.0",
109
112
  "@aws-sdk/client-bedrock-runtime": "^3.723.0",
110
113
  "@aws-sdk/client-s3": "^3.723.0",
111
114
  "@aws-sdk/s3-request-presigner": "^3.723.0",
@@ -124,6 +127,7 @@
124
127
  "@icons-pack/react-simple-icons": "9.6.0",
125
128
  "@khmyznikov/pwa-install": "0.3.9",
126
129
  "@langchain/community": "^0.3.22",
130
+ "@lobechat/web-crawler": "workspace:*",
127
131
  "@lobehub/charts": "^1.12.0",
128
132
  "@lobehub/chat-plugin-sdk": "^1.32.4",
129
133
  "@lobehub/chat-plugins-gateway": "^1.9.0",
@@ -156,7 +160,7 @@
156
160
  "debug": "^4.4.0",
157
161
  "dexie": "^3.2.7",
158
162
  "diff": "^7.0.0",
159
- "drizzle-orm": "^0.39.0",
163
+ "drizzle-orm": "^0.40.0",
160
164
  "drizzle-zod": "^0.5.1",
161
165
  "fast-deep-equal": "^3.1.3",
162
166
  "file-type": "^20.0.0",
@@ -174,7 +178,7 @@
174
178
  "langfuse": "3.29.1",
175
179
  "langfuse-core": "3.29.1",
176
180
  "lodash-es": "^4.17.21",
177
- "lucide-react": "^0.475.0",
181
+ "lucide-react": "^0.477.0",
178
182
  "mammoth": "^1.9.0",
179
183
  "mdast-util-to-markdown": "^2.1.2",
180
184
  "modern-screenshot": "^4.5.5",
@@ -210,7 +214,7 @@
210
214
  "react-lazy-load": "^4.0.1",
211
215
  "react-pdf": "^9.2.1",
212
216
  "react-rnd": "^10.4.14",
213
- "react-scan": "^0.1.0",
217
+ "react-scan": "^0.2.0",
214
218
  "react-virtuoso": "^4.12.3",
215
219
  "react-wrap-balancer": "^1.1.1",
216
220
  "remark": "^15.0.1",
@@ -320,7 +324,7 @@
320
324
  "vitest": "~1.2.2",
321
325
  "vitest-canvas-mock": "^0.3.3"
322
326
  },
323
- "packageManager": "pnpm@9.15.5",
327
+ "packageManager": "pnpm@9.15.6",
324
328
  "publishConfig": {
325
329
  "access": "public",
326
330
  "registry": "https://registry.npmjs.org"
@@ -0,0 +1,34 @@
1
+ # @lobechat/web-crawler
2
+
3
+ LobeChat 内置的网页抓取模块,用于从网页中提取结构化内容,并转换为 Markdown 格式。
4
+
5
+ ## 📝 简介
6
+
7
+ `@lobechat/web-crawler` 是 LobeChat 项目的内部组件,专门负责网页内容的抓取和处理。它能够智能地从各种网页中提取有意义的内容,剔除广告、导航栏等干扰元素,并将结果转换为结构良好的 Markdown 文本。
8
+
9
+ ## 🔍 主要功能
10
+
11
+ - **网页内容抓取**:支持从各类网站获取原始 HTML 内容
12
+ - **智能内容提取**:使用 Mozilla 的 Readability 算法识别页面中的主要内容
13
+ - **降级处理机制**:当标准抓取失败时,自动切换到 Browserless.io 服务进行渲染抓取(需要自行配置环境变量)
14
+ - **Markdown 转换**:将提取的 HTML 内容转换为易于 AI 处理的 Markdown 格式
15
+
16
+ ## 🛠️ 技术实现
17
+
18
+ 该模块主要依赖以下技术:
19
+
20
+ - **@mozilla/readability**:提供了强大的内容提取算法
21
+ - **happy-dom**:轻量级的服务端 DOM 实现
22
+ - **node-html-markdown**:高效的 HTML 到 Markdown 转换工具
23
+
24
+ ## 🤝 共建改进
25
+
26
+ 由于网页结构的多样性和复杂性,内容提取可能会遇到各种挑战。如果您发现某些网站的抓取效果不佳,欢迎通过以下方式参与改进:
27
+
28
+ 1. 提交具体的问题网址和期望的输出结果
29
+ 2. 分享您对特定网站类型的处理经验
30
+ 3. 提出针对性的算法或配置调整建议
31
+
32
+ ## 📌 注意事项
33
+
34
+ 这是 LobeHub 的内部模块(`"private": true`),不作为独立包发布使用。它专为 LobeChat 的特定需求设计,与其他系统组件紧密集成。
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "@lobechat/web-crawler",
3
+ "version": "1.0.0",
4
+ "private": true,
5
+ "main": "src/index.ts",
6
+ "types": "src/index.ts",
7
+ "dependencies": {
8
+ "@mozilla/readability": "^0.5.0",
9
+ "happy-dom": "^17.0.0",
10
+ "node-html-markdown": "^1.3.0",
11
+ "query-string": "^9.1.1"
12
+ }
13
+ }
@@ -0,0 +1,62 @@
1
+ import qs from 'query-string';
2
+
3
+ import { CrawlImpl, CrawlSuccessResult } from '../type';
4
+ import { htmlToMarkdown } from '../utils/htmlToMarkdown';
5
+
6
+ const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
7
+ const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
8
+
9
+ class BrowserlessInitError extends Error {
10
+ constructor() {
11
+ super('`BROWSERLESS_URL` or `BROWSERLESS_TOKEN` are required');
12
+ this.name = 'BrowserlessInitError';
13
+ }
14
+ }
15
+
16
+ export const browserless: CrawlImpl = async (url, { filterOptions }) => {
17
+ if (!process.env.BROWSERLESS_URL && !process.env.BROWSERLESS_TOKEN) {
18
+ throw new BrowserlessInitError();
19
+ }
20
+
21
+ const input = {
22
+ gotoOptions: { waitUntil: 'networkidle2' },
23
+ url,
24
+ };
25
+
26
+ try {
27
+ const res = await fetch(
28
+ qs.stringifyUrl({ query: { token: BROWSERLESS_TOKEN }, url: `${BASE_URL}/content` }),
29
+ {
30
+ body: JSON.stringify(input),
31
+ headers: {
32
+ 'Content-Type': 'application/json',
33
+ },
34
+ method: 'POST',
35
+ },
36
+ );
37
+ const html = await res.text();
38
+
39
+ const result = htmlToMarkdown(html, { filterOptions, url });
40
+
41
+ if (
42
+ !!result.content &&
43
+ result.title &&
44
+ // Just a moment... 说明被 CF 拦截了
45
+ result.title.trim() !== 'Just a moment...'
46
+ ) {
47
+ return {
48
+ content: result.content,
49
+ contentType: 'text',
50
+ description: result?.excerpt,
51
+ length: result.length,
52
+ siteName: result?.siteName,
53
+ title: result?.title,
54
+ url,
55
+ } satisfies CrawlSuccessResult;
56
+ }
57
+ } catch (error) {
58
+ console.error(error);
59
+ }
60
+
61
+ return;
62
+ };
@@ -0,0 +1,11 @@
1
+ import { browserless } from './browserless';
2
+ import { jina } from './jina';
3
+ import { naive } from './naive';
4
+
5
+ export const crawlImpls = {
6
+ browserless,
7
+ jina,
8
+ naive,
9
+ };
10
+
11
+ export type CrawlImplType = keyof typeof crawlImpls;
@@ -0,0 +1,37 @@
1
+ import { CrawlImpl } from '../type';
2
+
3
+ export const jina: CrawlImpl<{ apiKey?: string }> = async (url, params) => {
4
+ const token = params.apiKey ?? process.env.JINA_API_KEY;
5
+
6
+ try {
7
+ const res = await fetch(`https://r.jina.ai/${url}`, {
8
+ headers: {
9
+ 'Accept': 'application/json',
10
+ 'Authorization': token ? `Bearer ${token}` : '',
11
+ 'x-send-from': 'LobeChat Community',
12
+ },
13
+ });
14
+
15
+ if (res.ok) {
16
+ const json = await res.json();
17
+ if (json.code === 200) {
18
+ const result = json.data;
19
+ return {
20
+ content: result.content,
21
+ contentType: 'text',
22
+ description: result?.description,
23
+ length: result.content.length,
24
+ siteName: result?.siteName,
25
+ title: result?.title,
26
+ url: url,
27
+ };
28
+ }
29
+
30
+ throw json;
31
+ }
32
+ } catch (error) {
33
+ console.error(error);
34
+ }
35
+
36
+ return;
37
+ };
@@ -0,0 +1,84 @@
1
+ import { CrawlImpl, CrawlSuccessResult } from '../type';
2
+ import { NetworkConnectionError, PageNotFoundError } from '../utils/errorType';
3
+ import { htmlToMarkdown } from '../utils/htmlToMarkdown';
4
+
5
+ const mixinHeaders = {
6
+ // 接受的内容类型
7
+ 'Accept':
8
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
9
+ // 接受的编码方式
10
+ 'Accept-Encoding': 'gzip, deflate, br',
11
+ // 接受的语言
12
+ 'Accept-Language': 'en-US,en;q=0.9,zh;q=0.8',
13
+ // 缓存控制
14
+ 'Cache-Control': 'max-age=0',
15
+ // 连接类型
16
+ 'Connection': 'keep-alive',
17
+ // 表明请求来自哪个站点
18
+ 'Referer': 'https://www.google.com/',
19
+ // 升级不安全请求
20
+ 'Upgrade-Insecure-Requests': '1',
21
+ // 模拟真实浏览器的 User-Agent
22
+ 'User-Agent':
23
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
24
+ // 防止跨站请求伪造
25
+ 'sec-ch-ua': '"Google Chrome";v="121", "Not A(Brand";v="99", "Chromium";v="121"',
26
+ 'sec-ch-ua-mobile': '?0',
27
+ 'sec-ch-ua-platform': '"Windows"',
28
+ 'sec-fetch-dest': 'document',
29
+ 'sec-fetch-mode': 'navigate',
30
+ 'sec-fetch-site': 'none',
31
+ 'sec-fetch-user': '?1',
32
+ };
33
+
34
+ export const naive: CrawlImpl = async (url, { filterOptions }) => {
35
+ let res: Response;
36
+
37
+ try {
38
+ res = await fetch(url, { headers: mixinHeaders });
39
+ } catch (e) {
40
+ if ((e as Error).message === 'fetch failed') {
41
+ throw new NetworkConnectionError();
42
+ }
43
+ throw e;
44
+ }
45
+
46
+ if (res.status === 404) {
47
+ throw new PageNotFoundError(res.statusText);
48
+ }
49
+ const type = res.headers.get('content-type');
50
+
51
+ if (type?.includes('application/json')) {
52
+ const json = await res.json();
53
+ return {
54
+ content: JSON.stringify(json, null, 2),
55
+ contentType: 'json',
56
+ length: json.length,
57
+ url,
58
+ } satisfies CrawlSuccessResult;
59
+ }
60
+
61
+ try {
62
+ const html = await res.text();
63
+
64
+ const result = htmlToMarkdown(html, { filterOptions, url });
65
+
66
+ // if the content is not empty or blocked
67
+ // just return
68
+ if (!!result.content && result.title !== 'Just a moment...') {
69
+ return {
70
+ content: result.content,
71
+ contentType: 'text',
72
+ description: result?.excerpt,
73
+ length: result.length,
74
+ siteName: result?.siteName,
75
+ title: result?.title,
76
+ url,
77
+ } satisfies CrawlSuccessResult;
78
+ }
79
+ } catch (error) {
80
+ console.error(error);
81
+ }
82
+
83
+ return;
84
+ };
@@ -0,0 +1,66 @@
1
+ import { CrawlImplType, crawlImpls } from './crawImpl';
2
+ import { CrawlUrlRule } from './type';
3
+ import { crawUrlRules } from './urlRules';
4
+ import { applyUrlRules } from './utils/appUrlRules';
5
+
6
+ export class Crawler {
7
+ impls = ['naive', 'jina', 'browserless'] as const;
8
+
9
+ /**
10
+ * 爬取网页内容
11
+ * @param options 爬取选项
12
+ */
13
+ async crawl({
14
+ url,
15
+ impls,
16
+ filterOptions: userFilterOptions,
17
+ }: {
18
+ filterOptions?: CrawlUrlRule['filterOptions'];
19
+ impls?: string[];
20
+ url: string;
21
+ }) {
22
+ // 应用URL规则
23
+ const { transformedUrl, filterOptions: ruleFilterOptions } = applyUrlRules(url, crawUrlRules);
24
+
25
+ // 合并用户提供的过滤选项和规则中的过滤选项,用户选项优先
26
+ const mergedFilterOptions = {
27
+ ...ruleFilterOptions,
28
+ ...userFilterOptions,
29
+ };
30
+
31
+ let finalError: Error | undefined;
32
+
33
+ const finalImpls = impls
34
+ ? (impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
35
+ : this.impls;
36
+
37
+ // 按照内置的实现顺序依次尝试
38
+ for (const impl of finalImpls) {
39
+ try {
40
+ const res = await crawlImpls[impl](transformedUrl, { filterOptions: mergedFilterOptions });
41
+
42
+ if (res)
43
+ return {
44
+ crawler: impl,
45
+ data: res,
46
+ originalUrl: url,
47
+ transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
48
+ };
49
+ } catch (error) {
50
+ console.error(error);
51
+ finalError = error as Error;
52
+ }
53
+ }
54
+
55
+ const errorType = finalError?.name || 'UnknownError';
56
+ const errorMessage = finalError?.message;
57
+
58
+ return {
59
+ content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
60
+ errorMessage: errorMessage,
61
+ errorType,
62
+ originalUrl: url,
63
+ transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
64
+ };
65
+ }
66
+ }
@@ -0,0 +1,2 @@
1
+ export { Crawler } from './crawler';
2
+ export * from './type';
@@ -0,0 +1,42 @@
1
+ export interface CrawlSuccessResult {
2
+ content?: string;
3
+ contentType: 'text' | 'json';
4
+ description?: string;
5
+ length?: number;
6
+ siteName?: string;
7
+ title?: string;
8
+ url: string;
9
+ }
10
+
11
+ export interface CrawlErrorResult {
12
+ content: string;
13
+ errorMessage: string;
14
+ url: string;
15
+ }
16
+
17
+ export interface FilterOptions {
18
+ // 是否启用Readability
19
+ enableReadability?: boolean;
20
+
21
+ pureText?: boolean;
22
+ }
23
+
24
+ type CrawlImplParams<T> = T & {
25
+ filterOptions: FilterOptions;
26
+ };
27
+
28
+ export type CrawlImpl<Params = object> = (
29
+ url: string,
30
+ params: CrawlImplParams<Params>,
31
+ ) => Promise<CrawlSuccessResult | undefined>;
32
+
33
+ export interface CrawlUrlRule {
34
+ // 内容过滤配置(可选)
35
+ filterOptions?: FilterOptions;
36
+ // 是否使用正则表达式匹配(默认为glob模式)
37
+ isRegex?: boolean;
38
+ // URL匹配模式,支持glob模式或正则表达式
39
+ urlPattern: string;
40
+ // URL转换模板(可选),如果提供则进行URL转换
41
+ urlTransform?: string;
42
+ }
@@ -0,0 +1,34 @@
1
+ import { CrawlUrlRule } from './type';
2
+
3
+ export const crawUrlRules: CrawlUrlRule[] = [
4
+ // github 源码解析
5
+ {
6
+ filterOptions: {
7
+ enableReadability: false,
8
+ },
9
+ urlPattern: 'https://github.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)',
10
+ urlTransform: 'https://github.com/$1/$2/raw/refs/heads/$3/$4',
11
+ },
12
+ {
13
+ filterOptions: {
14
+ enableReadability: false,
15
+ },
16
+ // GitHub discussion
17
+ urlPattern: 'https://github.com/(.*)/discussions/(.*)',
18
+ },
19
+ {
20
+ // Medium 文章转换为 Scribe.rip
21
+ urlPattern: 'https://medium.com/(.*)',
22
+ urlTransform: 'https://scribe.rip/$1',
23
+ },
24
+
25
+ // 体育数据网站规则
26
+ {
27
+ filterOptions: {
28
+ // 对体育数据表格禁用 Readability 并且转换为纯文本
29
+ enableReadability: false,
30
+ pureText: true,
31
+ },
32
+ urlPattern: 'https://www.qiumiwu.com/standings/(.*)',
33
+ },
34
+ ];