@lobehub/chat 1.66.6 → 1.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/changelog/v1.json +9 -0
  3. package/locales/ar/models.json +9 -3
  4. package/locales/ar/plugin.json +12 -0
  5. package/locales/bg-BG/models.json +9 -3
  6. package/locales/bg-BG/plugin.json +12 -0
  7. package/locales/de-DE/models.json +9 -3
  8. package/locales/de-DE/plugin.json +12 -0
  9. package/locales/en-US/models.json +9 -3
  10. package/locales/en-US/plugin.json +12 -0
  11. package/locales/es-ES/models.json +9 -3
  12. package/locales/es-ES/plugin.json +12 -0
  13. package/locales/fa-IR/models.json +9 -3
  14. package/locales/fa-IR/plugin.json +12 -0
  15. package/locales/fr-FR/models.json +9 -3
  16. package/locales/fr-FR/plugin.json +12 -0
  17. package/locales/it-IT/models.json +9 -3
  18. package/locales/it-IT/plugin.json +12 -0
  19. package/locales/ja-JP/models.json +9 -3
  20. package/locales/ja-JP/plugin.json +12 -0
  21. package/locales/ko-KR/models.json +9 -3
  22. package/locales/ko-KR/plugin.json +12 -0
  23. package/locales/nl-NL/models.json +9 -3
  24. package/locales/nl-NL/plugin.json +12 -0
  25. package/locales/pl-PL/models.json +9 -3
  26. package/locales/pl-PL/plugin.json +12 -0
  27. package/locales/pt-BR/models.json +9 -3
  28. package/locales/pt-BR/plugin.json +12 -0
  29. package/locales/ru-RU/models.json +9 -3
  30. package/locales/ru-RU/plugin.json +12 -0
  31. package/locales/tr-TR/models.json +9 -3
  32. package/locales/tr-TR/plugin.json +12 -0
  33. package/locales/vi-VN/models.json +9 -3
  34. package/locales/vi-VN/plugin.json +12 -0
  35. package/locales/zh-CN/models.json +9 -3
  36. package/locales/zh-CN/plugin.json +12 -0
  37. package/locales/zh-TW/models.json +9 -3
  38. package/locales/zh-TW/plugin.json +12 -0
  39. package/package.json +5 -1
  40. package/packages/web-crawler/README.md +34 -0
  41. package/packages/web-crawler/package.json +13 -0
  42. package/packages/web-crawler/src/crawImpl/browserless.ts +62 -0
  43. package/packages/web-crawler/src/crawImpl/index.ts +11 -0
  44. package/packages/web-crawler/src/crawImpl/jina.ts +37 -0
  45. package/packages/web-crawler/src/crawImpl/naive.ts +84 -0
  46. package/packages/web-crawler/src/crawler.ts +66 -0
  47. package/packages/web-crawler/src/index.ts +2 -0
  48. package/packages/web-crawler/src/type.ts +42 -0
  49. package/packages/web-crawler/src/urlRules.ts +34 -0
  50. package/packages/web-crawler/src/utils/__snapshots__/htmlToMarkdown.test.ts.snap +638 -0
  51. package/packages/web-crawler/src/utils/appUrlRules.test.ts +26 -0
  52. package/packages/web-crawler/src/utils/appUrlRules.ts +40 -0
  53. package/packages/web-crawler/src/utils/errorType.ts +12 -0
  54. package/packages/web-crawler/src/utils/html/terms.html +1222 -0
  55. package/packages/web-crawler/src/utils/html/yingchao.html +1001 -0
  56. package/packages/web-crawler/src/utils/htmlToMarkdown.test.ts +35 -0
  57. package/packages/web-crawler/src/utils/htmlToMarkdown.ts +45 -0
  58. package/packages/web-crawler/tsconfig.json +20 -0
  59. package/pnpm-workspace.yaml +3 -0
  60. package/src/features/Conversation/Messages/Assistant/Tool/Render/CustomRender.tsx +4 -35
  61. package/src/features/Conversation/Messages/Assistant/Tool/Render/index.tsx +1 -1
  62. package/src/features/PluginsUI/Render/BuiltinType/index.tsx +3 -0
  63. package/src/features/PluginsUI/Render/index.tsx +1 -0
  64. package/src/features/Portal/Plugins/Body/ToolRender.tsx +1 -0
  65. package/src/locales/default/plugin.ts +12 -0
  66. package/src/server/routers/tools/search.ts +23 -0
  67. package/src/services/search.ts +8 -0
  68. package/src/store/chat/slices/builtinTool/actions/searXNG.ts +50 -0
  69. package/src/store/chat/slices/builtinTool/initialState.ts +1 -0
  70. package/src/tools/web-browsing/Portal/PageContent/index.tsx +190 -0
  71. package/src/tools/web-browsing/Portal/PageContents/index.tsx +23 -0
  72. package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/Video.tsx +1 -1
  73. package/src/tools/web-browsing/Portal/Search/index.tsx +69 -0
  74. package/src/tools/web-browsing/Portal/index.tsx +28 -64
  75. package/src/tools/web-browsing/Render/PageContent/Loading.tsx +57 -0
  76. package/src/tools/web-browsing/Render/PageContent/Result.tsx +142 -0
  77. package/src/tools/web-browsing/Render/PageContent/index.tsx +41 -0
  78. package/src/tools/web-browsing/Render/{SearchQuery → Search/SearchQuery}/SearchView.tsx +1 -1
  79. package/src/tools/web-browsing/Render/{SearchQuery → Search/SearchQuery}/index.tsx +1 -1
  80. package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/ShowMore.tsx +1 -1
  81. package/src/tools/web-browsing/Render/Search/index.tsx +62 -0
  82. package/src/tools/web-browsing/Render/index.tsx +35 -44
  83. package/src/tools/web-browsing/index.ts +43 -47
  84. package/src/tools/web-browsing/systemRole.ts +109 -0
  85. package/src/types/tool/builtin.ts +2 -0
  86. package/src/types/tool/crawler.ts +19 -0
  87. package/src/types/tool/search.ts +1 -0
  88. /package/src/tools/web-browsing/Portal/{Footer.tsx → Search/Footer.tsx} +0 -0
  89. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/CategoryAvatar.tsx +0 -0
  90. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/TitleExtra.tsx +0 -0
  91. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/SearchItem/index.tsx +0 -0
  92. /package/src/tools/web-browsing/Portal/{ResultList → Search/ResultList}/index.tsx +0 -0
  93. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/Form.tsx +0 -0
  94. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/SearchXNGIcon.tsx +0 -0
  95. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/index.tsx +0 -0
  96. /package/src/tools/web-browsing/Render/{ConfigForm → Search/ConfigForm}/style.tsx +0 -0
  97. /package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/SearchResultItem.tsx +0 -0
  98. /package/src/tools/web-browsing/Render/{SearchResult → Search/SearchResult}/index.tsx +0 -0
@@ -0,0 +1,62 @@
1
+ import qs from 'query-string';
2
+
3
+ import { CrawlImpl, CrawlSuccessResult } from '../type';
4
+ import { htmlToMarkdown } from '../utils/htmlToMarkdown';
5
+
6
+ const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
7
+ const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
8
+
9
+ class BrowserlessInitError extends Error {
10
+ constructor() {
11
+ super('`BROWSERLESS_URL` or `BROWSERLESS_TOKEN` are required');
12
+ this.name = 'BrowserlessInitError';
13
+ }
14
+ }
15
+
16
+ export const browserless: CrawlImpl = async (url, { filterOptions }) => {
17
+ if (!process.env.BROWSERLESS_URL && !process.env.BROWSERLESS_TOKEN) {
18
+ throw new BrowserlessInitError();
19
+ }
20
+
21
+ const input = {
22
+ gotoOptions: { waitUntil: 'networkidle2' },
23
+ url,
24
+ };
25
+
26
+ try {
27
+ const res = await fetch(
28
+ qs.stringifyUrl({ query: { token: BROWSERLESS_TOKEN }, url: `${BASE_URL}/content` }),
29
+ {
30
+ body: JSON.stringify(input),
31
+ headers: {
32
+ 'Content-Type': 'application/json',
33
+ },
34
+ method: 'POST',
35
+ },
36
+ );
37
+ const html = await res.text();
38
+
39
+ const result = htmlToMarkdown(html, { filterOptions, url });
40
+
41
+ if (
42
+ !!result.content &&
43
+ result.title &&
44
+ // Just a moment... 说明被 CF 拦截了
45
+ result.title.trim() !== 'Just a moment...'
46
+ ) {
47
+ return {
48
+ content: result.content,
49
+ contentType: 'text',
50
+ description: result?.excerpt,
51
+ length: result.length,
52
+ siteName: result?.siteName,
53
+ title: result?.title,
54
+ url,
55
+ } satisfies CrawlSuccessResult;
56
+ }
57
+ } catch (error) {
58
+ console.error(error);
59
+ }
60
+
61
+ return;
62
+ };
@@ -0,0 +1,11 @@
1
import { browserless } from './browserless';
import { jina } from './jina';
import { naive } from './naive';

// Registry of available crawl implementations, keyed by name.
// The Crawler iterates over a subset of these (in its configured order)
// until one of them returns a result.
export const crawlImpls = {
  browserless,
  jina,
  naive,
};

// Union of the registry keys: 'browserless' | 'jina' | 'naive'.
export type CrawlImplType = keyof typeof crawlImpls;
@@ -0,0 +1,37 @@
1
+ import { CrawlImpl } from '../type';
2
+
3
+ export const jina: CrawlImpl<{ apiKey?: string }> = async (url, params) => {
4
+ const token = params.apiKey ?? process.env.JINA_API_KEY;
5
+
6
+ try {
7
+ const res = await fetch(`https://r.jina.ai/${url}`, {
8
+ headers: {
9
+ 'Accept': 'application/json',
10
+ 'Authorization': token ? `Bearer ${token}` : '',
11
+ 'x-send-from': 'LobeChat Community',
12
+ },
13
+ });
14
+
15
+ if (res.ok) {
16
+ const json = await res.json();
17
+ if (json.code === 200) {
18
+ const result = json.data;
19
+ return {
20
+ content: result.content,
21
+ contentType: 'text',
22
+ description: result?.description,
23
+ length: result.content.length,
24
+ siteName: result?.siteName,
25
+ title: result?.title,
26
+ url: url,
27
+ };
28
+ }
29
+
30
+ throw json;
31
+ }
32
+ } catch (error) {
33
+ console.error(error);
34
+ }
35
+
36
+ return;
37
+ };
@@ -0,0 +1,84 @@
1
+ import { CrawlImpl, CrawlSuccessResult } from '../type';
2
+ import { NetworkConnectionError, PageNotFoundError } from '../utils/errorType';
3
+ import { htmlToMarkdown } from '../utils/htmlToMarkdown';
4
+
5
// Browser-like request headers so that plain `fetch` requests are less likely
// to be rejected by servers that filter non-browser traffic.
const mixinHeaders = {
  // Content types a real browser advertises
  'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  // Accepted response encodings
  'Accept-Encoding': 'gzip, deflate, br',
  // Preferred languages
  'Accept-Language': 'en-US,en;q=0.9,zh;q=0.8',
  // Cache control
  'Cache-Control': 'max-age=0',
  // Connection type
  'Connection': 'keep-alive',
  // Pretend the navigation originated from a Google search
  'Referer': 'https://www.google.com/',
  // Ask the server to upgrade insecure sub-requests
  'Upgrade-Insecure-Requests': '1',
  // Mimic a real desktop Chrome User-Agent
  'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  // Chromium client-hint and fetch-metadata headers, making the request look
  // like a top-level document navigation from desktop Chrome.
  // (NOTE: the original comment claimed these prevent CSRF — they do not;
  // they are UA client hints / Sec-Fetch metadata.)
  'sec-ch-ua': '"Google Chrome";v="121", "Not A(Brand";v="99", "Chromium";v="121"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"Windows"',
  'sec-fetch-dest': 'document',
  'sec-fetch-mode': 'navigate',
  'sec-fetch-site': 'none',
  'sec-fetch-user': '?1',
};
33
+
34
+ export const naive: CrawlImpl = async (url, { filterOptions }) => {
35
+ let res: Response;
36
+
37
+ try {
38
+ res = await fetch(url, { headers: mixinHeaders });
39
+ } catch (e) {
40
+ if ((e as Error).message === 'fetch failed') {
41
+ throw new NetworkConnectionError();
42
+ }
43
+ throw e;
44
+ }
45
+
46
+ if (res.status === 404) {
47
+ throw new PageNotFoundError(res.statusText);
48
+ }
49
+ const type = res.headers.get('content-type');
50
+
51
+ if (type?.includes('application/json')) {
52
+ const json = await res.json();
53
+ return {
54
+ content: JSON.stringify(json, null, 2),
55
+ contentType: 'json',
56
+ length: json.length,
57
+ url,
58
+ } satisfies CrawlSuccessResult;
59
+ }
60
+
61
+ try {
62
+ const html = await res.text();
63
+
64
+ const result = htmlToMarkdown(html, { filterOptions, url });
65
+
66
+ // if the content is not empty or blocked
67
+ // just return
68
+ if (!!result.content && result.title !== 'Just a moment...') {
69
+ return {
70
+ content: result.content,
71
+ contentType: 'text',
72
+ description: result?.excerpt,
73
+ length: result.length,
74
+ siteName: result?.siteName,
75
+ title: result?.title,
76
+ url,
77
+ } satisfies CrawlSuccessResult;
78
+ }
79
+ } catch (error) {
80
+ console.error(error);
81
+ }
82
+
83
+ return;
84
+ };
@@ -0,0 +1,66 @@
1
+ import { CrawlImplType, crawlImpls } from './crawImpl';
2
+ import { CrawlUrlRule } from './type';
3
+ import { crawUrlRules } from './urlRules';
4
+ import { applyUrlRules } from './utils/appUrlRules';
5
+
6
+ export class Crawler {
7
+ impls = ['naive', 'jina', 'browserless'] as const;
8
+
9
+ /**
10
+ * 爬取网页内容
11
+ * @param options 爬取选项
12
+ */
13
+ async crawl({
14
+ url,
15
+ impls,
16
+ filterOptions: userFilterOptions,
17
+ }: {
18
+ filterOptions?: CrawlUrlRule['filterOptions'];
19
+ impls?: string[];
20
+ url: string;
21
+ }) {
22
+ // 应用URL规则
23
+ const { transformedUrl, filterOptions: ruleFilterOptions } = applyUrlRules(url, crawUrlRules);
24
+
25
+ // 合并用户提供的过滤选项和规则中的过滤选项,用户选项优先
26
+ const mergedFilterOptions = {
27
+ ...ruleFilterOptions,
28
+ ...userFilterOptions,
29
+ };
30
+
31
+ let finalError: Error | undefined;
32
+
33
+ const finalImpls = impls
34
+ ? (impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
35
+ : this.impls;
36
+
37
+ // 按照内置的实现顺序依次尝试
38
+ for (const impl of finalImpls) {
39
+ try {
40
+ const res = await crawlImpls[impl](transformedUrl, { filterOptions: mergedFilterOptions });
41
+
42
+ if (res)
43
+ return {
44
+ crawler: impl,
45
+ data: res,
46
+ originalUrl: url,
47
+ transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
48
+ };
49
+ } catch (error) {
50
+ console.error(error);
51
+ finalError = error as Error;
52
+ }
53
+ }
54
+
55
+ const errorType = finalError?.name || 'UnknownError';
56
+ const errorMessage = finalError?.message;
57
+
58
+ return {
59
+ content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
60
+ errorMessage: errorMessage,
61
+ errorType,
62
+ originalUrl: url,
63
+ transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
64
+ };
65
+ }
66
+ }
@@ -0,0 +1,2 @@
1
// Public entry point of the web-crawler package: the Crawler orchestrator
// plus all shared result/option types.
export { Crawler } from './crawler';
export * from './type';
@@ -0,0 +1,42 @@
1
// Payload produced by a crawl implementation on success.
export interface CrawlSuccessResult {
  // Extracted page content (markdown for HTML pages, pretty-printed for JSON)
  content?: string;
  contentType: 'text' | 'json';
  description?: string;
  length?: number;
  siteName?: string;
  title?: string;
  url: string;
}

// Payload returned when every crawl implementation failed.
export interface CrawlErrorResult {
  content: string;
  errorMessage: string;
  url: string;
}

// Options controlling HTML-to-markdown content filtering.
export interface FilterOptions {
  // Whether to run Readability-style article extraction
  enableReadability?: boolean;

  // presumably strips markup down to plain text — TODO confirm against htmlToMarkdown
  pureText?: boolean;
}

type CrawlImplParams<T> = T & {
  filterOptions: FilterOptions;
};

// A single crawl strategy: resolves with a success result, or `undefined`
// when the page could not be crawled (so the caller can try the next one).
export type CrawlImpl<Params = object> = (
  url: string,
  params: CrawlImplParams<Params>,
) => Promise<CrawlSuccessResult | undefined>;

// A per-URL rule: which URLs it matches, how to rewrite them, and which
// filter options to apply when crawling them.
export interface CrawlUrlRule {
  // Content filter configuration (optional)
  filterOptions?: FilterOptions;
  // Whether urlPattern is a regular expression (defaults to glob matching)
  isRegex?: boolean;
  // URL match pattern; supports glob or regex
  urlPattern: string;
  // URL transform template (optional); when given, matched URLs are rewritten
  urlTransform?: string;
}
@@ -0,0 +1,34 @@
1
import { CrawlUrlRule } from './type';

// Built-in per-site URL rules applied before crawling.
export const crawUrlRules: CrawlUrlRule[] = [
  // GitHub source files: fetch the raw file instead of the HTML viewer page
  {
    filterOptions: {
      enableReadability: false,
    },
    urlPattern: 'https://github.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)',
    urlTransform: 'https://github.com/$1/$2/raw/refs/heads/$3/$4',
  },
  {
    filterOptions: {
      enableReadability: false,
    },
    // GitHub discussion
    urlPattern: 'https://github.com/(.*)/discussions/(.*)',
  },
  {
    // Medium articles: read via the Scribe.rip mirror
    urlPattern: 'https://medium.com/(.*)',
    urlTransform: 'https://scribe.rip/$1',
  },

  // Sports-data site rules
  {
    filterOptions: {
      // Disable Readability for standings tables and convert them to plain text
      enableReadability: false,
      pureText: true,
    },
    urlPattern: 'https://www.qiumiwu.com/standings/(.*)',
  },
];