@lobehub/chat 1.55.4 → 1.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,31 @@
2
2
 
3
3
  # Changelog
4
4
 
5
+ ## [Version 1.56.0](https://github.com/lobehub/lobe-chat/compare/v1.55.4...v1.56.0)
6
+
7
+ <sup>Released on **2025-02-15**</sup>
8
+
9
+ #### ✨ Features
10
+
11
+ - **misc**: Add configurable PDF processing method with Unstructured.
12
+
13
+ <br/>
14
+
15
+ <details>
16
+ <summary><kbd>Improvements and Fixes</kbd></summary>
17
+
18
+ #### What's improved
19
+
20
+ - **misc**: Add configurable PDF processing method with Unstructured, closes [#5927](https://github.com/lobehub/lobe-chat/issues/5927) ([35fa3ee](https://github.com/lobehub/lobe-chat/commit/35fa3ee))
21
+
22
+ </details>
23
+
24
+ <div align="right">
25
+
26
+ [![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top)
27
+
28
+ </div>
29
+
5
30
  ### [Version 1.55.4](https://github.com/lobehub/lobe-chat/compare/v1.55.3...v1.55.4)
6
31
 
7
32
  <sup>Released on **2025-02-15**</sup>
package/changelog/v1.json CHANGED
@@ -1,4 +1,13 @@
1
1
  [
2
+ {
3
+ "children": {
4
+ "features": [
5
+ "Add configurable PDF processing method with Unstructured."
6
+ ]
7
+ },
8
+ "date": "2025-02-15",
9
+ "version": "1.56.0"
10
+ },
2
11
  {
3
12
  "children": {
4
13
  "improvements": [
@@ -112,18 +112,34 @@ services:
112
112
  echo '⚠️Warining: Unable to fetch OIDC configuration from Casdoor'
113
113
  echo 'Request URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
114
114
  echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
115
+ echo ''
116
+ echo '⚠️注意:无法从 Casdoor 获取 OIDC 配置'
117
+ echo '请求 URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
118
+ echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
119
+ echo ''
115
120
  else
116
121
  if ! wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep 'issuer' | grep ${AUTH_CASDOOR_ISSUER}; then
117
122
  printf '❌Error: The Auth issuer is conflict, Issuer in OIDC configuration is: %s' \$(wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep -E 'issuer.*' | awk -F '\"' '{print \$4}')
118
123
  echo ' , but the issuer in .env file is: ${AUTH_CASDOOR_ISSUER} '
119
124
  echo 'Request URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
120
125
  echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
126
+ echo ''
127
+ printf '❌错误:Auth 的 issuer 冲突,OIDC 配置中的 issuer 是:%s' \$(wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep -E 'issuer.*' | awk -F '\"' '{print \$4}')
128
+ echo ' , 但 .env 文件中的 issuer 是:${AUTH_CASDOOR_ISSUER} '
129
+ echo '请求 URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
130
+ echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
131
+ echo ''
121
132
  fi
122
133
  fi
123
134
  if [ $(wget --timeout=5 --spider --server-response ${S3_ENDPOINT}/minio/health/live 2>&1 | grep -c 'HTTP/1.1 200 OK') -eq 0 ]; then
124
135
  echo '⚠️Warining: Unable to fetch MinIO health status'
125
136
  echo 'Request URL: ${S3_ENDPOINT}/minio/health/live'
126
137
  echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
138
+ echo ''
139
+ echo '⚠️注意:无法获取 MinIO 健康状态'
140
+ echo '请求 URL: ${S3_ENDPOINT}/minio/health/live'
141
+ echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
142
+ echo ''
127
143
  fi
128
144
  wait \$LOBE_PID
129
145
  "
@@ -109,7 +109,7 @@ If you are deploying using a public network, the following assumptions apply:
109
109
 
110
110
  Go to `Admin` -> `Webhooks`, add a webhook, and fill in the following fields:
111
111
 
112
- - URL: `https://lobe.example.com/api/auth/webhooks/casdoor`
112
+ - URL: `https://lobe.example.com/api/webhooks/casdoor`
113
113
  - Method: `POST`
114
114
  - Content Type: `application/json`
115
115
  - Headers: `casdoor-secret`: `Your Webhook Secret`
@@ -98,7 +98,7 @@ tags:
98
98
 
99
99
  前往 `管理工具` -> `Webhooks`,添加一个 Webhook,并填写以下字段:
100
100
 
101
- - 链接:`http://lobe.example.com/api/auth/webhooks/casdoor`
101
+ - 链接:`http://lobe.example.com/api/webhooks/casdoor`
102
102
  - 方法:`POST`
103
103
  - 内容类型:`application/json`
104
104
  - 协议头:`casdoor-secret`: `你的Webhook密钥`
@@ -49,7 +49,7 @@ If you are using Logto Cloud, assume its endpoint domain is `https://example.log
49
49
 
50
50
  Go to `Webhooks`, create a Webhook, and fill in the following fields:
51
51
 
52
- - Endpoint URL: `https://lobe.example.com/api/auth/webhooks/logto`
52
+ - Endpoint URL: `https://lobe.example.com/api/webhooks/logto`
53
53
  - Events: `User.Data.Updated`
54
54
 
55
55
  After successful creation, copy the Webhook's `Signing Key` and fill it in the `LOGTO_WEBHOOK_SIGNING_KEY` environment variable.
@@ -46,7 +46,7 @@ tags:
46
46
 
47
47
  前往 `Webhooks` ,创建一个 Webhook,填写以下字段:
48
48
 
49
- - 端点 URL: `https://lobe.example.com/api/auth/webhooks/logto`
49
+ - 端点 URL: `https://lobe.example.com/api/webhooks/logto`
50
50
  - 事件: `User.Data.Updated`
51
51
 
52
52
  创建成功后,复制 Webhook 的 `签名密钥`。填写到环境变量中的 `LOGTO_WEBHOOK_SIGNING_KEY`。
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lobehub/chat",
3
- "version": "1.55.4",
3
+ "version": "1.56.0",
4
4
  "description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
5
5
  "keywords": [
6
6
  "framework",
@@ -5,7 +5,7 @@ const branchName = process.env.VERCEL_GIT_COMMIT_REF || '';
5
5
 
6
6
  function shouldProceedBuild() {
7
7
  // Cancel the build for the lighthouse branch and for branches whose name starts with gru/
8
- if (branchName === 'lighthouse' || branchName.startsWith('testgru')) {
8
+ if (branchName === 'lighthouse' || branchName.startsWith('gru/')) {
9
9
  return false;
10
10
  }
11
11
 
@@ -1,19 +1,17 @@
1
1
  import { createEnv } from '@t3-oss/env-nextjs';
2
2
  import { z } from 'zod';
3
3
 
4
- export const getKnowledgeConfig = () => {
5
- return createEnv({
6
- runtimeEnv: {
7
- DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
8
- UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
9
- UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
10
- },
11
- server: {
12
- DEFAULT_FILES_CONFIG: z.string().optional(),
13
- UNSTRUCTURED_API_KEY: z.string().optional(),
14
- UNSTRUCTURED_SERVER_URL: z.string().optional(),
15
- },
16
- });
17
- };
18
-
19
- export const knowledgeEnv = getKnowledgeConfig();
4
/** Raw `process.env` values handed to `createEnv` for the knowledge settings. */
const knowledgeRuntimeEnv = {
  DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
  FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
  UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
  UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
};

/** Server-side schema: every knowledge setting is an optional string. */
const knowledgeServerSchema = {
  DEFAULT_FILES_CONFIG: z.string().optional(),
  FILE_TYPE_CHUNKING_RULES: z.string().optional(),
  UNSTRUCTURED_API_KEY: z.string().optional(),
  UNSTRUCTURED_SERVER_URL: z.string().optional(),
};

/**
 * Validated environment for knowledge-base file handling.
 *
 * Built once at module load from the four variables above.
 */
export const knowledgeEnv = createEnv({
  runtimeEnv: knowledgeRuntimeEnv,
  server: knowledgeServerSchema,
});
@@ -1,9 +1,13 @@
1
1
  import { ChunkingLoader } from 'src/libs/langchain';
2
2
  import { Strategy } from 'unstructured-client/sdk/models/shared';
3
3
 
4
- import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
4
+ import { knowledgeEnv } from '@/config/knowledge';
5
+ import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
5
6
  import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
6
7
 
8
+ import { ChunkingRuleParser } from './rules';
9
+ import type { ChunkingService } from './rules';
10
+
7
11
  export interface ChunkContentParams {
8
12
  content: Uint8Array;
9
13
  fileType: string;
@@ -19,23 +23,57 @@ interface ChunkResult {
19
23
  export class ContentChunk {
20
24
  private unstructuredClient: Unstructured;
21
25
  private langchainClient: ChunkingLoader;
26
+ private chunkingRules: Record<string, ChunkingService[]>;
22
27
 
23
28
  constructor() {
24
29
  this.unstructuredClient = new Unstructured();
25
30
  this.langchainClient = new ChunkingLoader();
31
+ this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
26
32
  }
27
33
 
28
- isUsingUnstructured(params: ChunkContentParams) {
29
- return params.fileType === 'application/pdf' && params.mode === 'hi-res';
34
+ private getChunkingServices(fileType: string): ChunkingService[] {
35
+ const ext = fileType.split('/').pop()?.toLowerCase() || '';
36
+ return this.chunkingRules[ext] || ['default'];
30
37
  }
31
38
 
32
39
  async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
33
- if (this.isUsingUnstructured(params))
34
- return await this.chunkByUnstructured(params.filename, params.content);
35
-
40
+ const services = this.getChunkingServices(params.fileType);
41
+
42
+ for (const service of services) {
43
+ try {
44
+ switch (service) {
45
+ case 'unstructured': {
46
+ if (this.canUseUnstructured()) {
47
+ return await this.chunkByUnstructured(params.filename, params.content);
48
+ }
49
+ break;
50
+ }
51
+
52
+ case 'doc2x': {
53
+ // Future implementation
54
+ break;
55
+ }
56
+
57
+ default: {
58
+ return await this.chunkByLangChain(params.filename, params.content);
59
+ }
60
+ }
61
+ } catch (error) {
62
+ // If this is the last service, throw the error
63
+ if (service === services.at(-1)) throw error;
64
+ // Otherwise continue to next service
65
+ console.error(`Chunking failed with service ${service}:`, error);
66
+ }
67
+ }
68
+
69
+ // Fallback to langchain if no service succeeded
36
70
  return await this.chunkByLangChain(params.filename, params.content);
37
71
  }
38
72
 
73
+ private canUseUnstructured(): boolean {
74
+ return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
75
+ }
76
+
39
77
  private chunkByUnstructured = async (
40
78
  filename: string,
41
79
  content: Uint8Array,
@@ -0,0 +1,81 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { ChunkingRuleParser } from './rules';
3
+
4
// Table-driven checks for ChunkingRuleParser.parse: each entry pairs a rule
// string with the exact rule map the parser is expected to produce.
describe('ChunkingRuleParser', () => {
  describe('parse', () => {
    const cases: Array<{ expected: Record<string, string[]>; input: string; title: string }> = [
      {
        expected: { pdf: ['unstructured', 'default'] },
        input: 'pdf=unstructured,default',
        title: 'should parse a single file type rule correctly',
      },
      {
        expected: {
          pdf: ['unstructured', 'default'],
          doc: ['doc2x', 'default'],
          txt: ['default'],
        },
        input: 'pdf=unstructured,default;doc=doc2x,default;txt=default',
        title: 'should parse multiple file type rules correctly',
      },
      {
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
        input: 'PDF=unstructured;DOC=doc2x',
        title: 'should convert file types to lowercase',
      },
      {
        expected: { pdf: ['unstructured', 'default'] },
        input: 'pdf=unstructured,invalid,default,wrongservice',
        title: 'should filter out invalid service names',
      },
      {
        expected: {},
        input: '',
        title: 'should handle empty string input',
      },
      {
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
        input: 'pdf=unstructured;invalid;doc=doc2x;=default;txt',
        title: 'should skip invalid rule formats',
      },
      {
        expected: { pdf: ['unstructured', 'default'], doc: ['doc2x'] },
        input: 'pdf= unstructured , default ;doc=doc2x',
        title: 'should handle whitespace in service names',
      },
      {
        // Duplicates are kept as written; the parser does not de-duplicate.
        expected: { pdf: ['unstructured', 'default', 'unstructured'] },
        input: 'pdf=unstructured,default,unstructured',
        title: 'should handle duplicate services for same file type',
      },
    ];

    for (const { expected, input, title } of cases) {
      it(title, () => {
        expect(ChunkingRuleParser.parse(input)).toEqual(expected);
      });
    }
  });
});
@@ -0,0 +1,23 @@
1
/** Name of a chunking backend that may appear in a chunking rule. */
export type ChunkingService = 'unstructured' | 'doc2x' | 'default';

/** Service names the parser accepts; any other name is dropped. */
const KNOWN_CHUNKING_SERVICES = new Set<string>(['unstructured', 'doc2x', 'default']);

/** Narrows an arbitrary string to a known chunking service name. */
const isChunkingService = (name: string): name is ChunkingService =>
  KNOWN_CHUNKING_SERVICES.has(name);

export const ChunkingRuleParser = {
  /**
   * Parses a chunking rule string into a map from file type to ordered services.
   *
   * Format: `type=service[,service...]` entries separated by `;`, for example
   * `pdf=unstructured,default;doc=doc2x`.
   *
   * - File types and service names are trimmed and lower-cased.
   * - Entries without `=`, with an empty file type, or with an empty service
   *   list are skipped.
   * - Unknown service names are dropped; order and duplicates are preserved.
   * - If the same file type appears more than once, the last entry wins.
   *
   * @param rulesStr - Raw rule string; an empty string yields an empty map.
   * @returns Map keyed by lower-cased file type.
   */
  parse(rulesStr: string): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Split by semicolon for different file types
    for (const rule of rulesStr.split(';')) {
      const [rawFileType, rawServices] = rule.split('=');

      // Trim before validating so "pdf=a; doc=b" yields the key "doc", not
      // " doc", and a whitespace-only file type is rejected.
      const fileType = rawFileType?.trim().toLowerCase();
      if (!fileType || !rawServices) continue;

      // Split services by comma and keep only the recognised ones
      rules[fileType] = rawServices
        .split(',')
        .map((name) => name.trim().toLowerCase())
        .filter(isChunkingService);
    }

    return rules;
  },
} as const;