@lobehub/chat 1.55.3 → 1.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +50 -0
- package/changelog/v1.json +18 -0
- package/docker-compose/local/docker-compose.yml +16 -0
- package/docs/self-hosting/advanced/auth/next-auth/casdoor.mdx +1 -1
- package/docs/self-hosting/advanced/auth/next-auth/casdoor.zh-CN.mdx +1 -1
- package/docs/self-hosting/advanced/auth/next-auth/logto.mdx +1 -1
- package/docs/self-hosting/advanced/auth/next-auth/logto.zh-CN.mdx +1 -1
- package/package.json +3 -3
- package/scripts/vercelIgnoredBuildStep.js +1 -1
- package/src/config/knowledge.ts +14 -16
- package/src/features/ChatInput/ActionBar/Params/ParamsControls.tsx +12 -1
- package/src/features/ChatInput/ActionBar/Params/index.tsx +6 -1
- package/src/server/modules/ContentChunk/index.ts +44 -6
- package/src/server/modules/ContentChunk/rules.test.ts +81 -0
- package/src/server/modules/ContentChunk/rules.ts +23 -0
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,56 @@
|
|
2
2
|
|
3
3
|
# Changelog
|
4
4
|
|
5
|
+
## [Version 1.56.0](https://github.com/lobehub/lobe-chat/compare/v1.55.4...v1.56.0)
|
6
|
+
|
7
|
+
<sup>Released on **2025-02-15**</sup>
|
8
|
+
|
9
|
+
#### ✨ Features
|
10
|
+
|
11
|
+
- **misc**: Add configurable PDF processing method with Unstructured.
|
12
|
+
|
13
|
+
<br/>
|
14
|
+
|
15
|
+
<details>
|
16
|
+
<summary><kbd>Improvements and Fixes</kbd></summary>
|
17
|
+
|
18
|
+
#### What's improved
|
19
|
+
|
20
|
+
- **misc**: Add configurable PDF processing method with Unstructured, closes [#5927](https://github.com/lobehub/lobe-chat/issues/5927) ([35fa3ee](https://github.com/lobehub/lobe-chat/commit/35fa3ee))
|
21
|
+
|
22
|
+
</details>
|
23
|
+
|
24
|
+
<div align="right">
|
25
|
+
|
26
|
+
[](#readme-top)
|
27
|
+
|
28
|
+
</div>
|
29
|
+
|
30
|
+
### [Version 1.55.4](https://github.com/lobehub/lobe-chat/compare/v1.55.3...v1.55.4)
|
31
|
+
|
32
|
+
<sup>Released on **2025-02-15**</sup>
|
33
|
+
|
34
|
+
#### 💄 Styles
|
35
|
+
|
36
|
+
- **misc**: Improve mobile params style.
|
37
|
+
|
38
|
+
<br/>
|
39
|
+
|
40
|
+
<details>
|
41
|
+
<summary><kbd>Improvements and Fixes</kbd></summary>
|
42
|
+
|
43
|
+
#### Styles
|
44
|
+
|
45
|
+
- **misc**: Improve mobile params style, closes [#6176](https://github.com/lobehub/lobe-chat/issues/6176) ([b5276de](https://github.com/lobehub/lobe-chat/commit/b5276de))
|
46
|
+
|
47
|
+
</details>
|
48
|
+
|
49
|
+
<div align="right">
|
50
|
+
|
51
|
+
[](#readme-top)
|
52
|
+
|
53
|
+
</div>
|
54
|
+
|
5
55
|
### [Version 1.55.3](https://github.com/lobehub/lobe-chat/compare/v1.55.2...v1.55.3)
|
6
56
|
|
7
57
|
<sup>Released on **2025-02-15**</sup>
|
package/changelog/v1.json
CHANGED
@@ -1,4 +1,22 @@
|
|
1
1
|
[
|
2
|
+
{
|
3
|
+
"children": {
|
4
|
+
"features": [
|
5
|
+
"Add configurable PDF processing method with Unstructured."
|
6
|
+
]
|
7
|
+
},
|
8
|
+
"date": "2025-02-15",
|
9
|
+
"version": "1.56.0"
|
10
|
+
},
|
11
|
+
{
|
12
|
+
"children": {
|
13
|
+
"improvements": [
|
14
|
+
"Improve mobile params style."
|
15
|
+
]
|
16
|
+
},
|
17
|
+
"date": "2025-02-15",
|
18
|
+
"version": "1.55.4"
|
19
|
+
},
|
2
20
|
{
|
3
21
|
"children": {
|
4
22
|
"improvements": [
|
@@ -112,18 +112,34 @@ services:
|
|
112
112
|
echo '⚠️Warining: Unable to fetch OIDC configuration from Casdoor'
|
113
113
|
echo 'Request URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
|
114
114
|
echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
115
|
+
echo ''
|
116
|
+
echo '⚠️注意:无法从 Casdoor 获取 OIDC 配置'
|
117
|
+
echo '请求 URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
|
118
|
+
echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
119
|
+
echo ''
|
115
120
|
else
|
116
121
|
if ! wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep 'issuer' | grep ${AUTH_CASDOOR_ISSUER}; then
|
117
122
|
printf '❌Error: The Auth issuer is conflict, Issuer in OIDC configuration is: %s' \$(wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep -E 'issuer.*' | awk -F '\"' '{print \$4}')
|
118
123
|
echo ' , but the issuer in .env file is: ${AUTH_CASDOOR_ISSUER} '
|
119
124
|
echo 'Request URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
|
120
125
|
echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
126
|
+
echo ''
|
127
|
+
printf '❌错误:Auth 的 issuer 冲突,OIDC 配置中的 issuer 是:%s' \$(wget -O - --timeout=5 ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration 2>&1 | grep -E 'issuer.*' | awk -F '\"' '{print \$4}')
|
128
|
+
echo ' , 但 .env 文件中的 issuer 是:${AUTH_CASDOOR_ISSUER} '
|
129
|
+
echo '请求 URL: ${AUTH_CASDOOR_ISSUER}/.well-known/openid-configuration'
|
130
|
+
echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
131
|
+
echo ''
|
121
132
|
fi
|
122
133
|
fi
|
123
134
|
if [ $(wget --timeout=5 --spider --server-response ${S3_ENDPOINT}/minio/health/live 2>&1 | grep -c 'HTTP/1.1 200 OK') -eq 0 ]; then
|
124
135
|
echo '⚠️Warining: Unable to fetch MinIO health status'
|
125
136
|
echo 'Request URL: ${S3_ENDPOINT}/minio/health/live'
|
126
137
|
echo 'Read more at: https://lobehub.com/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
138
|
+
echo ''
|
139
|
+
echo '⚠️注意:无法获取 MinIO 健康状态'
|
140
|
+
echo '请求 URL: ${S3_ENDPOINT}/minio/health/live'
|
141
|
+
echo '了解更多:https://lobehub.com/zh/docs/self-hosting/server-database/docker-compose#necessary-configuration'
|
142
|
+
echo ''
|
127
143
|
fi
|
128
144
|
wait \$LOBE_PID
|
129
145
|
"
|
@@ -109,7 +109,7 @@ If you are deploying using a public network, the following assumptions apply:
|
|
109
109
|
|
110
110
|
Go to `Admin` -> `Webhooks`, add a webhook, and fill in the following fields:
|
111
111
|
|
112
|
-
- URL: `https://lobe.example.com/api/
|
112
|
+
- URL: `https://lobe.example.com/api/webhooks/casdoor`
|
113
113
|
- Method: `POST`
|
114
114
|
- Content Type: `application/json`
|
115
115
|
- Headers: `casdoor-secret`: `Your Webhook Secret`
|
@@ -98,7 +98,7 @@ tags:
|
|
98
98
|
|
99
99
|
前往 `管理工具` -> `Webhooks`,创建一个 Webhook,添加一个 Webhook,填写以下字段:
|
100
100
|
|
101
|
-
- 链接:`http://lobe.example.com/api/
|
101
|
+
- 链接:`http://lobe.example.com/api/webhooks/casdoor`
|
102
102
|
- 方法:`POST`
|
103
103
|
- 内容类型:`application/json`
|
104
104
|
- 协议头:`casdoor-secret`: `你的Webhook密钥`
|
@@ -49,7 +49,7 @@ If you are using Logto Cloud, assume its endpoint domain is `https://example.log
|
|
49
49
|
|
50
50
|
Go to `Webhooks`, create a Webhook, and fill in the following fields:
|
51
51
|
|
52
|
-
- Endpoint URL: `https://lobe.example.com/api/
|
52
|
+
- Endpoint URL: `https://lobe.example.com/api/webhooks/logto`
|
53
53
|
- Events: `User.Data.Updated`
|
54
54
|
|
55
55
|
After successful creation, copy the Webhook's `Signing Key` and fill it in the `LOGTO_WEBHOOK_SIGNING_KEY` environment variable.
|
@@ -46,7 +46,7 @@ tags:
|
|
46
46
|
|
47
47
|
前往 `Webhooks` ,创建一个 Webhook,填写以下字段:
|
48
48
|
|
49
|
-
- 端点 URL: `https://lobe.example.com/api/
|
49
|
+
- 端点 URL: `https://lobe.example.com/api/webhooks/logto`
|
50
50
|
- 事件: `User.Data.Updated`
|
51
51
|
|
52
52
|
创建成功后,复制 Webhook 的 `签名密钥`。填写到环境变量中的 `LOGTO_WEBHOOK_SIGNING_KEY`。
|
package/package.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
{
|
2
2
|
"name": "@lobehub/chat",
|
3
|
-
"version": "1.
|
3
|
+
"version": "1.56.0",
|
4
4
|
"description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
|
5
5
|
"keywords": [
|
6
6
|
"framework",
|
@@ -126,13 +126,13 @@
|
|
126
126
|
"@lobehub/chat-plugins-gateway": "^1.9.0",
|
127
127
|
"@lobehub/icons": "^1.69.0",
|
128
128
|
"@lobehub/tts": "^1.28.0",
|
129
|
-
"@lobehub/ui": "^1.164.
|
129
|
+
"@lobehub/ui": "^1.164.15",
|
130
130
|
"@neondatabase/serverless": "^0.10.4",
|
131
131
|
"@next/third-parties": "^15.1.4",
|
132
132
|
"@react-spring/web": "^9.7.5",
|
133
133
|
"@sentry/nextjs": "^7.120.2",
|
134
134
|
"@serwist/next": "^9.0.11",
|
135
|
-
"@t3-oss/env-nextjs": "^0.
|
135
|
+
"@t3-oss/env-nextjs": "^0.12.0",
|
136
136
|
"@tanstack/react-query": "^5.62.16",
|
137
137
|
"@trpc/client": "next",
|
138
138
|
"@trpc/next": "next",
|
@@ -5,7 +5,7 @@ const branchName = process.env.VERCEL_GIT_COMMIT_REF || '';
|
|
5
5
|
|
6
6
|
function shouldProceedBuild() {
|
7
7
|
// Cancel the build for the lighthouse branch and for any branch whose name starts with gru/
|
8
|
-
if (branchName === 'lighthouse' || branchName.startsWith('
|
8
|
+
if (branchName === 'lighthouse' || branchName.startsWith('gru/')) {
|
9
9
|
return false;
|
10
10
|
}
|
11
11
|
|
package/src/config/knowledge.ts
CHANGED
@@ -1,19 +1,17 @@
|
|
1
1
|
import { createEnv } from '@t3-oss/env-nextjs';
|
2
2
|
import { z } from 'zod';
|
3
3
|
|
4
|
-
export const
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
}
|
17
|
-
};
|
18
|
-
|
19
|
-
export const knowledgeEnv = getKnowledgeConfig();
|
4
|
+
export const knowledgeEnv = createEnv({
|
5
|
+
runtimeEnv: {
|
6
|
+
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
|
7
|
+
FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
|
8
|
+
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
|
9
|
+
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
|
10
|
+
},
|
11
|
+
server: {
|
12
|
+
DEFAULT_FILES_CONFIG: z.string().optional(),
|
13
|
+
FILE_TYPE_CHUNKING_RULES: z.string().optional(),
|
14
|
+
UNSTRUCTURED_API_KEY: z.string().optional(),
|
15
|
+
UNSTRUCTURED_SERVER_URL: z.string().optional(),
|
16
|
+
},
|
17
|
+
});
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import { Form, Tag } from '@lobehub/ui';
|
2
2
|
import type { FormItemProps } from '@lobehub/ui/es/Form/components/FormItem';
|
3
|
+
import { createStyles } from 'antd-style';
|
3
4
|
import isEqual from 'fast-deep-equal';
|
4
5
|
import { debounce } from 'lodash-es';
|
5
6
|
import { memo } from 'react';
|
@@ -16,10 +17,20 @@ import {
|
|
16
17
|
import { useAgentStore } from '@/store/agent';
|
17
18
|
import { agentSelectors } from '@/store/agent/selectors';
|
18
19
|
|
20
|
+
const useStyles = createStyles(({ css }) => ({
|
21
|
+
container: css`
|
22
|
+
.ant-form-group {
|
23
|
+
padding-inline: 0;
|
24
|
+
background: transparent;
|
25
|
+
}
|
26
|
+
`,
|
27
|
+
}));
|
28
|
+
|
19
29
|
interface ParamsControlsProps {
|
20
30
|
setUpdating: (updating: boolean) => void;
|
21
31
|
}
|
22
32
|
const ParamsControls = memo<ParamsControlsProps>(({ setUpdating }) => {
|
33
|
+
const { styles } = useStyles();
|
23
34
|
const { t } = useTranslation('setting');
|
24
35
|
|
25
36
|
const updateAgentConfig = useAgentStore((s) => s.updateAgentConfig);
|
@@ -75,13 +86,13 @@ const ParamsControls = memo<ParamsControlsProps>(({ setUpdating }) => {
|
|
75
86
|
|
76
87
|
return (
|
77
88
|
<Form
|
89
|
+
className={styles.container}
|
78
90
|
initialValues={config}
|
79
91
|
itemMinWidth={200}
|
80
92
|
items={items}
|
81
93
|
itemsType={'flat'}
|
82
94
|
onValuesChange={debounce(async (values) => {
|
83
95
|
setUpdating(true);
|
84
|
-
console.log(values);
|
85
96
|
await updateAgentConfig(values);
|
86
97
|
setUpdating(false);
|
87
98
|
}, 500)}
|
@@ -7,6 +7,7 @@ import { useTranslation } from 'react-i18next';
|
|
7
7
|
import { Flexbox } from 'react-layout-kit';
|
8
8
|
|
9
9
|
import UpdateLoading from '@/components/Loading/UpdateLoading';
|
10
|
+
import { useIsMobile } from '@/hooks/useIsMobile';
|
10
11
|
|
11
12
|
import ParamsControls from './ParamsControls';
|
12
13
|
|
@@ -16,6 +17,7 @@ const Params = memo(() => {
|
|
16
17
|
const [isUpdating, setUpdating] = useState(false);
|
17
18
|
|
18
19
|
const theme = useTheme();
|
20
|
+
const isMobile = useIsMobile();
|
19
21
|
return (
|
20
22
|
<Popover
|
21
23
|
arrow={false}
|
@@ -24,7 +26,10 @@ const Params = memo(() => {
|
|
24
26
|
open={popoverOpen}
|
25
27
|
placement={'top'}
|
26
28
|
styles={{
|
27
|
-
body: {
|
29
|
+
body: {
|
30
|
+
minWidth: isMobile ? undefined : 400,
|
31
|
+
width: isMobile ? '100vw' : undefined,
|
32
|
+
},
|
28
33
|
}}
|
29
34
|
title={
|
30
35
|
<Flexbox horizontal justify={'space-between'}>
|
@@ -1,9 +1,13 @@
|
|
1
1
|
import { ChunkingLoader } from 'src/libs/langchain';
|
2
2
|
import { Strategy } from 'unstructured-client/sdk/models/shared';
|
3
3
|
|
4
|
-
import {
|
4
|
+
import { knowledgeEnv } from '@/config/knowledge';
|
5
|
+
import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
|
5
6
|
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
|
6
7
|
|
8
|
+
import { ChunkingRuleParser } from './rules';
|
9
|
+
import type { ChunkingService } from './rules';
|
10
|
+
|
7
11
|
export interface ChunkContentParams {
|
8
12
|
content: Uint8Array;
|
9
13
|
fileType: string;
|
@@ -19,23 +23,57 @@ interface ChunkResult {
|
|
19
23
|
export class ContentChunk {
|
20
24
|
private unstructuredClient: Unstructured;
|
21
25
|
private langchainClient: ChunkingLoader;
|
26
|
+
private chunkingRules: Record<string, ChunkingService[]>;
|
22
27
|
|
23
28
|
constructor() {
|
24
29
|
this.unstructuredClient = new Unstructured();
|
25
30
|
this.langchainClient = new ChunkingLoader();
|
31
|
+
this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
|
26
32
|
}
|
27
33
|
|
28
|
-
|
29
|
-
|
34
|
+
private getChunkingServices(fileType: string): ChunkingService[] {
|
35
|
+
const ext = fileType.split('/').pop()?.toLowerCase() || '';
|
36
|
+
return this.chunkingRules[ext] || ['default'];
|
30
37
|
}
|
31
38
|
|
32
39
|
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
|
33
|
-
|
34
|
-
|
35
|
-
|
40
|
+
const services = this.getChunkingServices(params.fileType);
|
41
|
+
|
42
|
+
for (const service of services) {
|
43
|
+
try {
|
44
|
+
switch (service) {
|
45
|
+
case 'unstructured': {
|
46
|
+
if (this.canUseUnstructured()) {
|
47
|
+
return await this.chunkByUnstructured(params.filename, params.content);
|
48
|
+
}
|
49
|
+
break;
|
50
|
+
}
|
51
|
+
|
52
|
+
case 'doc2x': {
|
53
|
+
// Future implementation
|
54
|
+
break;
|
55
|
+
}
|
56
|
+
|
57
|
+
default: {
|
58
|
+
return await this.chunkByLangChain(params.filename, params.content);
|
59
|
+
}
|
60
|
+
}
|
61
|
+
} catch (error) {
|
62
|
+
// If this is the last service, throw the error
|
63
|
+
if (service === services.at(-1)) throw error;
|
64
|
+
// Otherwise continue to next service
|
65
|
+
console.error(`Chunking failed with service ${service}:`, error);
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
// Fallback to langchain if no service succeeded
|
36
70
|
return await this.chunkByLangChain(params.filename, params.content);
|
37
71
|
}
|
38
72
|
|
73
|
+
private canUseUnstructured(): boolean {
|
74
|
+
return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
|
75
|
+
}
|
76
|
+
|
39
77
|
private chunkByUnstructured = async (
|
40
78
|
filename: string,
|
41
79
|
content: Uint8Array,
|
@@ -0,0 +1,81 @@
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
2
|
+
import { ChunkingRuleParser } from './rules';
|
3
|
+
|
4
|
+
describe('ChunkingRuleParser', () => {
|
5
|
+
describe('parse', () => {
|
6
|
+
it('should parse a single file type rule correctly', () => {
|
7
|
+
const input = 'pdf=unstructured,default';
|
8
|
+
const result = ChunkingRuleParser.parse(input);
|
9
|
+
|
10
|
+
expect(result).toEqual({
|
11
|
+
pdf: ['unstructured', 'default'],
|
12
|
+
});
|
13
|
+
});
|
14
|
+
|
15
|
+
it('should parse multiple file type rules correctly', () => {
|
16
|
+
const input = 'pdf=unstructured,default;doc=doc2x,default;txt=default';
|
17
|
+
const result = ChunkingRuleParser.parse(input);
|
18
|
+
|
19
|
+
expect(result).toEqual({
|
20
|
+
pdf: ['unstructured', 'default'],
|
21
|
+
doc: ['doc2x', 'default'],
|
22
|
+
txt: ['default'],
|
23
|
+
});
|
24
|
+
});
|
25
|
+
|
26
|
+
it('should convert file types to lowercase', () => {
|
27
|
+
const input = 'PDF=unstructured;DOC=doc2x';
|
28
|
+
const result = ChunkingRuleParser.parse(input);
|
29
|
+
|
30
|
+
expect(result).toEqual({
|
31
|
+
pdf: ['unstructured'],
|
32
|
+
doc: ['doc2x'],
|
33
|
+
});
|
34
|
+
});
|
35
|
+
|
36
|
+
it('should filter out invalid service names', () => {
|
37
|
+
const input = 'pdf=unstructured,invalid,default,wrongservice';
|
38
|
+
const result = ChunkingRuleParser.parse(input);
|
39
|
+
|
40
|
+
expect(result).toEqual({
|
41
|
+
pdf: ['unstructured', 'default'],
|
42
|
+
});
|
43
|
+
});
|
44
|
+
|
45
|
+
it('should handle empty string input', () => {
|
46
|
+
const input = '';
|
47
|
+
const result = ChunkingRuleParser.parse(input);
|
48
|
+
|
49
|
+
expect(result).toEqual({});
|
50
|
+
});
|
51
|
+
|
52
|
+
it('should skip invalid rule formats', () => {
|
53
|
+
const input = 'pdf=unstructured;invalid;doc=doc2x;=default;txt';
|
54
|
+
const result = ChunkingRuleParser.parse(input);
|
55
|
+
|
56
|
+
expect(result).toEqual({
|
57
|
+
pdf: ['unstructured'],
|
58
|
+
doc: ['doc2x'],
|
59
|
+
});
|
60
|
+
});
|
61
|
+
|
62
|
+
it('should handle whitespace in service names', () => {
|
63
|
+
const input = 'pdf= unstructured , default ;doc=doc2x';
|
64
|
+
const result = ChunkingRuleParser.parse(input);
|
65
|
+
|
66
|
+
expect(result).toEqual({
|
67
|
+
pdf: ['unstructured', 'default'],
|
68
|
+
doc: ['doc2x'],
|
69
|
+
});
|
70
|
+
});
|
71
|
+
|
72
|
+
it('should handle duplicate services for same file type', () => {
|
73
|
+
const input = 'pdf=unstructured,default,unstructured';
|
74
|
+
const result = ChunkingRuleParser.parse(input);
|
75
|
+
|
76
|
+
expect(result).toEqual({
|
77
|
+
pdf: ['unstructured', 'default', 'unstructured'],
|
78
|
+
});
|
79
|
+
});
|
80
|
+
});
|
81
|
+
});
|
@@ -0,0 +1,23 @@
|
|
1
|
+
export type ChunkingService = 'unstructured' | 'doc2x' | 'default';
|
2
|
+
|
3
|
+
export const ChunkingRuleParser = {
|
4
|
+
parse(rulesStr: string): Record<string, ChunkingService[]> {
|
5
|
+
const rules: Record<string, ChunkingService[]> = {};
|
6
|
+
|
7
|
+
// Split by semicolon for different file types
|
8
|
+
const fileTypeRules = rulesStr.split(';');
|
9
|
+
|
10
|
+
for (const rule of fileTypeRules) {
|
11
|
+
const [fileType, services] = rule.split('=');
|
12
|
+
if (!fileType || !services) continue;
|
13
|
+
|
14
|
+
// Split services by comma and validate each service
|
15
|
+
rules[fileType.toLowerCase()] = services
|
16
|
+
.split(',')
|
17
|
+
.map((s) => s.trim().toLowerCase())
|
18
|
+
.filter((s): s is ChunkingService => ['unstructured', 'doc2x', 'default'].includes(s));
|
19
|
+
}
|
20
|
+
|
21
|
+
return rules;
|
22
|
+
},
|
23
|
+
} as const;
|