plugin-document-parser 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/client.d.ts +2 -0
- package/client.js +1 -0
- package/dist/client/01b8a5798a872638.js +10 -0
- package/dist/client/022be20abc96fdb4.js +10 -0
- package/dist/client/12e97e7a84d900e0.js +10 -0
- package/dist/client/index.js +10 -0
- package/dist/externalVersion.js +20 -0
- package/dist/index.js +48 -0
- package/dist/locale/en-US.json +54 -0
- package/dist/locale/vi-VN.json +54 -0
- package/dist/node_modules/form-data/License +19 -0
- package/dist/node_modules/form-data/index.d.ts +62 -0
- package/dist/node_modules/form-data/lib/browser.js +4 -0
- package/dist/node_modules/form-data/lib/form_data.js +14 -0
- package/dist/node_modules/form-data/lib/populate.js +10 -0
- package/dist/node_modules/form-data/package.json +1 -0
- package/dist/server/collections/doc-parser-providers.js +137 -0
- package/dist/server/collections/doc-parser-settings.js +85 -0
- package/dist/server/index.js +51 -0
- package/dist/server/plugin.js +181 -0
- package/dist/server/resource/docParserProviders.js +91 -0
- package/dist/server/services/builtin-ai-handler.js +63 -0
- package/dist/server/services/external-ocr-client.js +189 -0
- package/dist/server/services/internal-parser-registry.js +82 -0
- package/dist/server/services/parse-router.js +273 -0
- package/package.json +33 -0
- package/server.d.ts +2 -0
- package/server.js +1 -0
- package/src/client/components/GlobalSettings.tsx +151 -0
- package/src/client/components/ProviderForm.tsx +266 -0
- package/src/client/components/ProviderList.tsx +193 -0
- package/src/client/components/SettingsPage.tsx +43 -0
- package/src/client/index.tsx +2 -0
- package/src/client/locale.ts +12 -0
- package/src/client/plugin.tsx +34 -0
- package/src/index.ts +2 -0
- package/src/locale/en-US.json +54 -0
- package/src/locale/vi-VN.json +54 -0
- package/src/server/collections/doc-parser-providers.ts +107 -0
- package/src/server/collections/doc-parser-settings.ts +59 -0
- package/src/server/index.ts +10 -0
- package/src/server/plugin.ts +172 -0
- package/src/server/resource/docParserProviders.ts +72 -0
- package/src/server/services/builtin-ai-handler.ts +49 -0
- package/src/server/services/external-ocr-client.ts +233 -0
- package/src/server/services/internal-parser-registry.ts +126 -0
- package/src/server/services/parse-router.ts +357 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { Plugin, lazy } from '@nocobase/client';
|
|
2
|
+
import { NAMESPACE } from './locale';
|
|
3
|
+
|
|
4
|
+
const SettingsPage = lazy(() => import('./components/SettingsPage'), 'SettingsPage');
|
|
5
|
+
|
|
6
|
+
/**
 * Client-side entry point of the document parser plugin.
 *
 * On load it registers this plugin's translation bundle for the active UI
 * language and adds the settings page to the plugin settings manager.
 */
export class PluginDocumentParserClient extends Plugin {
  /**
   * Re-applies the currently active i18n language after the plugin is added.
   */
  async afterAdd() {
    await this.app.i18n.changeLanguage(this.app.i18n.language);
  }

  /**
   * Loads translations and registers the settings page.
   *
   * Translation loading is best-effort: a failure is logged and the settings
   * page is still registered.
   */
  async load() {
    const locale = this.app.i18n.language || 'en-US';
    try {
      // A language without its own bundle falls back to the en-US strings,
      // which are then registered under the active language code.
      const messages = await import(`../locale/${locale}.json`).catch(
        () => import('../locale/en-US.json'),
      );
      this.app.i18n.addResourceBundle(locale, NAMESPACE, messages.default || messages, true, true);
    } catch (error) {
      // Reached only when the en-US fallback itself fails to load or the
      // bundle cannot be registered. Keep going, but leave a trace.
      console.warn(`[DocumentParser] Failed to register "${locale}" translations`, error);
    }

    // Register the settings page under Plugin Settings.
    this.app.pluginSettingsManager.add(NAMESPACE, {
      title: `{{t("Document Parser", { ns: "${NAMESPACE}" })}}`,
      icon: 'FileTextOutlined',
      Component: SettingsPage,
      aclSnippet: `pm.${NAMESPACE}.settings`,
    });
  }
}
|
|
33
|
+
|
|
34
|
+
export default PluginDocumentParserClient;
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"Document Parser": "Document Parser",
|
|
3
|
+
"Processing Mode": "Processing Mode",
|
|
4
|
+
"Default (plugin-ai built-in)": "Default (plugin-ai built-in)",
|
|
5
|
+
"Internal (built-in document loaders)": "Internal (built-in document loaders)",
|
|
6
|
+
"External (OCR API provider)": "External (OCR API provider)",
|
|
7
|
+
"Active Provider": "Active Provider",
|
|
8
|
+
"Fallback to default on error": "Fallback to default on error",
|
|
9
|
+
"Pass images through to default": "Pass images through to default",
|
|
10
|
+
"OCR Providers": "OCR Providers",
|
|
11
|
+
"Add Provider": "Add Provider",
|
|
12
|
+
"Edit Provider": "Edit Provider",
|
|
13
|
+
"Delete Provider": "Delete Provider",
|
|
14
|
+
"Test Connection": "Test Connection",
|
|
15
|
+
"Provider Title": "Provider Title",
|
|
16
|
+
"API Endpoint": "API Endpoint",
|
|
17
|
+
"API Key": "API Key",
|
|
18
|
+
"Auth Type": "Auth Type",
|
|
19
|
+
"Bearer Token": "Bearer Token",
|
|
20
|
+
"API Key Header": "API Key Header",
|
|
21
|
+
"Basic Auth": "Basic Auth",
|
|
22
|
+
"Custom Header": "Custom Header",
|
|
23
|
+
"Header Name": "Header Name",
|
|
24
|
+
"Username": "Username",
|
|
25
|
+
"Password": "Password",
|
|
26
|
+
"Custom Headers": "Custom Headers",
|
|
27
|
+
"Request Format": "Request Format",
|
|
28
|
+
"Multipart Form Data": "Multipart Form Data",
|
|
29
|
+
"JSON Base64": "JSON Base64",
|
|
30
|
+
"Form Field Name": "Form Field Name",
|
|
31
|
+
"Base64 Field Path": "Base64 Field Path",
|
|
32
|
+
"Filename Field Path": "Filename Field Path",
|
|
33
|
+
"Mimetype Field Path": "Mimetype Field Path",
|
|
34
|
+
"Extra Request Body": "Extra Request Body",
|
|
35
|
+
"Response Text Path": "Response Text Path",
|
|
36
|
+
"Timeout (ms)": "Timeout (ms)",
|
|
37
|
+
"Supported MIME Types": "Supported MIME Types",
|
|
38
|
+
"Leave empty to handle all non-image types": "Leave empty to handle all non-image types",
|
|
39
|
+
"Connection successful": "Connection successful",
|
|
40
|
+
"Connection failed": "Connection failed",
|
|
41
|
+
"Global Settings": "Global Settings",
|
|
42
|
+
"Provider Configuration": "Provider Configuration",
|
|
43
|
+
"Enabled": "Enabled",
|
|
44
|
+
"No providers configured": "No providers configured",
|
|
45
|
+
"Please select a provider": "Please select a provider",
|
|
46
|
+
"Settings saved": "Settings saved",
|
|
47
|
+
"Provider saved": "Provider saved",
|
|
48
|
+
"Provider deleted": "Provider deleted",
|
|
49
|
+
"mode_default_desc": "Use the built-in attachment processing from plugin-ai (default behavior).",
|
|
50
|
+
"mode_internal_desc": "Parse documents using built-in loaders (PDF, DOCX, PPT, TXT) or custom parsers registered by other plugins.",
|
|
51
|
+
"mode_external_desc": "Send files to a configured external OCR/parse API and use the returned text as attachment content.",
|
|
52
|
+
"Index with DocPixie (when available)": "Index with DocPixie (when available)",
|
|
53
|
+
"docpixie_mode_desc": "When plugin-docpixie is active, automatically index attached documents and instruct the AI to use the docpixie:query tool instead of reading raw file content. Applies before the processing mode above."
|
|
54
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"Document Parser": "Trình xử lý tài liệu",
|
|
3
|
+
"Processing Mode": "Chế độ xử lý",
|
|
4
|
+
"Default (plugin-ai built-in)": "Mặc định (plugin-ai tích hợp sẵn)",
|
|
5
|
+
"Internal (built-in document loaders)": "Nội bộ (trình đọc tài liệu tích hợp)",
|
|
6
|
+
"External (OCR API provider)": "Bên ngoài (nhà cung cấp OCR API)",
|
|
7
|
+
"Active Provider": "Nhà cung cấp đang dùng",
|
|
8
|
+
"Fallback to default on error": "Dùng mặc định khi lỗi",
|
|
9
|
+
"Pass images through to default": "Chuyển hình ảnh qua xử lý mặc định",
|
|
10
|
+
"OCR Providers": "Nhà cung cấp OCR",
|
|
11
|
+
"Add Provider": "Thêm nhà cung cấp",
|
|
12
|
+
"Edit Provider": "Chỉnh sửa nhà cung cấp",
|
|
13
|
+
"Delete Provider": "Xóa nhà cung cấp",
|
|
14
|
+
"Test Connection": "Kiểm tra kết nối",
|
|
15
|
+
"Provider Title": "Tên nhà cung cấp",
|
|
16
|
+
"API Endpoint": "Địa chỉ API",
|
|
17
|
+
"API Key": "Khóa API",
|
|
18
|
+
"Auth Type": "Loại xác thực",
|
|
19
|
+
"Bearer Token": "Bearer Token",
|
|
20
|
+
"API Key Header": "Header API Key",
|
|
21
|
+
"Basic Auth": "Basic Auth",
|
|
22
|
+
"Custom Header": "Header tùy chỉnh",
|
|
23
|
+
"Header Name": "Tên header",
|
|
24
|
+
"Username": "Tên đăng nhập",
|
|
25
|
+
"Password": "Mật khẩu",
|
|
26
|
+
"Custom Headers": "Headers tùy chỉnh",
|
|
27
|
+
"Request Format": "Định dạng yêu cầu",
|
|
28
|
+
"Multipart Form Data": "Multipart Form Data",
|
|
29
|
+
"JSON Base64": "JSON Base64",
|
|
30
|
+
"Form Field Name": "Tên trường form",
|
|
31
|
+
"Base64 Field Path": "Đường dẫn trường Base64",
|
|
32
|
+
"Filename Field Path": "Đường dẫn tên file",
|
|
33
|
+
"Mimetype Field Path": "Đường dẫn MIME type",
|
|
34
|
+
"Extra Request Body": "Body yêu cầu thêm",
|
|
35
|
+
"Response Text Path": "Đường dẫn text trong phản hồi",
|
|
36
|
+
"Timeout (ms)": "Timeout (ms)",
|
|
37
|
+
"Supported MIME Types": "MIME Types được hỗ trợ",
|
|
38
|
+
"Leave empty to handle all non-image types": "Để trống để xử lý tất cả loại không phải ảnh",
|
|
39
|
+
"Connection successful": "Kết nối thành công",
|
|
40
|
+
"Connection failed": "Kết nối thất bại",
|
|
41
|
+
"Global Settings": "Cài đặt chung",
|
|
42
|
+
"Provider Configuration": "Cấu hình nhà cung cấp",
|
|
43
|
+
"Enabled": "Kích hoạt",
|
|
44
|
+
"No providers configured": "Chưa có nhà cung cấp nào",
|
|
45
|
+
"Please select a provider": "Vui lòng chọn nhà cung cấp",
|
|
46
|
+
"Settings saved": "Đã lưu cài đặt",
|
|
47
|
+
"Provider saved": "Đã lưu nhà cung cấp",
|
|
48
|
+
"Provider deleted": "Đã xóa nhà cung cấp",
|
|
49
|
+
"Index with DocPixie (when available)": "Lập chỉ mục qua DocPixie (nếu có)",
|
|
50
|
+
"docpixie_mode_desc": "Khi plugin-docpixie đang hoạt động, tự động lập chỉ mục tài liệu đính kèm và hướng dẫn AI dùng công cụ docpixie:query thay vì đọc nội dung thô. Được áp dụng trước chế độ xử lý bên trên.",
|
|
51
|
+
"mode_default_desc": "Dùng chức năng xử lý tệp đính kèm mặc định của plugin-ai.",
|
|
52
|
+
"mode_internal_desc": "Phân tích tài liệu dùng trình đọc tích hợp (PDF, DOCX, PPT, TXT) hoặc các parser tùy chỉnh từ plugin khác.",
|
|
53
|
+
"mode_external_desc": "Gửi tệp đến API OCR/parse bên ngoài đã cấu hình và dùng văn bản trả về làm nội dung đính kèm."
|
|
54
|
+
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { defineCollection } from '@nocobase/database';
|
|
2
|
+
|
|
3
|
+
/**
 * External OCR/document-parse API provider configurations.
 *
 * Each record describes HOW to call one external service:
 *   - authentication  ('bearer' | 'api-key-header' | 'basic' | 'custom-headers' | 'none')
 *   - request format  ('multipart' | 'json-base64' | 'url')
 *   - response text extraction (dot-path into the JSON response)
 */

// Example shape of `authConfig`, serialized into the column comment.
const AUTH_CONFIG_EXAMPLE = JSON.stringify({
  headerName: 'X-Api-Key', // api-key-header
  username: '', // basic auth
  password: '', // basic auth
  customHeaders: {}, // custom-headers, e.g. { "X-Foo": "bar" }
});

// Example shape of `requestConfig`, serialized into the column comment.
const REQUEST_CONFIG_EXAMPLE = JSON.stringify({
  // -- multipart --
  fileFieldName: 'file',
  filenameFieldName: '', // optional extra form field carrying the filename
  mimetypeFieldName: '', // optional extra form field carrying the mimetype
  extraFields: {}, // additional form fields
  // -- json-base64 --
  base64FieldPath: 'file', // e.g. "document.content"
  filenameFieldPath: 'filename',
  mimetypeFieldPath: 'mimetype',
  extraBody: {},
  // -- url (send a download URL instead of the file bytes) --
  urlFieldPath: 'url',
});

export default defineCollection({
  name: 'docParserProviders',
  title: 'Document Parser Providers',
  fields: [
    { name: 'title', type: 'string' },
    { name: 'enabled', type: 'boolean', defaultValue: true },

    // Endpoint
    {
      name: 'apiEndpoint',
      type: 'string',
      comment: 'Full URL, e.g. https://ocr.example.com/v1/parse',
    },

    // Authentication
    {
      name: 'authType',
      type: 'string',
      defaultValue: 'bearer',
      comment: "'bearer' | 'api-key-header' | 'basic' | 'custom-headers' | 'none'",
    },
    {
      // Stored through the NocoBase `password` field type.
      name: 'apiKey',
      type: 'password',
      allowNull: true,
      comment: 'Used for bearer / api-key-header auth',
    },
    {
      name: 'authConfig',
      type: 'json',
      defaultValue: {},
      comment: AUTH_CONFIG_EXAMPLE,
    },

    // Request format
    {
      name: 'requestFormat',
      type: 'string',
      defaultValue: 'multipart',
      comment: "'multipart' | 'json-base64' | 'url'",
    },
    {
      name: 'requestConfig',
      type: 'json',
      defaultValue: {},
      comment: REQUEST_CONFIG_EXAMPLE,
    },

    // Response extraction
    {
      name: 'responseTextPath',
      type: 'string',
      defaultValue: 'text',
      comment: "Dot-path into the JSON response body, e.g. 'data.text' or 'result.pages[0].content'",
    },

    // Scope
    {
      name: 'supportedMimetypes',
      type: 'json',
      defaultValue: [],
      comment: 'Empty = handle everything routed to external. e.g. ["application/pdf"]',
    },
    {
      name: 'timeout',
      type: 'integer',
      defaultValue: 60000,
      comment: 'HTTP request timeout in milliseconds',
    },
    { name: 'options', type: 'json', defaultValue: {} },
  ],
});
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { defineCollection } from '@nocobase/database';
|
|
2
|
+
|
|
3
|
+
/**
 * Global settings of the document parser plugin.
 *
 * Used as a single-row configuration table: only one record is expected.
 */
export default defineCollection({
  name: 'docParserSettings',
  title: 'Document Parser Settings',
  fields: [
    {
      name: 'mode',
      type: 'string',
      defaultValue: 'default',
      comment: "'default' | 'internal' | 'external'",
    },
    {
      // Id of the docParserProviders record that is the active external provider.
      name: 'activeProviderId',
      type: 'bigInt',
      allowNull: true,
    },
    {
      // Fall back to the default provider logic when internal/external parsing fails.
      name: 'fallbackToDefault',
      type: 'boolean',
      defaultValue: true,
    },
    {
      // Pass images through to the default provider (they don't need OCR). On by default.
      name: 'imagePassThrough',
      type: 'boolean',
      defaultValue: true,
    },
    {
      // Optional restriction on the extnames this plugin handles.
      name: 'includedExtnames',
      type: 'json',
      defaultValue: [],
      comment: 'e.g. [".pdf", ".docx"] — empty means all non-image files',
    },
    {
      name: 'options',
      type: 'json',
      defaultValue: {},
    },
    {
      /**
       * When true and plugin-docpixie is active:
       *   - docpixie:processDocument is triggered (async indexing)
       *   - a metadata reference block is returned instead of the full text
       *   - the LLM is instructed to call the docpixie:query tool for retrieval
       */
      name: 'useDocpixie',
      type: 'boolean',
      defaultValue: false,
    },
  ],
});
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export { default } from './plugin';
|
|
2
|
+
export { PluginDocumentParserServer } from './plugin';
|
|
3
|
+
export { InternalParserRegistry } from './services/internal-parser-registry';
|
|
4
|
+
export type {
|
|
5
|
+
InternalParserHandler,
|
|
6
|
+
InternalParseResult,
|
|
7
|
+
AttachmentLike,
|
|
8
|
+
} from './services/internal-parser-registry';
|
|
9
|
+
export type { OcrProviderConfig, OcrAuthType, OcrRequestFormat } from './services/external-ocr-client';
|
|
10
|
+
export type { ParsedAttachmentResult } from './services/parse-router';
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { readFile } from 'fs/promises';
import { resolve, sep } from 'path';
import { Plugin } from '@nocobase/server';
import { Context } from '@nocobase/actions';
import axios from 'axios';
import { InternalParserRegistry } from './services/internal-parser-registry';
import { BuiltinAIDocumentHandler } from './services/builtin-ai-handler';
import { ParseRouter } from './services/parse-router';
import { testConnection, getSettings, saveSettings } from './resource/docParserProviders';
import type { AttachmentLike } from './services/internal-parser-registry';
|
|
10
|
+
|
|
11
|
+
/**
 * Server-side entry point of the document parser plugin.
 *
 * Wires the parse router, intercepts `parseAttachment` on the LLM provider
 * classes registered with plugin-ai, and exposes the settings/provider
 * resources.
 */
export class PluginDocumentParserServer extends Plugin {
  /**
   * Public registry — other plugins register their format handlers here:
   *
   *   const docParser = this.pm.get(PluginDocumentParserServer);
   *   docParser.internalParserRegistry.register({ name, supports, parse });
   */
  readonly internalParserRegistry = new InternalParserRegistry();

  /** Assigned in `load()`. */
  parseRouter!: ParseRouter;

  /** Classes produced by `wrapProviderClass`, tracked so they are never wrapped a second time. */
  private readonly wrappedProviderClasses = new WeakSet<object>();

  // ── Lifecycle ─────────────────────────────────────────────────────────────

  /**
   * Registers the built-in AI document handler. This runs in `beforeLoad` so
   * that handlers registered from other plugins' `load()` are added after it.
   */
  async beforeLoad() {
    this.internalParserRegistry.register(
      new BuiltinAIDocumentHandler(() => {
        // Resolved lazily at call time; undefined when plugin-ai is absent.
        const aiPlugin = this.pm.get('@nocobase/plugin-ai') as any;
        return aiPlugin?.documentLoaders;
      }),
    );
  }

  /**
   * Imports the collections, builds the parse router, patches the AI manager
   * and registers the resources and their ACL rules.
   */
  async load() {
    // 1. Load collections
    await this.importCollections(resolve(__dirname, 'collections'));

    // 2. Wire up the parse router
    this.parseRouter = new ParseRouter(
      () => this.db.getRepository('docParserSettings'),
      () => this.db.getRepository('docParserProviders'),
      this.internalParserRegistry,
      this.fetchFileBuffer.bind(this),
      () => {
        // Only hand out the docpixie plugin when it exposes a `service`.
        const docpixie = this.pm.get('@nocobase/plugin-docpixie') as any;
        return docpixie?.service ? docpixie : null;
      },
    );

    // 3. Patch AIManager to intercept parseAttachment on ALL providers
    this.wrapAIManager();

    // 4. Register resources
    this.app.resourceManager.define({
      name: 'docParserProviders',
      actions: {
        testConnection,
      },
    });

    this.app.resourceManager.define({
      name: 'docParserSettings',
      actions: {
        get: getSettings,
        save: saveSettings,
      },
    });

    // 5. ACL
    // NOTE(review): the 'loggedIn' condition grants these actions to every
    // authenticated user, not only administrators. That includes reading and
    // editing provider records, which hold API credentials. Consider
    // restricting them through an ACL snippet that matches the client settings
    // page's `aclSnippet` — confirm the snippet name before changing this.
    this.app.acl.allow('docParserProviders', ['list', 'create', 'update', 'destroy', 'get', 'testConnection'], 'loggedIn');
    this.app.acl.allow('docParserSettings', ['get', 'save'], 'loggedIn');
  }

  // ── AIManager patching ────────────────────────────────────────────────────

  /**
   * Wrap `AIManager.registerLLMProvider` so that every provider class —
   * including those registered AFTER this plugin loads — gets its
   * `parseAttachment` intercepted, then wrap the providers that are already
   * registered.
   *
   * Registry entries whose `provider` is not a constructor are passed through
   * unchanged instead of crashing the whole plugin load.
   */
  private wrapAIManager() {
    const aiPlugin = this.pm.get('@nocobase/plugin-ai') as any;
    if (!aiPlugin?.aiManager) {
      this.log.warn('[DocumentParser] plugin-ai not found — parseAttachment interception skipped');
      return;
    }

    const aiManager = aiPlugin.aiManager;

    const withWrappedProvider = (meta: any) =>
      typeof meta?.provider === 'function'
        ? { ...meta, provider: this.wrapProviderClass(meta.provider) }
        : meta;

    // Future registrations
    const originalRegister = aiManager.registerLLMProvider.bind(aiManager);
    aiManager.registerLLMProvider = (name: string, meta: any) =>
      originalRegister(name, withWrappedProvider(meta));

    // Providers that are already registered
    for (const [name, meta] of aiManager.llmProviders.entries()) {
      aiManager.llmProviders.set(name, withWrappedProvider(meta));
    }

    this.log.info(`[DocumentParser] Wrapped ${aiManager.llmProviders.size} LLM providers`);
  }

  /**
   * Create a subclass of `OriginalClass` that overrides `parseAttachment` to
   * go through the parse router first. `super.parseAttachment` is handed to
   * the router as the default parser, so providers that already override the
   * method keep their own behaviour as the fallback.
   *
   * A class that was itself produced by this method is returned as-is, so
   * repeated wrapping never nests router calls.
   */
  private wrapProviderClass(OriginalClass: new (...args: any[]) => any) {
    if (this.wrappedProviderClasses.has(OriginalClass)) {
      return OriginalClass;
    }

    // `this` inside the subclass method is the provider instance, so keep a
    // reference to the plugin for reaching the router.
    const plugin = this;
    const WrappedClass = class extends OriginalClass {
      async parseAttachment(ctx: Context, attachment: any) {
        return plugin.parseRouter.route(ctx, attachment, () =>
          super.parseAttachment(ctx, attachment),
        );
      }
    };

    this.wrappedProviderClasses.add(WrappedClass);
    return WrappedClass;
  }

  // ── File buffer helper ────────────────────────────────────────────────────

  /**
   * Fetch the raw bytes of an attachment using the file-manager plugin,
   * returning both the buffer and the resolved (URI-decoded) URL.
   *
   * http(s) URLs are downloaded, forwarding the incoming request's referer
   * and user-agent. Any other URL is treated as a path relative to the
   * process working directory, after stripping APP_PUBLIC_PATH.
   *
   * @throws Error when the file-manager plugin is unavailable, or when the
   *   resolved local path would leave the working directory.
   */
  private async fetchFileBuffer(
    ctx: Context,
    attachment: AttachmentLike,
  ): Promise<{ buffer: Buffer; url: string }> {
    const fileManager = this.app.pm.get('file-manager') as any;
    if (typeof fileManager?.getFileURL !== 'function') {
      throw new Error('[DocumentParser] file-manager plugin is not available — cannot resolve attachment URL');
    }

    const rawUrl: string = await fileManager.getFileURL(attachment);
    const url = decodeURIComponent(rawUrl);

    if (url.startsWith('http://') || url.startsWith('https://')) {
      const referer = ctx.get?.('referer') || '';
      const ua = ctx.get?.('user-agent') || '';
      const response = await axios.get(url, {
        responseType: 'arraybuffer',
        timeout: 60_000,
        headers: { referer, 'User-Agent': ua },
      });
      return { buffer: Buffer.from(response.data), url };
    }

    // Local file — strip APP_PUBLIC_PATH before joining
    let localPath = url;
    const appPublicPath = (process.env.APP_PUBLIC_PATH || '/').replace(/\/+$/, '');
    if (appPublicPath && localPath.startsWith(appPublicPath + '/')) {
      localPath = localPath.slice(appPublicPath.length);
    }

    // Reject paths that resolve outside the working directory (e.g. via "..").
    const storageRoot = resolve(process.cwd());
    const absPath = resolve(storageRoot, localPath.replace(/^\//, ''));
    if (absPath !== storageRoot && !absPath.startsWith(storageRoot + sep)) {
      throw new Error(`[DocumentParser] Attachment path escapes storage root: ${localPath}`);
    }

    const buffer = await readFile(absPath);
    return { buffer, url };
  }
}
|
|
171
|
+
|
|
172
|
+
export default PluginDocumentParserServer;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { Context, Next } from '@nocobase/actions';
|
|
2
|
+
import { testOcrProviderConnection } from '../services/external-ocr-client';
|
|
3
|
+
|
|
4
|
+
/**
 * Custom `testConnection` action of the docParserProviders resource.
 *
 * Looks up the provider identified by `filterByTk`, runs a connection test
 * against its configured endpoint and puts the outcome in `ctx.body`.
 * Responds with 404 when no such provider exists. The provider's timeout is
 * used for the test (10s when unset), capped at 15s.
 */
export async function testConnection(ctx: Context, next: Next) {
  const providerId = ctx.action.params.filterByTk;
  const provider = await ctx.db.getRepository('docParserProviders').findById(providerId);

  if (!provider) {
    ctx.throw(404, 'Provider not found');
    return;
  }

  const configuredTimeout = provider.get('timeout') ?? 10000;
  const connectionConfig = {
    apiEndpoint: provider.get('apiEndpoint'),
    authType: provider.get('authType'),
    apiKey: provider.get('apiKey'),
    authConfig: provider.get('authConfig') ?? {},
    timeout: Math.min(configuredTimeout, 15000),
  };

  ctx.body = await testOcrProviderConnection(connectionConfig);
  await next();
}
|
|
32
|
+
|
|
33
|
+
/**
 * `get` action of the docParserSettings resource.
 *
 * Responds with the global settings record. When none exists yet, one is
 * created with the default values and returned.
 */
export async function getSettings(ctx: Context, next: Next) {
  const settingsRepo = ctx.db.getRepository('docParserSettings');

  const settings =
    (await settingsRepo.findOne({})) ||
    (await settingsRepo.create({
      values: {
        mode: 'default',
        fallbackToDefault: true,
        imagePassThrough: true,
        includedExtnames: [],
      },
    }));

  ctx.body = settings;
  await next();
}
|
|
53
|
+
|
|
54
|
+
// Fields of the docParserSettings collection that a client is allowed to write.
const EDITABLE_SETTINGS_FIELDS = [
  'mode',
  'activeProviderId',
  'fallbackToDefault',
  'imagePassThrough',
  'includedExtnames',
  'options',
  'useDocpixie',
] as const;

/**
 * `save` action of the docParserSettings resource.
 *
 * Creates the global settings record, or updates the existing one, from the
 * request body, then asks the parse router to drop its cached settings.
 * Responds with the stored record.
 *
 * Only the known settings fields are taken from the body; anything else
 * (such as `id` or timestamps) is ignored rather than written to the
 * database. A missing or non-object body is treated as "no changes".
 */
export async function saveSettings(ctx: Context, next: Next) {
  const repo = ctx.db.getRepository('docParserSettings');

  const body: unknown = ctx.request.body;
  const submitted: Record<string, unknown> =
    body !== null && typeof body === 'object' && !Array.isArray(body)
      ? (body as Record<string, unknown>)
      : {};

  const values: Record<string, unknown> = {};
  for (const field of EDITABLE_SETTINGS_FIELDS) {
    if (Object.prototype.hasOwnProperty.call(submitted, field)) {
      values[field] = submitted[field];
    }
  }

  let record = await repo.findOne({});
  if (!record) {
    record = await repo.create({ values });
  } else {
    await repo.update({ filter: { id: record.get('id') }, values });
    // Re-read so the response reflects the stored state.
    record = await repo.findOne({});
  }

  // Invalidate the router's settings cache
  const plugin = ctx.app.pm.get('@nocobase/plugin-document-parser') as any;
  plugin?.parseRouter?.invalidateSettingsCache?.();

  ctx.body = record;
  await next();
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { Context } from '@nocobase/actions';
|
|
2
|
+
import type { InternalParserHandler, InternalParseResult, AttachmentLike } from './internal-parser-registry';
|
|
3
|
+
|
|
4
|
+
// Extnames that plugin-ai's CachedDocumentLoader natively handles
const AI_SUPPORTED_EXTNAMES = new Set(['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt']);

/**
 * Built-in internal parser handler that delegates to plugin-ai's
 * `documentLoaders.cached` loader.
 *
 * The document loaders are obtained through a getter on every call, so the
 * handler can be constructed before plugin-ai is available.
 */
export class BuiltinAIDocumentHandler implements InternalParserHandler {
  readonly name = 'builtin-ai-document-loader';

  constructor(
    /** Lazy getter — resolved at call time to avoid circular dep during init */
    private readonly getDocumentLoaders: () => { cached: { load(file: any): Promise<any> } },
  ) {}

  /**
   * @returns true when the attachment's extension is one of the supported
   *   document extensions.
   */
  supports(attachment: AttachmentLike): boolean {
    return AI_SUPPORTED_EXTNAMES.has(this.resolveExtname(attachment));
  }

  /**
   * Load the attachment through the cached document loader.
   *
   * @returns the extracted text with `handled: true`, or an empty text with
   *   `handled: false` when the loaders are unavailable, the loader returns
   *   nothing, or it reports the file as unsupported.
   */
  async parse(attachment: AttachmentLike, _ctx: Context): Promise<InternalParseResult> {
    const loaders = this.getDocumentLoaders();
    // The getter can yield nothing at runtime (e.g. plugin-ai not installed);
    // report "not handled" instead of failing with a TypeError.
    if (typeof loaders?.cached?.load !== 'function') {
      return { text: '', handled: false };
    }

    const result = await loaders.cached.load(attachment);
    if (!result?.supported) {
      return { text: '', handled: false };
    }

    return {
      text: result.text ?? '',
      handled: true,
    };
  }

  /**
   * Lower-cased extension including the leading dot, taken from `extname`
   * when present and otherwise from the last dot of `filename` / `name`.
   * Returns '' when no extension can be determined.
   */
  private resolveExtname(attachment: AttachmentLike): string {
    const explicit =
      typeof attachment.extname === 'string' ? attachment.extname.trim().toLowerCase() : '';
    if (explicit) {
      // Accept both ".pdf" and "pdf".
      return explicit.startsWith('.') ? explicit : `.${explicit}`;
    }

    const name = attachment.filename ?? attachment.name ?? '';
    const dotIndex = name.lastIndexOf('.');
    return dotIndex >= 0 ? name.slice(dotIndex).toLowerCase() : '';
  }
}
|