plugin-document-parser 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/client.d.ts +2 -0
  2. package/client.js +1 -0
  3. package/dist/client/01b8a5798a872638.js +10 -0
  4. package/dist/client/022be20abc96fdb4.js +10 -0
  5. package/dist/client/12e97e7a84d900e0.js +10 -0
  6. package/dist/client/index.js +10 -0
  7. package/dist/externalVersion.js +20 -0
  8. package/dist/index.js +48 -0
  9. package/dist/locale/en-US.json +54 -0
  10. package/dist/locale/vi-VN.json +54 -0
  11. package/dist/node_modules/form-data/License +19 -0
  12. package/dist/node_modules/form-data/index.d.ts +62 -0
  13. package/dist/node_modules/form-data/lib/browser.js +4 -0
  14. package/dist/node_modules/form-data/lib/form_data.js +14 -0
  15. package/dist/node_modules/form-data/lib/populate.js +10 -0
  16. package/dist/node_modules/form-data/package.json +1 -0
  17. package/dist/server/collections/doc-parser-providers.js +137 -0
  18. package/dist/server/collections/doc-parser-settings.js +85 -0
  19. package/dist/server/index.js +51 -0
  20. package/dist/server/plugin.js +181 -0
  21. package/dist/server/resource/docParserProviders.js +91 -0
  22. package/dist/server/services/builtin-ai-handler.js +63 -0
  23. package/dist/server/services/external-ocr-client.js +189 -0
  24. package/dist/server/services/internal-parser-registry.js +82 -0
  25. package/dist/server/services/parse-router.js +273 -0
  26. package/package.json +33 -0
  27. package/server.d.ts +2 -0
  28. package/server.js +1 -0
  29. package/src/client/components/GlobalSettings.tsx +151 -0
  30. package/src/client/components/ProviderForm.tsx +266 -0
  31. package/src/client/components/ProviderList.tsx +193 -0
  32. package/src/client/components/SettingsPage.tsx +43 -0
  33. package/src/client/index.tsx +2 -0
  34. package/src/client/locale.ts +12 -0
  35. package/src/client/plugin.tsx +34 -0
  36. package/src/index.ts +2 -0
  37. package/src/locale/en-US.json +54 -0
  38. package/src/locale/vi-VN.json +54 -0
  39. package/src/server/collections/doc-parser-providers.ts +107 -0
  40. package/src/server/collections/doc-parser-settings.ts +59 -0
  41. package/src/server/index.ts +10 -0
  42. package/src/server/plugin.ts +172 -0
  43. package/src/server/resource/docParserProviders.ts +72 -0
  44. package/src/server/services/builtin-ai-handler.ts +49 -0
  45. package/src/server/services/external-ocr-client.ts +233 -0
  46. package/src/server/services/internal-parser-registry.ts +126 -0
  47. package/src/server/services/parse-router.ts +357 -0
@@ -0,0 +1,34 @@
1
+ import { Plugin, lazy } from '@nocobase/client';
2
+ import { NAMESPACE } from './locale';
3
+
4
+ const SettingsPage = lazy(() => import('./components/SettingsPage'), 'SettingsPage');
5
+
6
/**
 * Client-side entry point of the document parser plugin.
 *
 * On load it adds this plugin's translation bundle for the active UI language
 * and registers the settings page with the plugin settings manager.
 */
export class PluginDocumentParserClient extends Plugin {
  /** Re-applies the language that i18n currently reports. */
  async afterAdd() {
    const { i18n } = this.app;
    await i18n.changeLanguage(i18n.language);
  }

  /** Loads the locale bundle (falling back to en-US) and registers the settings page. */
  async load() {
    const activeLocale = this.app.i18n.language || 'en-US';

    try {
      // A missing bundle for the active language falls back to the English one,
      // which is then registered under the active language code.
      const loaded = await import(`../locale/${activeLocale}.json`).catch(() => import('../locale/en-US.json'));
      const bundle = loaded.default || loaded;
      this.app.i18n.addResourceBundle(activeLocale, NAMESPACE, bundle, true, true);
    } catch {
      // Best effort: without a bundle the untranslated keys are shown.
    }

    // Settings page entry under Plugin Settings.
    const settingsEntry = {
      title: `{{t("Document Parser", { ns: "${NAMESPACE}" })}}`,
      icon: 'FileTextOutlined',
      Component: SettingsPage,
      aclSnippet: `pm.${NAMESPACE}.settings`,
    };
    this.app.pluginSettingsManager.add(NAMESPACE, settingsEntry);
  }
}
33
+
34
+ export default PluginDocumentParserClient;
package/src/index.ts ADDED
@@ -0,0 +1,2 @@
1
+ export * from './server';
2
+ export { default } from './server';
@@ -0,0 +1,54 @@
1
+ {
2
+ "Document Parser": "Document Parser",
3
+ "Processing Mode": "Processing Mode",
4
+ "Default (plugin-ai built-in)": "Default (plugin-ai built-in)",
5
+ "Internal (built-in document loaders)": "Internal (built-in document loaders)",
6
+ "External (OCR API provider)": "External (OCR API provider)",
7
+ "Active Provider": "Active Provider",
8
+ "Fallback to default on error": "Fallback to default on error",
9
+ "Pass images through to default": "Pass images through to default",
10
+ "OCR Providers": "OCR Providers",
11
+ "Add Provider": "Add Provider",
12
+ "Edit Provider": "Edit Provider",
13
+ "Delete Provider": "Delete Provider",
14
+ "Test Connection": "Test Connection",
15
+ "Provider Title": "Provider Title",
16
+ "API Endpoint": "API Endpoint",
17
+ "API Key": "API Key",
18
+ "Auth Type": "Auth Type",
19
+ "Bearer Token": "Bearer Token",
20
+ "API Key Header": "API Key Header",
21
+ "Basic Auth": "Basic Auth",
22
+ "Custom Header": "Custom Header",
23
+ "Header Name": "Header Name",
24
+ "Username": "Username",
25
+ "Password": "Password",
26
+ "Custom Headers": "Custom Headers",
27
+ "Request Format": "Request Format",
28
+ "Multipart Form Data": "Multipart Form Data",
29
+ "JSON Base64": "JSON Base64",
30
+ "Form Field Name": "Form Field Name",
31
+ "Base64 Field Path": "Base64 Field Path",
32
+ "Filename Field Path": "Filename Field Path",
33
+ "Mimetype Field Path": "Mimetype Field Path",
34
+ "Extra Request Body": "Extra Request Body",
35
+ "Response Text Path": "Response Text Path",
36
+ "Timeout (ms)": "Timeout (ms)",
37
+ "Supported MIME Types": "Supported MIME Types",
38
+ "Leave empty to handle all non-image types": "Leave empty to handle all non-image types",
39
+ "Connection successful": "Connection successful",
40
+ "Connection failed": "Connection failed",
41
+ "Global Settings": "Global Settings",
42
+ "Provider Configuration": "Provider Configuration",
43
+ "Enabled": "Enabled",
44
+ "No providers configured": "No providers configured",
45
+ "Please select a provider": "Please select a provider",
46
+ "Settings saved": "Settings saved",
47
+ "Provider saved": "Provider saved",
48
+ "Provider deleted": "Provider deleted",
49
+ "mode_default_desc": "Use the built-in attachment processing from plugin-ai (default behavior).",
50
+ "mode_internal_desc": "Parse documents using built-in loaders (PDF, DOCX, PPT, TXT) or custom parsers registered by other plugins.",
51
+ "mode_external_desc": "Send files to a configured external OCR/parse API and use the returned text as attachment content.",
52
+ "Index with DocPixie (when available)": "Index with DocPixie (when available)",
53
+ "docpixie_mode_desc": "When plugin-docpixie is active, automatically index attached documents and instruct the AI to use the docpixie:query tool instead of reading raw file content. Applies before the processing mode above."
54
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "Document Parser": "Trình xử lý tài liệu",
3
+ "Processing Mode": "Chế độ xử lý",
4
+ "Default (plugin-ai built-in)": "Mặc định (plugin-ai tích hợp sẵn)",
5
+ "Internal (built-in document loaders)": "Nội bộ (trình đọc tài liệu tích hợp)",
6
+ "External (OCR API provider)": "Bên ngoài (nhà cung cấp OCR API)",
7
+ "Active Provider": "Nhà cung cấp đang dùng",
8
+ "Fallback to default on error": "Dùng mặc định khi lỗi",
9
+ "Pass images through to default": "Chuyển hình ảnh qua xử lý mặc định",
10
+ "OCR Providers": "Nhà cung cấp OCR",
11
+ "Add Provider": "Thêm nhà cung cấp",
12
+ "Edit Provider": "Chỉnh sửa nhà cung cấp",
13
+ "Delete Provider": "Xóa nhà cung cấp",
14
+ "Test Connection": "Kiểm tra kết nối",
15
+ "Provider Title": "Tên nhà cung cấp",
16
+ "API Endpoint": "Địa chỉ API",
17
+ "API Key": "Khóa API",
18
+ "Auth Type": "Loại xác thực",
19
+ "Bearer Token": "Bearer Token",
20
+ "API Key Header": "Header API Key",
21
+ "Basic Auth": "Basic Auth",
22
+ "Custom Header": "Header tùy chỉnh",
23
+ "Header Name": "Tên header",
24
+ "Username": "Tên đăng nhập",
25
+ "Password": "Mật khẩu",
26
+ "Custom Headers": "Headers tùy chỉnh",
27
+ "Request Format": "Định dạng yêu cầu",
28
+ "Multipart Form Data": "Multipart Form Data",
29
+ "JSON Base64": "JSON Base64",
30
+ "Form Field Name": "Tên trường form",
31
+ "Base64 Field Path": "Đường dẫn trường Base64",
32
+ "Filename Field Path": "Đường dẫn tên file",
33
+ "Mimetype Field Path": "Đường dẫn MIME type",
34
+ "Extra Request Body": "Body yêu cầu thêm",
35
+ "Response Text Path": "Đường dẫn text trong phản hồi",
36
+ "Timeout (ms)": "Timeout (ms)",
37
+ "Supported MIME Types": "MIME Types được hỗ trợ",
38
+ "Leave empty to handle all non-image types": "Để trống để xử lý tất cả loại không phải ảnh",
39
+ "Connection successful": "Kết nối thành công",
40
+ "Connection failed": "Kết nối thất bại",
41
+ "Global Settings": "Cài đặt chung",
42
+ "Provider Configuration": "Cấu hình nhà cung cấp",
43
+ "Enabled": "Kích hoạt",
44
+ "No providers configured": "Chưa có nhà cung cấp nào",
45
+ "Please select a provider": "Vui lòng chọn nhà cung cấp",
46
+ "Settings saved": "Đã lưu cài đặt",
47
+ "Provider saved": "Đã lưu nhà cung cấp",
48
+ "Provider deleted": "Đã xóa nhà cung cấp",
49
+ "Index with DocPixie (when available)": "Lập chỉ mục qua DocPixie (nếu có)",
50
+ "docpixie_mode_desc": "Khi plugin-docpixie đang hoạt động, tự động lập chỉ mục tài liệu đính kèm và hướng dẫn AI dùng công cụ docpixie:query thay vì đọc nội dung thô. Được áp dụng trước chế độ xử lý bên trên.",
51
+ "mode_default_desc": "Dùng chức năng xử lý tệp đính kèm mặc định của plugin-ai.",
52
+ "mode_internal_desc": "Phân tích tài liệu dùng trình đọc tích hợp (PDF, DOCX, PPT, TXT) hoặc các parser tùy chỉnh từ plugin khác.",
53
+ "mode_external_desc": "Gửi tệp đến API OCR/parse bên ngoài đã cấu hình và dùng văn bản trả về làm nội dung đính kèm."
54
+ }
@@ -0,0 +1,107 @@
1
+ import { defineCollection } from '@nocobase/database';
2
+
3
/**
 * Collection of external OCR / document-parse API providers.
 *
 * One row describes how a single external service is called:
 *   - how to authenticate (`authType`, `apiKey`, `authConfig`)
 *   - how the file is sent (`requestFormat`, `requestConfig`)
 *   - where the extracted text sits in the JSON response (`responseTextPath`)
 */

// Example shape of `authConfig`; serialised into the column comment below.
const authConfigExample = JSON.stringify({
  headerName: 'X-Api-Key', // api-key-header
  username: '', // basic auth
  password: '', // basic auth
  customHeaders: {}, // custom-headers, e.g. { "X-Foo": "bar" }
});

// Example shape of `requestConfig`; serialised into the column comment below.
const requestConfigExample = JSON.stringify({
  // requestFormat = multipart
  fileFieldName: 'file',
  filenameFieldName: '', // optional extra form field carrying the filename
  mimetypeFieldName: '', // optional extra form field carrying the mimetype
  extraFields: {}, // additional form fields
  // requestFormat = json-base64
  base64FieldPath: 'file', // e.g. "document.content"
  filenameFieldPath: 'filename',
  mimetypeFieldPath: 'mimetype',
  extraBody: {},
  // requestFormat = url (a download URL is sent instead of the file bytes)
  urlFieldPath: 'url',
});

export default defineCollection({
  name: 'docParserProviders',
  title: 'Document Parser Providers',
  fields: [
    { name: 'title', type: 'string' },
    { name: 'enabled', type: 'boolean', defaultValue: true },

    // Endpoint
    {
      name: 'apiEndpoint',
      type: 'string',
      comment: 'Full URL, e.g. https://ocr.example.com/v1/parse',
    },

    // Authentication
    {
      name: 'authType',
      type: 'string',
      defaultValue: 'bearer',
      comment: "'bearer' | 'api-key-header' | 'basic' | 'custom-headers' | 'none'",
    },
    {
      // Stored through NocoBase's `password` field type.
      // NOTE(review): that field type is believed to hash values one-way on save;
      // confirm the key can still be read back for outgoing requests.
      name: 'apiKey',
      type: 'password',
      allowNull: true,
      comment: 'Used for bearer / api-key-header auth',
    },
    {
      name: 'authConfig',
      type: 'json',
      defaultValue: {},
      comment: authConfigExample,
    },

    // Request format
    {
      name: 'requestFormat',
      type: 'string',
      defaultValue: 'multipart',
      comment: "'multipart' | 'json-base64' | 'url'",
    },
    {
      name: 'requestConfig',
      type: 'json',
      defaultValue: {},
      comment: requestConfigExample,
    },

    // Response extraction
    {
      name: 'responseTextPath',
      type: 'string',
      defaultValue: 'text',
      comment: "Dot-path into the JSON response body, e.g. 'data.text' or 'result.pages[0].content'",
    },

    // Scope
    {
      name: 'supportedMimetypes',
      type: 'json',
      defaultValue: [],
      comment: 'Empty = handle everything routed to external. e.g. ["application/pdf"]',
    },
    {
      name: 'timeout',
      type: 'integer',
      defaultValue: 60000,
      comment: 'HTTP request timeout in milliseconds',
    },
    { name: 'options', type: 'json', defaultValue: {} },
  ],
});
@@ -0,0 +1,59 @@
1
+ import { defineCollection } from '@nocobase/database';
2
+
3
/**
 * Collection holding the plugin-wide settings of the document parser.
 * Intended to contain a single row.
 */
export default defineCollection({
  name: 'docParserSettings',
  title: 'Document Parser Settings',
  fields: [
    {
      name: 'mode',
      type: 'string',
      defaultValue: 'default',
      comment: "'default' | 'internal' | 'external'",
    },

    // Id of the docParserProviders row used when an external provider is active.
    { name: 'activeProviderId', type: 'bigInt', allowNull: true },

    // Fall back to the default provider logic when internal/external parsing fails.
    { name: 'fallbackToDefault', type: 'boolean', defaultValue: true },

    // Hand images to the default provider instead of parsing them here.
    { name: 'imagePassThrough', type: 'boolean', defaultValue: true },

    // Optional restriction of the file extensions this plugin handles.
    {
      name: 'includedExtnames',
      type: 'json',
      defaultValue: [],
      comment: 'e.g. [".pdf", ".docx"] — empty means all non-image files',
    },

    { name: 'options', type: 'json', defaultValue: {} },

    /**
     * When true and plugin-docpixie is active:
     *   - docpixie:processDocument is triggered (asynchronous indexing)
     *   - a metadata reference block is returned instead of the full text
     *   - the LLM is instructed to call the docpixie:query tool for retrieval
     */
    { name: 'useDocpixie', type: 'boolean', defaultValue: false },
  ],
});
@@ -0,0 +1,10 @@
1
+ export { default } from './plugin';
2
+ export { PluginDocumentParserServer } from './plugin';
3
+ export { InternalParserRegistry } from './services/internal-parser-registry';
4
+ export type {
5
+ InternalParserHandler,
6
+ InternalParseResult,
7
+ AttachmentLike,
8
+ } from './services/internal-parser-registry';
9
+ export type { OcrProviderConfig, OcrAuthType, OcrRequestFormat } from './services/external-ocr-client';
10
+ export type { ParsedAttachmentResult } from './services/parse-router';
@@ -0,0 +1,172 @@
1
+ import { Plugin } from '@nocobase/server';
2
+ import { resolve } from 'path';
3
+ import { Context } from '@nocobase/actions';
4
+ import axios from 'axios';
5
+ import { InternalParserRegistry } from './services/internal-parser-registry';
6
+ import { BuiltinAIDocumentHandler } from './services/builtin-ai-handler';
7
+ import { ParseRouter } from './services/parse-router';
8
+ import { testConnection, getSettings, saveSettings } from './resource/docParserProviders';
9
+ import type { AttachmentLike } from './services/internal-parser-registry';
10
+
11
/**
 * Server-side entry point of the document parser plugin.
 *
 * It owns the internal parser registry, builds the parse router, intercepts
 * `parseAttachment` on the LLM provider classes registered with plugin-ai and
 * exposes the provider / settings resources.
 */
export class PluginDocumentParserServer extends Plugin {
  /**
   * Public registry — other plugins register their format handlers here:
   *
   *   const docParser = this.pm.get(PluginDocumentParserServer);
   *   docParser.internalParserRegistry.register({ name, supports, parse });
   */
  readonly internalParserRegistry = new InternalParserRegistry();

  /** Router deciding how an attachment is parsed; assigned in `load()`. */
  parseRouter!: ParseRouter;

  /** Classes produced by `wrapProviderClass`, remembered so they are never wrapped twice. */
  private readonly wrappedProviderClasses = new WeakSet<object>();

  // ── Lifecycle ─────────────────────────────────────────────────────────────

  /**
   * Registers the built-in AI document handler. Done in `beforeLoad` so other
   * plugins' `load()` can add their own handlers afterwards.
   */
  async beforeLoad() {
    this.internalParserRegistry.register(
      new BuiltinAIDocumentHandler(() => {
        // Resolved lazily: plugin-ai may not be available yet at this point.
        const aiPlugin = this.pm.get('@nocobase/plugin-ai') as any;
        return aiPlugin?.documentLoaders;
      }),
    );
  }

  /** Imports collections, builds the router, patches plugin-ai and registers resources + ACL. */
  async load() {
    // 1. Load collections
    await this.importCollections(resolve(__dirname, 'collections'));

    // 2. Wire up the parse router
    this.parseRouter = new ParseRouter(
      () => this.db.getRepository('docParserSettings'),
      () => this.db.getRepository('docParserProviders'),
      this.internalParserRegistry,
      this.fetchFileBuffer.bind(this),
      () => {
        const p = this.pm.get('@nocobase/plugin-docpixie') as any;
        return p?.service ? p : null;
      },
    );

    // 3. Patch AIManager to intercept parseAttachment on ALL providers
    this.wrapAIManager();

    // 4. Register resources
    this.app.resourceManager.define({
      name: 'docParserProviders',
      actions: {
        testConnection,
      },
    });

    this.app.resourceManager.define({
      name: 'docParserSettings',
      actions: {
        get: getSettings,
        save: saveSettings,
      },
    });

    // 5. ACL — management is restricted to roles holding the plugin-settings
    // snippet. Granting these actions to every logged-in user would let any
    // account edit provider endpoints/credentials and the global settings.
    // NOTE(review): the client declares `pm.${NAMESPACE}.settings`; confirm that
    // NAMESPACE equals this plugin's name so both sides use the same snippet.
    this.app.acl.registerSnippet({
      name: `pm.${this.name}.settings`,
      actions: ['docParserProviders:*', 'docParserSettings:*'],
    });
  }

  // ── AIManager patching ────────────────────────────────────────────────────

  /**
   * Wrap `AIManager.registerLLMProvider` so that every provider class —
   * including those registered after this plugin loads — gets its
   * `parseAttachment` intercepted, and wrap the providers that are already
   * registered. Does nothing (apart from a warning) when plugin-ai or its
   * `aiManager` is not available.
   */
  private wrapAIManager() {
    const aiPlugin = this.pm.get('@nocobase/plugin-ai') as any;
    if (!aiPlugin?.aiManager) {
      this.log.warn('[DocumentParser] plugin-ai not found — parseAttachment interception skipped');
      return;
    }

    const aiManager = aiPlugin.aiManager;

    // Registration entries without a provider class are passed through untouched
    // so a malformed registration fails (or not) exactly as plugin-ai decides.
    const wrapMeta = (meta: any) =>
      meta && typeof meta.provider === 'function'
        ? { ...meta, provider: this.wrapProviderClass(meta.provider) }
        : meta;

    // Future registrations
    const originalRegister = aiManager.registerLLMProvider.bind(aiManager);
    aiManager.registerLLMProvider = (name: string, meta: any, ...rest: any[]) =>
      originalRegister(name, wrapMeta(meta), ...rest);

    // Already-registered providers
    for (const [name, meta] of aiManager.llmProviders.entries()) {
      aiManager.llmProviders.set(name, wrapMeta(meta));
    }

    this.log.info(`[DocumentParser] Wrapped ${aiManager.llmProviders.size} LLM providers`);
  }

  /**
   * Create a subclass of `OriginalClass` whose `parseAttachment` goes through
   * the parse router first. The original implementation is handed to the router
   * as the default parser via `super.parseAttachment`, so providers that
   * override the method keep their own behaviour as the fallback.
   *
   * A class that was itself produced by this method is returned unchanged.
   */
  private wrapProviderClass(OriginalClass: new (...args: any[]) => any) {
    if (this.wrappedProviderClasses.has(OriginalClass)) {
      return OriginalClass;
    }

    // Inside the subclass `this` is the provider instance, so the router is
    // reached through a closure. It is read at call time because `parseRouter`
    // is assigned during `load()`.
    const getRouter = () => this.parseRouter;

    const Wrapped = class extends OriginalClass {
      async parseAttachment(ctx: Context, attachment: any) {
        return getRouter().route(ctx, attachment, () => super.parseAttachment(ctx, attachment));
      }
    };

    this.wrappedProviderClasses.add(Wrapped);
    return Wrapped;
  }

  // ── File buffer helper ────────────────────────────────────────────────────

  /**
   * Fetch the raw bytes of an attachment.
   *
   * The URL is obtained from the file-manager plugin. HTTP(S) URLs are
   * downloaded as returned by the file manager (not percent-decoded, so encoded
   * or signed URLs stay valid); anything else is treated as a path relative to
   * the process working directory.
   *
   * @returns the file content and the URL that was used to obtain it
   * @throws Error when the file-manager plugin is unavailable or the local path
   *         would resolve outside the working directory
   */
  private async fetchFileBuffer(
    ctx: Context,
    attachment: AttachmentLike,
  ): Promise<{ buffer: Buffer; url: string }> {
    const fileManager = this.app.pm.get('file-manager') as any;
    if (typeof fileManager?.getFileURL !== 'function') {
      throw new Error('[DocumentParser] file-manager plugin is not available — cannot resolve attachment URL');
    }

    const rawUrl: string = await fileManager.getFileURL(attachment);

    // Decoding is only needed to turn a URL path into a filesystem path.
    // Malformed escape sequences must not abort the whole parse.
    let decodedUrl = rawUrl;
    try {
      decodedUrl = decodeURIComponent(rawUrl);
    } catch {
      // keep rawUrl
    }

    const isHttp = (value: string) => value.startsWith('http://') || value.startsWith('https://');
    // Prefer the untouched URL; the decoded form is only used when the file
    // manager returned a fully percent-encoded URL.
    const remoteUrl = isHttp(rawUrl) ? rawUrl : isHttp(decodedUrl) ? decodedUrl : null;

    if (remoteUrl) {
      const referer = ctx.get?.('referer') || '';
      const ua = ctx.get?.('user-agent') || '';
      const response = await axios.get(remoteUrl, {
        responseType: 'arraybuffer',
        timeout: 60_000,
        headers: { referer, 'User-Agent': ua },
      });
      return { buffer: Buffer.from(response.data), url: remoteUrl };
    }

    // Local file — strip APP_PUBLIC_PATH before joining
    const { resolve: resolvePath, sep } = require('path');
    const { readFile } = require('fs/promises');

    let localPath = decodedUrl;
    const appPublicPath = (process.env.APP_PUBLIC_PATH || '/').replace(/\/+$/, '');
    if (appPublicPath && localPath.startsWith(appPublicPath + '/')) {
      localPath = localPath.slice(appPublicPath.length);
    }

    // Refuse paths that resolve outside the working directory (e.g. via "..").
    const storageRoot = resolvePath(process.cwd());
    const absPath = resolvePath(storageRoot, localPath.replace(/^\//, ''));
    if (!absPath.startsWith(storageRoot + sep) && absPath !== storageRoot) {
      throw new Error(`[DocumentParser] Attachment path escapes storage root: ${localPath}`);
    }

    const buffer = await readFile(absPath);
    return { buffer, url: decodedUrl };
  }
}
171
+
172
+ export default PluginDocumentParserServer;
@@ -0,0 +1,72 @@
1
+ import { Context, Next } from '@nocobase/actions';
2
+ import { testOcrProviderConnection } from '../services/external-ocr-client';
3
+
4
+ /**
5
+ * Extra actions for the docParserProviders resource.
6
+ * Standard CRUD (list/create/update/destroy/get) is handled by NocoBase's
7
+ * default resource manager — we only need to add the custom `testConnection`.
8
+ */
9
+
10
/**
 * Custom `testConnection` action of the docParserProviders resource.
 *
 * Looks up the provider identified by `filterByTk`, runs the OCR connection
 * test with its endpoint and auth settings, and puts the result in `ctx.body`.
 * Responds with 404 when no such provider exists.
 */
export async function testConnection(ctx: Context, next: Next) {
  const providerId = ctx.action.params.filterByTk;
  const provider = await ctx.db.getRepository('docParserProviders').findById(providerId);

  if (!provider) {
    ctx.throw(404, 'Provider not found');
    return;
  }

  // The test never waits longer than 15s, whatever the provider's own timeout is.
  const configuredTimeout = provider.get('timeout') ?? 10000;

  ctx.body = await testOcrProviderConnection({
    apiEndpoint: provider.get('apiEndpoint'),
    authType: provider.get('authType'),
    apiKey: provider.get('apiKey'),
    authConfig: provider.get('authConfig') ?? {},
    timeout: Math.min(configuredTimeout, 15000),
  });

  await next();
}
32
+
33
/**
 * Return the global settings record in `ctx.body`.
 * When no record exists yet, one is created with the default values first.
 */
export async function getSettings(ctx: Context, next: Next) {
  const settingsRepo = ctx.db.getRepository('docParserSettings');
  const existing = await settingsRepo.findOne({});

  ctx.body = existing
    ? existing
    : await settingsRepo.create({
        values: {
          mode: 'default',
          fallbackToDefault: true,
          imagePassThrough: true,
          includedExtnames: [],
        },
      });

  await next();
}
53
+
54
/**
 * Create or update the single global settings record from the request body
 * and return the stored record in `ctx.body`.
 *
 * Only the settings fields declared by the docParserSettings collection are
 * written; any other key in the body (`id`, timestamps, unknown keys) is
 * ignored. A missing or non-object body results in an empty update.
 * After saving, the parse router's settings cache is invalidated when the
 * plugin and router are available.
 */
export async function saveSettings(ctx: Context, next: Next) {
  // Writable columns of the docParserSettings collection.
  const writableFields = [
    'mode',
    'activeProviderId',
    'fallbackToDefault',
    'imagePassThrough',
    'includedExtnames',
    'options',
    'useDocpixie',
  ];

  const repo = ctx.db.getRepository('docParserSettings');

  const rawBody: unknown = ctx.request.body;
  const body: Record<string, unknown> =
    rawBody !== null && typeof rawBody === 'object' ? (rawBody as Record<string, unknown>) : {};

  const values: Record<string, unknown> = {};
  for (const field of writableFields) {
    if (Object.prototype.hasOwnProperty.call(body, field)) {
      values[field] = body[field];
    }
  }

  let record = await repo.findOne({});
  if (!record) {
    record = await repo.create({ values });
  } else {
    const id = record.get('id');
    await repo.update({ filter: { id }, values });
    // Re-read the row that was just updated rather than "whatever comes first".
    record = await repo.findOne({ filter: { id } });
  }

  // Invalidate the router's settings cache
  const plugin = ctx.app.pm.get('@nocobase/plugin-document-parser') as any;
  plugin?.parseRouter?.invalidateSettingsCache?.();

  ctx.body = record;
  await next();
}
@@ -0,0 +1,49 @@
1
+ import { Context } from '@nocobase/actions';
2
+ import type { InternalParserHandler, InternalParseResult, AttachmentLike } from './internal-parser-registry';
3
+
4
// Extnames that plugin-ai's CachedDocumentLoader natively handles
const AI_SUPPORTED_EXTNAMES = new Set(['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt']);

/**
 * Internal parser handler that delegates to the `cached` document loader
 * obtained from plugin-ai.
 *
 * The loaders are resolved lazily on every `parse` call. When they are not
 * available (for example because plugin-ai is missing), the handler reports
 * the attachment as not handled instead of throwing.
 */
export class BuiltinAIDocumentHandler implements InternalParserHandler {
  readonly name = 'builtin-ai-document-loader';

  constructor(
    /** Lazy getter — resolved at call time; may return nothing when plugin-ai is unavailable. */
    private readonly getDocumentLoaders: () =>
      | { cached: { load(file: any): Promise<any> } }
      | null
      | undefined,
  ) {}

  /** True when the attachment's extension is one of the natively supported ones. */
  supports(attachment: AttachmentLike): boolean {
    const ext = this.resolveExtname(attachment);
    return AI_SUPPORTED_EXTNAMES.has(ext);
  }

  /**
   * Load the attachment through the cached document loader.
   *
   * @returns `{ handled: true, text }` when the loader supports the file,
   *          `{ handled: false, text: '' }` when the loaders are unavailable,
   *          the loader returns nothing, or it reports the file as unsupported.
   */
  async parse(attachment: AttachmentLike, _ctx: Context): Promise<InternalParseResult> {
    const loaders = this.getDocumentLoaders();
    if (typeof loaders?.cached?.load !== 'function') {
      return { text: '', handled: false };
    }

    const result = await loaders.cached.load(attachment);
    if (!result?.supported) {
      return { text: '', handled: false };
    }

    return {
      text: result.text ?? '',
      handled: true,
    };
  }

  /**
   * Lower-cased extension including the leading dot, taken from `extname` or,
   * failing that, from the part after the last dot of `filename` / `name`.
   * Returns '' when no extension can be determined.
   */
  private resolveExtname(attachment: AttachmentLike): string {
    const explicit = typeof attachment.extname === 'string' ? attachment.extname.trim().toLowerCase() : '';
    if (explicit) {
      // Accept both ".pdf" and "pdf".
      return explicit.startsWith('.') ? explicit : `.${explicit}`;
    }
    const name = attachment.filename ?? attachment.name ?? '';
    const idx = name.lastIndexOf('.');
    return idx >= 0 ? name.slice(idx).toLowerCase() : '';
  }
}