plugin-document-parser 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/client.d.ts +2 -0
  2. package/client.js +1 -0
  3. package/dist/client/01b8a5798a872638.js +10 -0
  4. package/dist/client/022be20abc96fdb4.js +10 -0
  5. package/dist/client/12e97e7a84d900e0.js +10 -0
  6. package/dist/client/index.js +10 -0
  7. package/dist/externalVersion.js +20 -0
  8. package/dist/index.js +48 -0
  9. package/dist/locale/en-US.json +54 -0
  10. package/dist/locale/vi-VN.json +54 -0
  11. package/dist/node_modules/form-data/License +19 -0
  12. package/dist/node_modules/form-data/index.d.ts +62 -0
  13. package/dist/node_modules/form-data/lib/browser.js +4 -0
  14. package/dist/node_modules/form-data/lib/form_data.js +14 -0
  15. package/dist/node_modules/form-data/lib/populate.js +10 -0
  16. package/dist/node_modules/form-data/package.json +1 -0
  17. package/dist/server/collections/doc-parser-providers.js +137 -0
  18. package/dist/server/collections/doc-parser-settings.js +85 -0
  19. package/dist/server/index.js +51 -0
  20. package/dist/server/plugin.js +181 -0
  21. package/dist/server/resource/docParserProviders.js +91 -0
  22. package/dist/server/services/builtin-ai-handler.js +63 -0
  23. package/dist/server/services/external-ocr-client.js +189 -0
  24. package/dist/server/services/internal-parser-registry.js +82 -0
  25. package/dist/server/services/parse-router.js +273 -0
  26. package/package.json +33 -0
  27. package/server.d.ts +2 -0
  28. package/server.js +1 -0
  29. package/src/client/components/GlobalSettings.tsx +151 -0
  30. package/src/client/components/ProviderForm.tsx +266 -0
  31. package/src/client/components/ProviderList.tsx +193 -0
  32. package/src/client/components/SettingsPage.tsx +43 -0
  33. package/src/client/index.tsx +2 -0
  34. package/src/client/locale.ts +12 -0
  35. package/src/client/plugin.tsx +34 -0
  36. package/src/index.ts +2 -0
  37. package/src/locale/en-US.json +54 -0
  38. package/src/locale/vi-VN.json +54 -0
  39. package/src/server/collections/doc-parser-providers.ts +107 -0
  40. package/src/server/collections/doc-parser-settings.ts +59 -0
  41. package/src/server/index.ts +10 -0
  42. package/src/server/plugin.ts +172 -0
  43. package/src/server/resource/docParserProviders.ts +72 -0
  44. package/src/server/services/builtin-ai-handler.ts +49 -0
  45. package/src/server/services/external-ocr-client.ts +233 -0
  46. package/src/server/services/internal-parser-registry.ts +126 -0
  47. package/src/server/services/parse-router.ts +357 -0
@@ -0,0 +1,233 @@
1
+ import axios, { AxiosRequestConfig } from 'axios';
2
+ import FormData from 'form-data';
3
+ import type { AttachmentLike } from './internal-parser-registry';
4
+
5
+ // ─── Provider config types ─────────────────────────────────────────────────────
6
+
7
/** Authentication strategies understood by the OCR client. */
export type OcrAuthType =
  | 'bearer'
  | 'api-key-header'
  | 'basic'
  | 'custom-headers'
  | 'none';

/** Ways the document can be delivered to the provider. */
export type OcrRequestFormat =
  | 'multipart'
  | 'json-base64'
  | 'url';

/** Settings describing how to reach one external OCR provider and how to authenticate against it. */
export type OcrProviderConfig = {
  /** URL the request is sent to. */
  apiEndpoint: string;
  /** Selects which auth headers are generated. */
  authType: OcrAuthType;
  /** Secret used by the `bearer` and `api-key-header` auth types. */
  apiKey?: string;
  /** Auth-type specific extras. */
  authConfig?: {
    /** Header carrying the key (for `api-key-header`). */
    headerName?: string;
    /** User name (for `basic`). */
    username?: string;
    /** Password (for `basic`). */
    password?: string;
    /** Headers copied verbatim (for `custom-headers`). */
    customHeaders?: Record<string, string>;
  };
  /** Selects how the request body is built. */
  requestFormat: OcrRequestFormat;
  /** Request-format specific extras. */
  requestConfig?: {
    // ── multipart ──
    fileFieldName?: string;
    filenameFieldName?: string;
    mimetypeFieldName?: string;
    extraFields?: Record<string, string>;
    // ── json-base64 ──
    base64FieldPath?: string;
    filenameFieldPath?: string;
    mimetypeFieldPath?: string;
    extraBody?: Record<string, any>;
    // ── url ──
    urlFieldPath?: string;
  };
  /** Dot-path of the extracted text inside the provider response. */
  responseTextPath?: string;
  /** Request timeout handed to the HTTP client. */
  timeout?: number;
};
38
+
39
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
40
+
41
/**
 * Set `value` at `dotPath` inside `obj`, creating intermediate objects on the way.
 *
 * An intermediate segment whose current value is null/undefined or not an
 * object is replaced by a fresh `{}` before descending into it.
 *
 * @param obj     Object to mutate in place.
 * @param dotPath Dot-separated key path, e.g. `"document.content"`.
 * @param value   Value stored at the final segment.
 * @throws Error when any segment is `__proto__`, `constructor` or `prototype`.
 *         Walking through those keys would reach a shared prototype object and
 *         the write would leak onto every object in the process.
 */
function setByPath(obj: Record<string, any>, dotPath: string, value: any): void {
  const parts = dotPath.split('.');

  // Validate the whole path up front so a rejected path never leaves `obj` half-written.
  for (const part of parts) {
    if (part === '__proto__' || part === 'constructor' || part === 'prototype') {
      throw new Error(`[DocumentParser] Unsafe path segment "${part}" in path "${dotPath}"`);
    }
  }

  let current = obj;
  for (const key of parts.slice(0, -1)) {
    const next = current[key];
    if (next == null || typeof next !== 'object') {
      current[key] = {};
    }
    current = current[key];
  }
  current[parts[parts.length - 1]] = value;
}
53
+
54
/**
 * Read the value found at `dotPath` inside `obj`.
 *
 * Segments are applied one after another as property keys, so a numeric
 * segment indexes into an array (`"pages.0.text"`). Returns `undefined` when
 * `obj` or `dotPath` is falsy, or when a null/undefined value is met before
 * the path is exhausted.
 */
function getByPath(obj: any, dotPath: string): any {
  if (!obj || !dotPath) {
    return undefined;
  }
  return dotPath
    .split('.')
    .reduce((node: any, key: string) => (node == null ? undefined : node[key]), obj);
}
65
+
66
/**
 * Produce the HTTP headers that authenticate a request for the given provider.
 *
 * - `bearer`         → `Authorization: Bearer <apiKey>` (only when an apiKey is set)
 * - `api-key-header` → `<headerName | X-Api-Key>: <apiKey>` (only when an apiKey is set)
 * - `basic`          → `Authorization: Basic base64(username:password)`, missing parts default to ''
 * - `custom-headers` → a copy of `authConfig.customHeaders`
 * - anything else    → no headers
 */
function buildAuthHeaders(config: OcrProviderConfig): Record<string, string> {
  const result: Record<string, string> = {};
  const kind = config.authType;
  const secret = config.apiKey;
  // Only an absent authConfig falls back to {} (same as a destructuring default).
  const extras = config.authConfig !== undefined ? config.authConfig : {};

  if (kind === 'bearer') {
    if (secret) {
      result['Authorization'] = `Bearer ${secret}`;
    }
  } else if (kind === 'api-key-header') {
    const name = extras.headerName || 'X-Api-Key';
    if (secret) {
      result[name] = secret;
    }
  } else if (kind === 'basic') {
    const user = extras.username !== undefined ? extras.username : '';
    const pass = extras.password !== undefined ? extras.password : '';
    const token = Buffer.from(`${user}:${pass}`).toString('base64');
    result['Authorization'] = `Basic ${token}`;
  } else if (kind === 'custom-headers') {
    Object.assign(result, extras.customHeaders ?? {});
  }

  return result;
}
96
+
97
+ // ─── Main client ──────────────────────────────────────────────────────────────
98
+
99
/** Per-call description of the document handed to `callExternalOcr`. */
export type ExternalOcrCallOptions = {
  /** The file's raw bytes. */
  fileBuffer: Buffer;

  /** Name the file was originally given. */
  filename: string;

  /** MIME type of the file. */
  mimetype: string;

  /** URL the file can be downloaded from; needed when requestFormat = 'url'. */
  fileUrl?: string;
};
109
+
110
/**
 * Call an external OCR/document-parse API and return the extracted text.
 *
 * Request formats:
 * - `multipart`   → multipart/form-data upload of the file bytes
 * - `json-base64` → JSON body carrying the file as a base64 string
 * - `url`         → JSON body carrying a download URL (the provider fetches the file)
 *
 * Auth headers come from `buildAuthHeaders` (bearer, api-key-header, basic,
 * custom-headers, none).
 *
 * Defaults when the config omits them: requestFormat `multipart`,
 * responseTextPath `text`, timeout `60000`.
 *
 * @param providerConfig Provider endpoint, auth and payload settings.
 * @param options        The file to send (bytes, name, MIME type, optional URL).
 * @returns The string found at `responseTextPath` in the provider response.
 * @throws Error when requestFormat is `url` without `options.fileUrl`, when
 *         requestFormat is unknown, or when the value at `responseTextPath`
 *         is not a string. Rejections from `axios.post` propagate unchanged.
 */
export async function callExternalOcr(
  providerConfig: OcrProviderConfig,
  options: ExternalOcrCallOptions,
): Promise<string> {
  const {
    apiEndpoint,
    requestFormat = 'multipart',
    requestConfig = {},
    responseTextPath = 'text',
    timeout = 60000,
  } = providerConfig;

  const authHeaders = buildAuthHeaders(providerConfig);
  const axiosConfig: AxiosRequestConfig = {
    timeout,
    headers: { ...authHeaders },
  };
  let body: any;

  if (requestFormat === 'multipart') {
    const form = new FormData();
    const fileFieldName = requestConfig.fileFieldName || 'file';

    form.append(fileFieldName, options.fileBuffer, {
      filename: options.filename,
      contentType: options.mimetype,
    });

    // Some providers want name / MIME type repeated as ordinary form fields.
    if (requestConfig.filenameFieldName) {
      form.append(requestConfig.filenameFieldName, options.filename);
    }
    if (requestConfig.mimetypeFieldName) {
      form.append(requestConfig.mimetypeFieldName, options.mimetype);
    }
    for (const [k, v] of Object.entries(requestConfig.extraFields ?? {})) {
      form.append(k, v);
    }

    body = form;
    // form.getHeaders() supplies the multipart Content-Type including the boundary.
    axiosConfig.headers = {
      ...axiosConfig.headers,
      ...form.getHeaders(),
    };
  } else if (requestFormat === 'json-base64') {
    const base64 = options.fileBuffer.toString('base64');
    const jsonBody: Record<string, any> = { ...(requestConfig.extraBody ?? {}) };

    setByPath(jsonBody, requestConfig.base64FieldPath || 'file', base64);
    if (requestConfig.filenameFieldPath) {
      setByPath(jsonBody, requestConfig.filenameFieldPath, options.filename);
    }
    if (requestConfig.mimetypeFieldPath) {
      setByPath(jsonBody, requestConfig.mimetypeFieldPath, options.mimetype);
    }

    body = jsonBody;
    axiosConfig.headers = {
      ...axiosConfig.headers,
      'Content-Type': 'application/json',
    };
  } else if (requestFormat === 'url') {
    if (!options.fileUrl) {
      throw new Error('[DocumentParser] requestFormat=url but no fileUrl was provided');
    }
    const jsonBody: Record<string, any> = { ...(requestConfig.extraBody ?? {}) };
    setByPath(jsonBody, requestConfig.urlFieldPath || 'url', options.fileUrl);

    body = jsonBody;
    axiosConfig.headers = {
      ...axiosConfig.headers,
      'Content-Type': 'application/json',
    };
  } else {
    throw new Error(`[DocumentParser] Unknown requestFormat: ${requestFormat}`);
  }

  const response = await axios.post(apiEndpoint, body, axiosConfig);
  const responseData = response.data;

  const text = getByPath(responseData, responseTextPath);
  if (typeof text !== 'string') {
    throw new Error(
      `[DocumentParser] Could not extract text from response at path "${responseTextPath}". ` +
        `Response: ${previewResponseForError(responseData)}`,
    );
  }

  return text;
}

/**
 * Render a response payload as a short string for error messages. Never throws:
 * JSON.stringify yields `undefined` for an `undefined` payload and throws on
 * values it cannot serialise, either of which would otherwise replace the
 * intended diagnostic with an unrelated TypeError.
 */
function previewResponseForError(data: unknown, maxLength = 300): string {
  let rendered: string | undefined;
  try {
    rendered = JSON.stringify(data);
  } catch {
    rendered = undefined;
  }
  return (rendered ?? String(data)).slice(0, maxLength);
}
209
+
210
/**
 * Connectivity probe behind the "Test Connection" button.
 *
 * Issues a single GET against `apiEndpoint` with the provider's auth headers
 * (timeout defaults to 10000 when not configured). Every HTTP status is
 * accepted as a response; `ok` is true for any status below 500, so 4xx
 * answers are reported as reachable together with their status code.
 * Any thrown error is reported as `{ ok: false, message }` instead of
 * propagating.
 */
export async function testOcrProviderConnection(
  providerConfig: Pick<OcrProviderConfig, 'apiEndpoint' | 'authType' | 'apiKey' | 'authConfig' | 'timeout'>,
): Promise<{ ok: boolean; status?: number; message?: string }> {
  try {
    const headers = buildAuthHeaders(providerConfig as OcrProviderConfig);
    const requestTimeout = providerConfig.timeout ?? 10000;
    // Accept every status so 4xx reaches the caller instead of being thrown by axios.
    const acceptAnyStatus = () => true;

    const { status } = await axios.get(providerConfig.apiEndpoint, {
      headers,
      timeout: requestTimeout,
      validateStatus: acceptAnyStatus,
    });

    if (status < 500) {
      return { ok: true, status, message: undefined };
    }
    return { ok: false, status, message: `Server error: ${status}` };
  } catch (err: any) {
    return { ok: false, message: err?.message ?? String(err) };
  }
}
@@ -0,0 +1,126 @@
1
+ import { Context } from '@nocobase/actions';
2
+
3
+ // ─── Public contract ──────────────────────────────────────────────────────────
4
+
5
/** Outcome returned by an internal parser handler. */
export type InternalParseResult = {
  /** Plain text extracted from the attachment. */
  text: string;

  /** Whether the handler really processed the file, as opposed to skipping it as unsupported. */
  handled: boolean;
};
11
+
12
/**
 * Shape required of every internal parser handler.
 *
 * Other plugins add handlers with:
 *   plugin.internalParserRegistry.register(myHandler)
 *
 * The registry walks handlers in registration order and stops at the first
 * one whose result has `handled: true`. A specialised handler (for example a
 * dedicated Excel parser plugin) can therefore win over a generic fallback.
 */
export interface InternalParserHandler {
  /** Unique identifier, used in logs and for debugging. */
  name: string;

  /**
   * Tells whether this handler is able to process the attachment.
   * Consulted before `parse()` so unsupported files cost nothing.
   */
  supports(attachment: AttachmentLike): boolean;

  /**
   * Extract text from the attachment.
   * Throw when parsing fails; depending on settings the error is propagated
   * or the router falls back to the default.
   */
  parse(attachment: AttachmentLike, ctx: Context): Promise<InternalParseResult>;
}
40
+
41
/**
 * Smallest attachment shape that travels through the parser pipeline.
 * Every listed field is optional, and any additional keys are permitted.
 */
export type AttachmentLike = {
  id?: string | number;
  filename?: string;
  name?: string;
  mimetype?: string;
  extname?: string;
  url?: string;
  storageId?: number;
  size?: number;
  meta?: Record<string, any>;
  // Open-ended: callers may attach further properties.
  [key: string]: any;
};
54
+
55
+ // ─── Registry ─────────────────────────────────────────────────────────────────
56
+
57
/**
 * Central registry for internal document parser handlers.
 *
 * Handlers are consulted in list order (registration order, unless a handler
 * was registered with `prepend`). A handler is asked to parse only when its
 * `supports()` returns true, and the first parse result with `handled: true`
 * is returned; handlers that report `handled: false` are passed over.
 *
 * Usage (from another plugin):
 *
 *   const docParser = this.pm.get(PluginDocumentParserServer);
 *   docParser.internalParserRegistry.register({
 *     name: 'my-excel-parser',
 *     supports: (att) => att.mimetype === 'application/vnd.openxmlformats...',
 *     async parse(att, ctx) {
 *       const text = await myExcelToText(att);
 *       return { text, handled: true };
 *     },
 *   });
 */
export class InternalParserRegistry {
  private handlers: InternalParserHandler[] = [];

  /**
   * Register a new handler.
   * Pass `{ prepend: true }` to insert it at the front so it is tried before
   * previously registered handlers (useful for specialised format plugins).
   *
   * @throws Error when a handler with the same `name` is already registered.
   */
  register(handler: InternalParserHandler, options?: { prepend?: boolean }): void {
    if (this.handlers.some((h) => h.name === handler.name)) {
      throw new Error(`[DocumentParser] Handler "${handler.name}" is already registered.`);
    }
    if (options?.prepend) {
      this.handlers.unshift(handler);
    } else {
      this.handlers.push(handler);
    }
  }

  /** Remove a previously registered handler by name; unknown names are ignored. */
  unregister(name: string): void {
    this.handlers = this.handlers.filter((h) => h.name !== name);
  }

  /** Return a copy of the current handler list (for introspection / tests). */
  list(): ReadonlyArray<InternalParserHandler> {
    return [...this.handlers];
  }

  /**
   * Try handlers in order and return the first result with `handled: true`
   * coming from a handler whose `supports()` accepted the attachment.
   *
   * Returns `{ text: '', handled: false }` when no handler handles the file.
   * An error thrown by a handler's `supports()` or `parse()` propagates to
   * the caller.
   */
  async parse(attachment: AttachmentLike, ctx: Context): Promise<InternalParseResult> {
    for (const handler of this.handlers) {
      if (!handler.supports(attachment)) {
        continue;
      }
      const result = await handler.parse(attachment, ctx);
      // Handlers come from other plugins; one that resolves to nothing is
      // treated as "not handled" so the remaining handlers still get a turn.
      if (result?.handled) {
        return result;
      }
    }
    return { text: '', handled: false };
  }

  /** Number of registered handlers. */
  get size(): number {
    return this.handlers.length;
  }
}