plugin-document-parser 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/client.d.ts +2 -0
  2. package/client.js +1 -0
  3. package/dist/client/01b8a5798a872638.js +10 -0
  4. package/dist/client/022be20abc96fdb4.js +10 -0
  5. package/dist/client/12e97e7a84d900e0.js +10 -0
  6. package/dist/client/index.js +10 -0
  7. package/dist/externalVersion.js +20 -0
  8. package/dist/index.js +48 -0
  9. package/dist/locale/en-US.json +54 -0
  10. package/dist/locale/vi-VN.json +54 -0
  11. package/dist/node_modules/form-data/License +19 -0
  12. package/dist/node_modules/form-data/index.d.ts +62 -0
  13. package/dist/node_modules/form-data/lib/browser.js +4 -0
  14. package/dist/node_modules/form-data/lib/form_data.js +14 -0
  15. package/dist/node_modules/form-data/lib/populate.js +10 -0
  16. package/dist/node_modules/form-data/package.json +1 -0
  17. package/dist/server/collections/doc-parser-providers.js +137 -0
  18. package/dist/server/collections/doc-parser-settings.js +85 -0
  19. package/dist/server/index.js +51 -0
  20. package/dist/server/plugin.js +181 -0
  21. package/dist/server/resource/docParserProviders.js +91 -0
  22. package/dist/server/services/builtin-ai-handler.js +63 -0
  23. package/dist/server/services/external-ocr-client.js +189 -0
  24. package/dist/server/services/internal-parser-registry.js +82 -0
  25. package/dist/server/services/parse-router.js +273 -0
  26. package/package.json +33 -0
  27. package/server.d.ts +2 -0
  28. package/server.js +1 -0
  29. package/src/client/components/GlobalSettings.tsx +151 -0
  30. package/src/client/components/ProviderForm.tsx +266 -0
  31. package/src/client/components/ProviderList.tsx +193 -0
  32. package/src/client/components/SettingsPage.tsx +43 -0
  33. package/src/client/index.tsx +2 -0
  34. package/src/client/locale.ts +12 -0
  35. package/src/client/plugin.tsx +34 -0
  36. package/src/index.ts +2 -0
  37. package/src/locale/en-US.json +54 -0
  38. package/src/locale/vi-VN.json +54 -0
  39. package/src/server/collections/doc-parser-providers.ts +107 -0
  40. package/src/server/collections/doc-parser-settings.ts +59 -0
  41. package/src/server/index.ts +10 -0
  42. package/src/server/plugin.ts +172 -0
  43. package/src/server/resource/docParserProviders.ts +72 -0
  44. package/src/server/services/builtin-ai-handler.ts +49 -0
  45. package/src/server/services/external-ocr-client.ts +233 -0
  46. package/src/server/services/internal-parser-registry.ts +126 -0
  47. package/src/server/services/parse-router.ts +357 -0
@@ -0,0 +1,189 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __create = Object.create;
11
+ var __defProp = Object.defineProperty;
12
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
13
+ var __getOwnPropNames = Object.getOwnPropertyNames;
14
+ var __getProtoOf = Object.getPrototypeOf;
15
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
16
+ var __export = (target, all) => {
17
+ for (var name in all)
18
+ __defProp(target, name, { get: all[name], enumerable: true });
19
+ };
20
+ var __copyProps = (to, from, except, desc) => {
21
+ if (from && typeof from === "object" || typeof from === "function") {
22
+ for (let key of __getOwnPropNames(from))
23
+ if (!__hasOwnProp.call(to, key) && key !== except)
24
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
25
+ }
26
+ return to;
27
+ };
28
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
29
+ // If the importer is in node compatibility mode or this is not an ESM
30
+ // file that has been converted to a CommonJS file using a Babel-
31
+ // compatible transform (i.e. "__esModule" has not been set), then set
32
+ // "default" to the CommonJS "module.exports" for node compatibility.
33
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
34
+ mod
35
+ ));
36
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
37
+ var external_ocr_client_exports = {};
38
+ __export(external_ocr_client_exports, {
39
+ callExternalOcr: () => callExternalOcr,
40
+ testOcrProviderConnection: () => testOcrProviderConnection
41
+ });
42
+ module.exports = __toCommonJS(external_ocr_client_exports);
43
+ var import_axios = __toESM(require("axios"));
44
+ var import_form_data = __toESM(require("form-data"));
45
/**
 * Assign `value` at a dot-separated path inside `obj`, creating intermediate
 * plain objects where a segment is missing or holds a non-object value.
 *
 * The path comes from provider configuration, so it is validated before any
 * mutation happens: a `__proto__` segment would make the walk step into
 * `Object.prototype` and leak the assignment onto every object in the process.
 *
 * @param {object} obj - Object to mutate in place.
 * @param {string} dotPath - Path such as `"image.data"`.
 * @param {*} value - Value stored at the final segment.
 * @throws {Error} When the path contains a `__proto__` segment.
 */
function setByPath(obj, dotPath, value) {
  const segments = dotPath.split(".");
  if (segments.includes("__proto__")) {
    throw new Error(`[DocumentParser] Refusing to write to unsafe path "${dotPath}"`);
  }
  const leafKey = segments.pop();
  let current = obj;
  for (const key of segments) {
    // Overwrite primitives/null so that deeper segments always have a container.
    if (current[key] == null || typeof current[key] !== "object") {
      current[key] = {};
    }
    current = current[key];
  }
  current[leafKey] = value;
}
56
/**
 * Read the value found at a dot-separated path inside `obj`.
 *
 * @param {*} obj - Root value; a falsy root yields `undefined`.
 * @param {string} dotPath - Path such as `"result.text"`; an empty path yields `undefined`.
 * @returns {*} The value at the path, or `undefined` as soon as a segment has
 *   to be read from `null`/`undefined`.
 */
function getByPath(obj, dotPath) {
  if (!obj || !dotPath) {
    return undefined;
  }
  return dotPath
    .split(".")
    .reduce((node, key) => (node == null ? undefined : node[key]), obj);
}
66
/**
 * Build the HTTP headers that authenticate a request against a provider.
 *
 * Supported `authType` values:
 *  - `"bearer"`: `Authorization: Bearer <apiKey>` (omitted when no key is set)
 *  - `"api-key-header"`: `<authConfig.headerName | "X-Api-Key">: <apiKey>`
 *  - `"basic"`: `Authorization: Basic base64(username:password)`
 *  - `"custom-headers"`: every entry of `authConfig.customHeaders`
 *  - `"none"` or anything else: no headers
 *
 * `authConfig` and its fields may be `null` (not only `undefined`); null
 * values are treated the same as missing ones.
 *
 * @param {{authType?: string, apiKey?: string, authConfig?: object|null}} config
 * @returns {Record<string, string>} Freshly created header map.
 */
function buildAuthHeaders(config) {
  const { authType, apiKey } = config;
  // A destructuring default would only cover `undefined`, not `null`.
  const authConfig = config.authConfig ?? {};
  const headers = {};
  switch (authType) {
    case "bearer":
      if (apiKey) headers["Authorization"] = `Bearer ${apiKey}`;
      break;
    case "api-key-header": {
      const headerName = authConfig.headerName || "X-Api-Key";
      if (apiKey) headers[headerName] = apiKey;
      break;
    }
    case "basic": {
      const username = authConfig.username ?? "";
      const password = authConfig.password ?? "";
      const encoded = Buffer.from(`${username}:${password}`).toString("base64");
      headers["Authorization"] = `Basic ${encoded}`;
      break;
    }
    case "custom-headers":
      Object.assign(headers, authConfig.customHeaders ?? {});
      break;
    case "none":
    default:
      break;
  }
  return headers;
}
93
/**
 * Convert a configured multipart extra-field value into something the
 * form-data stream can carry: strings and Buffers pass through, objects and
 * arrays are JSON-encoded, other primitives are stringified.
 */
function toMultipartFieldValue(value) {
  if (typeof value === "string" || Buffer.isBuffer(value)) {
    return value;
  }
  if (typeof value === "object") {
    return JSON.stringify(value);
  }
  return String(value);
}

/**
 * Send a file to an external OCR/parse endpoint and return the extracted text.
 *
 * The request body depends on `providerConfig.requestFormat`:
 *  - `"multipart"` (default): the file is uploaded as a form field, plus
 *    optional filename/mimetype fields and `requestConfig.extraFields`
 *  - `"json-base64"`: a JSON body with the base64 file written at
 *    `requestConfig.base64FieldPath` (default `"file"`)
 *  - `"url"`: a JSON body with `options.fileUrl` written at
 *    `requestConfig.urlFieldPath` (default `"url"`)
 *
 * @param {object} providerConfig - Endpoint, auth, request and response settings.
 * @param {{fileBuffer: Buffer, filename: string, mimetype: string, fileUrl?: string}} options
 * @returns {Promise<string>} Text found in the response at `responseTextPath`.
 * @throws {Error} For an unknown request format, a missing `fileUrl` in
 *   `"url"` mode, or when the response holds no string at `responseTextPath`.
 *   Errors raised by the HTTP client propagate unchanged.
 */
async function callExternalOcr(providerConfig, options) {
  const {
    apiEndpoint,
    requestFormat = "multipart",
    responseTextPath = "text",
    timeout = 6e4
  } = providerConfig;
  // `?? {}` rather than a destructuring default so that a null config is tolerated too.
  const requestConfig = providerConfig.requestConfig ?? {};
  const axiosConfig = {
    timeout,
    headers: { ...buildAuthHeaders(providerConfig) }
  };
  let body;
  if (requestFormat === "multipart") {
    const form = new import_form_data.default();
    form.append(requestConfig.fileFieldName || "file", options.fileBuffer, {
      filename: options.filename,
      contentType: options.mimetype
    });
    if (requestConfig.filenameFieldName) {
      form.append(requestConfig.filenameFieldName, options.filename);
    }
    if (requestConfig.mimetypeFieldName) {
      form.append(requestConfig.mimetypeFieldName, options.mimetype);
    }
    for (const [field, value] of Object.entries(requestConfig.extraFields ?? {})) {
      // A null/undefined entry means "no value": skip it instead of sending "null".
      if (value == null) continue;
      // Booleans, objects and arrays from JSON config cannot be appended as-is.
      form.append(field, toMultipartFieldValue(value));
    }
    body = form;
    // The form headers carry the multipart boundary and must win over auth headers.
    axiosConfig.headers = {
      ...axiosConfig.headers,
      ...form.getHeaders()
    };
  } else if (requestFormat === "json-base64") {
    const jsonBody = { ...(requestConfig.extraBody ?? {}) };
    setByPath(jsonBody, requestConfig.base64FieldPath || "file", options.fileBuffer.toString("base64"));
    if (requestConfig.filenameFieldPath) {
      setByPath(jsonBody, requestConfig.filenameFieldPath, options.filename);
    }
    if (requestConfig.mimetypeFieldPath) {
      setByPath(jsonBody, requestConfig.mimetypeFieldPath, options.mimetype);
    }
    body = jsonBody;
    axiosConfig.headers = {
      ...axiosConfig.headers,
      "Content-Type": "application/json"
    };
  } else if (requestFormat === "url") {
    if (!options.fileUrl) {
      throw new Error("[DocumentParser] requestFormat=url but no fileUrl was provided");
    }
    const jsonBody = { ...(requestConfig.extraBody ?? {}) };
    setByPath(jsonBody, requestConfig.urlFieldPath || "url", options.fileUrl);
    body = jsonBody;
    axiosConfig.headers = {
      ...axiosConfig.headers,
      "Content-Type": "application/json"
    };
  } else {
    throw new Error(`[DocumentParser] Unknown requestFormat: ${requestFormat}`);
  }
  const response = await import_axios.default.post(apiEndpoint, body, axiosConfig);
  const responseData = response.data;
  const text = getByPath(responseData, responseTextPath);
  if (typeof text !== "string") {
    // JSON.stringify(undefined) is undefined; fall back to String() so that
    // building the preview cannot throw and hide this error.
    const preview = (JSON.stringify(responseData) ?? String(responseData)).slice(0, 300);
    throw new Error(
      `[DocumentParser] Could not extract text from response at path "${responseTextPath}". Response: ${preview}`
    );
  }
  return text;
}
167
/**
 * Probe a provider endpoint with an authenticated GET request.
 *
 * Any HTTP status below 500 counts as reachable (so 4xx answers are reported
 * with `ok: true` together with their status); transport failures and thrown
 * errors are reported as `{ ok: false, message }` instead of being rethrown.
 *
 * @param {object} providerConfig - Uses `apiEndpoint`, `timeout` (default 10 s) and the auth settings.
 * @returns {Promise<{ok: boolean, status?: number, message?: string}>}
 */
async function testOcrProviderConnection(providerConfig) {
  try {
    const headers = buildAuthHeaders(providerConfig);
    const requestOptions = {
      headers,
      timeout: providerConfig.timeout ?? 1e4,
      // Accept every status so 4xx responses can be shown in the UI instead of throwing.
      validateStatus: () => true
    };
    const { status } = await import_axios.default.get(providerConfig.apiEndpoint, requestOptions);
    const reachable = status < 500;
    return {
      ok: reachable,
      status,
      message: reachable ? undefined : `Server error: ${status}`
    };
  } catch (err) {
    return { ok: false, message: err?.message ?? String(err) };
  }
}
185
+ // Annotate the CommonJS export names for ESM import in node:
186
+ 0 && (module.exports = {
187
+ callExternalOcr,
188
+ testOcrProviderConnection
189
+ });
@@ -0,0 +1,82 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __defProp = Object.defineProperty;
11
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
12
+ var __getOwnPropNames = Object.getOwnPropertyNames;
13
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
14
+ var __export = (target, all) => {
15
+ for (var name in all)
16
+ __defProp(target, name, { get: all[name], enumerable: true });
17
+ };
18
+ var __copyProps = (to, from, except, desc) => {
19
+ if (from && typeof from === "object" || typeof from === "function") {
20
+ for (let key of __getOwnPropNames(from))
21
+ if (!__hasOwnProp.call(to, key) && key !== except)
22
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
23
+ }
24
+ return to;
25
+ };
26
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
27
+ var internal_parser_registry_exports = {};
28
+ __export(internal_parser_registry_exports, {
29
+ InternalParserRegistry: () => InternalParserRegistry
30
+ });
31
+ module.exports = __toCommonJS(internal_parser_registry_exports);
32
/**
 * Ordered collection of in-process attachment handlers.
 *
 * A handler is an object with a unique `name`, a `supports(attachment)`
 * predicate and an async `parse(attachment, ctx)` method that resolves to
 * `{ text, handled }`.
 */
class InternalParserRegistry {
  /** Registered handlers, in the order `parse` tries them. */
  handlers = [];

  /**
   * Register a new handler.
   * Pass `{ prepend: true }` to insert at the front so it takes priority
   * over previously registered handlers (useful for specialised format plugins).
   *
   * @param {{name: string, supports: Function, parse: Function}} handler
   * @param {{prepend?: boolean}} [options]
   * @throws {Error} When a handler with the same name is already registered.
   */
  register(handler, options) {
    if (this.handlers.some((existing) => existing.name === handler.name)) {
      throw new Error(`[DocumentParser] Handler "${handler.name}" is already registered.`);
    }
    if (options?.prepend) {
      this.handlers.unshift(handler);
    } else {
      this.handlers.push(handler);
    }
  }

  /** Remove a previously registered handler by name (no-op when unknown). */
  unregister(name) {
    this.handlers = this.handlers.filter((handler) => handler.name !== name);
  }

  /** Return a copy of the current handler list (for introspection / tests). */
  list() {
    return [...this.handlers];
  }

  /**
   * Try handlers in order and return the first result marked `handled`.
   *
   * Handlers whose `supports` returns falsy are skipped. A handler that
   * resolves to nothing (`undefined`/`null`) or to `handled: false` is treated
   * as "did not handle" and the next handler is tried. An error thrown by a
   * handler is not caught here and propagates to the caller.
   *
   * @returns {Promise<{text: string, handled: boolean}>}
   *   `{ text: '', handled: false }` when no handler handled the file.
   */
  async parse(attachment, ctx) {
    for (const handler of this.handlers) {
      if (!handler.supports(attachment)) {
        continue;
      }
      const result = await handler.parse(attachment, ctx);
      // Optional chaining: tolerate handlers that return nothing at all.
      if (result?.handled) {
        return result;
      }
    }
    return { text: "", handled: false };
  }

  /** Number of registered handlers. */
  get size() {
    return this.handlers.length;
  }
}
79
+ // Annotate the CommonJS export names for ESM import in node:
80
+ 0 && (module.exports = {
81
+ InternalParserRegistry
82
+ });
@@ -0,0 +1,273 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __defProp = Object.defineProperty;
11
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
12
+ var __getOwnPropNames = Object.getOwnPropertyNames;
13
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
14
+ var __export = (target, all) => {
15
+ for (var name in all)
16
+ __defProp(target, name, { get: all[name], enumerable: true });
17
+ };
18
+ var __copyProps = (to, from, except, desc) => {
19
+ if (from && typeof from === "object" || typeof from === "function") {
20
+ for (let key of __getOwnPropNames(from))
21
+ if (!__hasOwnProp.call(to, key) && key !== except)
22
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
23
+ }
24
+ return to;
25
+ };
26
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
27
+ var parse_router_exports = {};
28
+ __export(parse_router_exports, {
29
+ ParseRouter: () => ParseRouter
30
+ });
31
+ module.exports = __toCommonJS(parse_router_exports);
32
+ var import_os = require("os");
33
+ var import_path = require("path");
34
+ var import_promises = require("fs/promises");
35
+ var import_external_ocr_client = require("./external-ocr-client");
36
/**
 * Decides how an attachment is turned into model-readable content, based on
 * the stored plugin settings: the caller-supplied default parser, the
 * internal handler registry, an external OCR provider, or DocPixie indexing.
 */
class ParseRouter {
  /**
   * @param {Function} getSettingsRepo - Returns the settings repository (`findOne`, `create` are used).
   * @param {Function} getProvidersRepo - Returns the providers repository (`findById` is used).
   * @param {object} internalRegistry - Registry whose `parse(attachment, ctx)` is used in internal mode.
   * @param {Function} getFileBuffer - `(ctx, attachment) => Promise<{ buffer, url }>`.
   * @param {Function} [getDocpixiePlugin] - Returns the DocPixie plugin instance, or null when unavailable.
   */
  constructor(getSettingsRepo, getProvidersRepo, internalRegistry, getFileBuffer, getDocpixiePlugin = () => null) {
    this.getSettingsRepo = getSettingsRepo;
    this.getProvidersRepo = getProvidersRepo;
    this.internalRegistry = internalRegistry;
    this.getFileBuffer = getFileBuffer;
    this.getDocpixiePlugin = getDocpixiePlugin;
  }

  // ── Settings cache (invalidated every N ms) ───────────────────────────────
  cachedSettings = null;
  settingsCachedAt = 0;
  CACHE_TTL_MS = 5e3;

  /**
   * Load the settings record (creating it with defaults when none exists) and
   * cache the normalised values for `CACHE_TTL_MS` milliseconds.
   */
  async getSettings() {
    const now = Date.now();
    if (this.cachedSettings && now - this.settingsCachedAt < this.CACHE_TTL_MS) {
      return this.cachedSettings;
    }
    const repo = this.getSettingsRepo();
    let record = await repo.findOne({});
    if (!record) {
      record = await repo.create({
        values: {
          mode: "default",
          fallbackToDefault: true,
          imagePassThrough: true,
          includedExtnames: []
        }
      });
    }
    this.cachedSettings = {
      mode: record.get("mode") ?? "default",
      activeProviderId: record.get("activeProviderId") ?? null,
      fallbackToDefault: record.get("fallbackToDefault") ?? true,
      imagePassThrough: record.get("imagePassThrough") ?? true,
      includedExtnames: record.get("includedExtnames") ?? [],
      useDocpixie: record.get("useDocpixie") ?? false
    };
    this.settingsCachedAt = now;
    return this.cachedSettings;
  }

  /** Call after saving settings so the next request reads fresh values */
  invalidateSettingsCache() {
    this.cachedSettings = null;
  }

  // ── Main entry point ──────────────────────────────────────────────────────
  /**
   * Route one attachment.
   *
   * Order of decisions: image pass-through → extension allow-list → DocPixie
   * (when enabled and it produced a result) → configured mode.
   *
   * @param {object} ctx - Request context; `ctx.log` is used for logging when present.
   * @param {object} attachment - Reads `filename`/`name`, `mimetype`, `extname`.
   * @param {Function} defaultParser - Zero-argument function producing the default result.
   */
  async route(ctx, attachment, defaultParser) {
    const settings = await this.getSettings();
    if (settings.imagePassThrough && attachment.mimetype?.startsWith("image/")) {
      return defaultParser();
    }
    if (settings.includedExtnames.length > 0) {
      // Normalise both sides: the resolved extension is lower-cased, so a
      // configured ".PDF" or "pdf" would otherwise never match.
      const allowed = settings.includedExtnames.map((entry) => this.normalizeExtname(entry));
      const ext = this.normalizeExtname(resolveExtname(attachment));
      if (!allowed.includes(ext)) {
        return defaultParser();
      }
    }
    if (settings.useDocpixie) {
      const docpixieResult = await this.routeDocpixie(ctx, attachment);
      if (docpixieResult) return docpixieResult;
    }
    switch (settings.mode) {
      case "default":
        return defaultParser();
      case "internal":
        return this.routeInternal(ctx, attachment, settings, defaultParser);
      case "external":
        return this.routeExternal(ctx, attachment, settings, defaultParser);
      default:
        return defaultParser();
    }
  }

  // ── DocPixie routing ─────────────────────────────────────────────────────
  /**
   * Index the attachment into DocPixie and return a metadata reference block.
   *
   * Strategy:
   *  1. Get the DocPixie plugin instance (returns null if not loaded/active)
   *  2. Download the file buffer and write it to a temp file
   *  3. Call `service.processDocument()` and take the returned document id
   *  4. Build a content block that points the model at the indexed document
   *  5. Clean up the temp file
   *
   * Returns null if DocPixie is unavailable, not ready, or indexing failed,
   * so the caller can fall through to the configured mode.
   */
  async routeDocpixie(ctx, attachment) {
    const service = this.getDocpixiePlugin()?.service;
    if (!service) {
      return null;
    }
    if (!service.isReady()) {
      ctx.log?.warn?.("[DocumentParser] DocPixie service is not ready (not configured) \u2014 skipping");
      return null;
    }
    const filename = this.attachmentLabel(attachment, "document");
    const mimetype = attachment.mimetype ?? "application/octet-stream";
    let tempPath = null;
    try {
      const { buffer } = await this.getFileBuffer(ctx, attachment);
      // The extension comes from attachment metadata; never let it carry path separators.
      const ext = this.safeTempExtension(resolveExtname(attachment));
      const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
      tempPath = import_path.join(import_os.tmpdir(), `docparser-${uniqueSuffix}${ext}`);
      await import_promises.writeFile(tempPath, buffer);
      const documentId = await service.processDocument(tempPath, { name: filename });
      ctx.log?.info?.(`[DocumentParser] DocPixie indexing started: documentId=${documentId} file="${filename}"`);
      return docpixieReferenceBlock(documentId, filename, mimetype);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] DocPixie indexing failed for "${filename}": ${err}`);
      return null;
    } finally {
      if (tempPath) {
        // Best-effort cleanup; a failed unlink must not affect the result.
        // NOTE(review): if processDocument keeps reading the file after it
        // resolves, this delete races with that work — confirm with DocPixie.
        import_promises.unlink(tempPath).catch(() => {});
      }
    }
  }

  // ── Internal routing ──────────────────────────────────────────────────────
  /**
   * Parse through the internal registry. When nothing handled the file, or a
   * handler threw, fall back to the default parser if `fallbackToDefault` is
   * set; otherwise return the "no parser" block or rethrow respectively.
   */
  async routeInternal(ctx, attachment, settings, defaultParser) {
    try {
      const result = await this.internalRegistry.parse(attachment, ctx);
      if (!result.handled) {
        return settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment);
      }
      return textToContentBlock(result.text, attachment);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] internal parse failed for "${this.attachmentLabel(attachment)}": ${err}`);
      if (settings.fallbackToDefault) {
        return defaultParser();
      }
      throw err;
    }
  }

  // ── External routing ──────────────────────────────────────────────────────
  /**
   * Parse through the active external provider. A missing or disabled
   * provider, or a mimetype outside the provider's `supportedMimetypes`,
   * yields the default parser or the "no parser" block depending on
   * `fallbackToDefault`. A failed OCR call falls back or rethrows.
   */
  async routeExternal(ctx, attachment, settings, defaultParser) {
    const fallback = () => (settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment));
    if (!settings.activeProviderId) {
      ctx.log?.warn?.("[DocumentParser] mode=external but no activeProviderId configured");
      return fallback();
    }
    const providerRecord = await this.getProvidersRepo().findById(settings.activeProviderId);
    if (!providerRecord || !providerRecord.get("enabled")) {
      ctx.log?.warn?.(`[DocumentParser] External provider ${settings.activeProviderId} not found or disabled`);
      return fallback();
    }
    const providerConfig = this.recordToProviderConfig(providerRecord);
    const supportedMimetypes = providerConfig.supportedMimetypes ?? [];
    // An empty list means "no restriction"; an unknown mimetype is let through.
    if (supportedMimetypes.length > 0 && attachment.mimetype && !supportedMimetypes.includes(attachment.mimetype)) {
      return fallback();
    }
    try {
      const { buffer, url } = await this.getFileBuffer(ctx, attachment);
      const text = await import_external_ocr_client.callExternalOcr(providerConfig, {
        fileBuffer: buffer,
        filename: this.attachmentLabel(attachment),
        mimetype: attachment.mimetype ?? "application/octet-stream",
        fileUrl: url
      });
      return textToContentBlock(text, attachment);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] external OCR failed for "${this.attachmentLabel(attachment)}": ${err}`);
      if (settings.fallbackToDefault) {
        return defaultParser();
      }
      throw err;
    }
  }

  // ── Helpers ───────────────────────────────────────────────────────────────
  /** Flatten a provider record into the plain config object used by the OCR client. */
  recordToProviderConfig(record) {
    return {
      apiEndpoint: record.get("apiEndpoint"),
      authType: record.get("authType"),
      apiKey: record.get("apiKey"),
      authConfig: record.get("authConfig") ?? {},
      requestFormat: record.get("requestFormat") ?? "multipart",
      requestConfig: record.get("requestConfig") ?? {},
      responseTextPath: record.get("responseTextPath") ?? "text",
      timeout: record.get("timeout") ?? 6e4,
      supportedMimetypes: record.get("supportedMimetypes") ?? []
    };
  }

  /** Content block used when no parser can handle the attachment. */
  unsupportedResult(attachment) {
    return {
      placement: "contentBlocks",
      content: {
        type: "text",
        text: `[Attachment: ${this.attachmentLabel(attachment)} \u2014 no parser available]`
      }
    };
  }

  /** Human-readable attachment name: `filename`, else `name`, else `fallback`. */
  attachmentLabel(attachment, fallback = "file") {
    return attachment.filename ?? attachment.name ?? fallback;
  }

  /** Trim, lower-case and dot-prefix an extension; empty input stays empty. */
  normalizeExtname(value) {
    const ext = String(value ?? "").trim().toLowerCase();
    return ext !== "" && !ext.startsWith(".") ? `.${ext}` : ext;
  }

  /**
   * Extension that is safe to append to a temp file name: only letters,
   * digits, dot, underscore and dash are accepted; anything else (including
   * path separators) is replaced by ".bin".
   */
  safeTempExtension(ext) {
    const normalized = this.normalizeExtname(ext);
    return /^\.[a-z0-9._-]{1,32}$/.test(normalized) ? normalized : ".bin";
  }
}
230
/**
 * Determine the lower-cased extension of an attachment.
 *
 * Uses `attachment.extname` when it is set; otherwise takes everything from
 * the last "." of `filename` (or `name`) onwards.
 *
 * @param {{extname?: string, filename?: string, name?: string}} attachment
 * @returns {string} Extension such as ".pdf", or "" when the name has no dot.
 */
function resolveExtname(attachment) {
  const { extname } = attachment;
  if (extname) {
    return extname.toLowerCase();
  }
  const fileName = attachment.filename ?? attachment.name ?? "";
  const dotAt = fileName.lastIndexOf(".");
  if (dotAt < 0) {
    return "";
  }
  return fileName.slice(dotAt).toLowerCase();
}
236
/**
 * Build the content block that tells the model a document was handed to
 * DocPixie and must be read through the `docpixie:query` tool.
 *
 * `filename`, `mimetype` and `documentId` are escaped before being placed in
 * the tag attributes, so a quote or angle bracket in a user-supplied file
 * name cannot break out of the attribute. String document ids are quoted in
 * the JSON usage examples; numeric ids are rendered as before.
 *
 * @param {string|number} documentId - Id returned by DocPixie.
 * @param {string} filename - Display name of the attachment.
 * @param {string} mimetype - Attachment mimetype.
 * @returns {{placement: string, content: {type: string, text: string}}}
 */
function docpixieReferenceBlock(documentId, filename, mimetype) {
  const escapeAttr = (value) =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/"/g, "&quot;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");
  // Keep the examples valid JSON when the id is a string (e.g. a UUID).
  const idLiteral = typeof documentId === "string" ? JSON.stringify(documentId) : String(documentId);
  const text = [
    `<document_indexed filename="${escapeAttr(filename)}" type="${escapeAttr(mimetype)}" docpixie_id="${escapeAttr(documentId)}">`,
    `This document has been submitted to DocPixie for deep indexing (Document ID: ${documentId}).`,
    ``,
    `IMPORTANT: Do NOT attempt to read the raw file content inline.`,
    `Instead, use the \`docpixie:query\` tool to retrieve information from this document.`,
    ``,
    `Usage examples:`,
    ` - Summarize: docpixie:query { "query": "summarize this document", "documentIds": [${idLiteral}] }`,
    ` - Find info: docpixie:query { "query": "<user question>", "documentIds": [${idLiteral}] }`,
    ``,
    `Note: Indexing runs in the background. If you query immediately and get no results,`,
    `wait a moment and retry \u2014 complex documents (PDF with many pages) take longer to index.`,
    `</document_indexed>`
  ].join("\n");
  return {
    placement: "contentBlocks",
    content: { type: "text", text }
  };
}
257
/**
 * Wrap extracted text in a `<document>` element for the model.
 *
 * The file name and mimetype are escaped before being placed in the tag
 * attributes, so a quote or angle bracket in a user-supplied file name cannot
 * break out of the attribute. The extracted text itself is inserted unchanged.
 *
 * @param {string} text - Extracted document text.
 * @param {{filename?: string, name?: string, mimetype?: string}} attachment
 * @returns {{placement: string, content: {type: string, text: string}}}
 */
function textToContentBlock(text, attachment) {
  const escapeAttr = (value) =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/"/g, "&quot;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");
  const filename = attachment.filename ?? attachment.name ?? "document";
  const mimetype = attachment.mimetype ?? "";
  const openingTag = `<document filename="${escapeAttr(filename)}" type="${escapeAttr(mimetype)}">`;
  return {
    placement: "contentBlocks",
    content: {
      type: "text",
      text: `${openingTag}\n${text}\n</document>`
    }
  };
}
270
+ // Annotate the CommonJS export names for ESM import in node:
271
+ 0 && (module.exports = {
272
+ ParseRouter
273
+ });
package/package.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "name": "plugin-document-parser",
3
+ "displayName": "Document Parser",
4
+ "displayName.zh-CN": "文档解析器",
5
+ "description": "Intercept and route AI chat file attachments through configurable internal or external OCR/parse providers.",
6
+ "version": "1.0.1",
7
+ "main": "dist/server/index.js",
8
+ "files": [
9
+ "dist",
10
+ "client.js",
11
+ "client.d.ts",
12
+ "server.js",
13
+ "server.d.ts",
14
+ "src"
15
+ ],
16
+ "nocobase": {
17
+ "supportedVersions": ["2.x"]
18
+ },
19
+ "peerDependencies": {
20
+ "@nocobase/client": "2.x",
21
+ "@nocobase/server": "2.x",
22
+ "@nocobase/database": "2.x",
23
+ "@nocobase/plugin-ai": "2.x",
24
+ "@nocobase/plugin-file-manager": "2.x",
25
+ "@nocobase/test": "2.x"
26
+ },
27
+ "dependencies": {
28
+ "axios": "^1.6.0",
29
+ "form-data": "^4.0.0"
30
+ },
31
+ "keywords": ["AI", "document", "parser", "OCR"],
32
+ "license": "Apache-2.0"
33
+ }
package/server.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ export * from './dist/server';
2
+ export { default } from './dist/server';
package/server.js ADDED
@@ -0,0 +1 @@
1
+ module.exports = require('./dist/server/index.js');