plugin-document-parser 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/client.d.ts +2 -0
  2. package/client.js +1 -0
  3. package/dist/client/01b8a5798a872638.js +10 -0
  4. package/dist/client/022be20abc96fdb4.js +10 -0
  5. package/dist/client/12e97e7a84d900e0.js +10 -0
  6. package/dist/client/index.js +10 -0
  7. package/dist/externalVersion.js +20 -0
  8. package/dist/index.js +48 -0
  9. package/dist/locale/en-US.json +54 -0
  10. package/dist/locale/vi-VN.json +54 -0
  11. package/dist/node_modules/form-data/License +19 -0
  12. package/dist/node_modules/form-data/index.d.ts +62 -0
  13. package/dist/node_modules/form-data/lib/browser.js +4 -0
  14. package/dist/node_modules/form-data/lib/form_data.js +14 -0
  15. package/dist/node_modules/form-data/lib/populate.js +10 -0
  16. package/dist/node_modules/form-data/package.json +1 -0
  17. package/dist/server/collections/doc-parser-providers.js +137 -0
  18. package/dist/server/collections/doc-parser-settings.js +85 -0
  19. package/dist/server/index.js +51 -0
  20. package/dist/server/plugin.js +181 -0
  21. package/dist/server/resource/docParserProviders.js +91 -0
  22. package/dist/server/services/builtin-ai-handler.js +63 -0
  23. package/dist/server/services/external-ocr-client.js +189 -0
  24. package/dist/server/services/internal-parser-registry.js +82 -0
  25. package/dist/server/services/parse-router.js +273 -0
  26. package/package.json +33 -0
  27. package/server.d.ts +2 -0
  28. package/server.js +1 -0
  29. package/src/client/components/GlobalSettings.tsx +151 -0
  30. package/src/client/components/ProviderForm.tsx +266 -0
  31. package/src/client/components/ProviderList.tsx +193 -0
  32. package/src/client/components/SettingsPage.tsx +43 -0
  33. package/src/client/index.tsx +2 -0
  34. package/src/client/locale.ts +12 -0
  35. package/src/client/plugin.tsx +34 -0
  36. package/src/index.ts +2 -0
  37. package/src/locale/en-US.json +54 -0
  38. package/src/locale/vi-VN.json +54 -0
  39. package/src/server/collections/doc-parser-providers.ts +107 -0
  40. package/src/server/collections/doc-parser-settings.ts +59 -0
  41. package/src/server/index.ts +10 -0
  42. package/src/server/plugin.ts +172 -0
  43. package/src/server/resource/docParserProviders.ts +72 -0
  44. package/src/server/services/builtin-ai-handler.ts +49 -0
  45. package/src/server/services/external-ocr-client.ts +233 -0
  46. package/src/server/services/internal-parser-registry.ts +126 -0
  47. package/src/server/services/parse-router.ts +357 -0
@@ -0,0 +1,357 @@
1
+ import { Context } from '@nocobase/actions';
2
+ import { tmpdir } from 'os';
3
+ import { join } from 'path';
4
+ import { writeFile, unlink } from 'fs/promises';
5
+ import type { Repository } from '@nocobase/database';
6
+ import { callExternalOcr, OcrProviderConfig } from './external-ocr-client';
7
+ import type { InternalParserRegistry, AttachmentLike } from './internal-parser-registry';
8
+
9
/** Outcome of parsing one attachment, in the shape handed back to the caller of `ParseRouter.route`. */
export type ParsedAttachmentResult = {
  /** Slot the content is placed in; the helpers in this file always use `'contentBlocks'`. */
  placement: string;
  /** Payload for that slot; the helpers in this file emit `{ type: 'text', text }` objects. */
  content: any;
};

/** Zero-argument thunk that runs the original (default) parser for the current attachment. */
export type DefaultParserFn = () => Promise<ParsedAttachmentResult>;

/** The routing strategies a settings record can select. */
type RoutingMode = 'default' | 'internal' | 'external';

/** Snapshot of the plugin settings record as consumed by the router. */
type Settings = {
  /** Which parser family handles attachments that reach mode routing. */
  mode: RoutingMode;
  /** Primary key of the external provider record used when `mode` is `'external'`. */
  activeProviderId?: number | string | null;
  /** When true, the default parser is used if the selected parser fails or cannot handle the file. */
  fallbackToDefault: boolean;
  /** When true, attachments whose MIME type starts with `image/` always go to the default parser. */
  imagePassThrough: boolean;
  /** Extension whitelist; when non-empty, files with other extensions go to the default parser. */
  includedExtnames: string[];
  /** When true, DocPixie indexing is attempted before mode routing. */
  useDocpixie: boolean;
};
24
+
25
/**
 * Decides how each attachment is parsed, based on the stored plugin settings.
 *
 * Routing order (see `route`):
 *  1. `imagePassThrough` is on and the MIME type starts with `image/` → default parser
 *  2. `includedExtnames` is non-empty and the file's extension is not listed → default parser
 *  3. `useDocpixie` is on → try DocPixie indexing; on success return a reference block,
 *     otherwise continue with step 4
 *  4. Dispatch on `mode`:
 *     - default  → the caller-supplied default parser
 *     - internal → `InternalParserRegistry`
 *     - external → the configured external OCR provider
 *  5. If the internal/external path fails and `fallbackToDefault` is on, the default
 *     parser is used instead; with the flag off the error is rethrown.
 */
export class ParseRouter {
  /**
   * @param getSettingsRepo   Returns the repository holding the single settings record.
   * @param getProvidersRepo  Returns the repository holding external provider records.
   * @param internalRegistry  Registry consulted when `mode` is `'internal'`.
   * @param getFileBuffer     Loads the attachment bytes and its URL.
   * @param getDocpixiePlugin Returns the DocPixie plugin instance, or null when unavailable.
   */
  constructor(
    private readonly getSettingsRepo: () => Repository,
    private readonly getProvidersRepo: () => Repository,
    private readonly internalRegistry: InternalParserRegistry,
    private readonly getFileBuffer: (ctx: Context, attachment: AttachmentLike) => Promise<{ buffer: Buffer; url: string }>,
    /** Returns the docpixie plugin instance if it is loaded & active, else null */
    private readonly getDocpixiePlugin: () => any | null = () => null,
  ) {}

  // ── Settings cache (invalidated every N ms) ───────────────────────────────

  private cachedSettings: Settings | null = null;
  private settingsCachedAt = 0;
  private readonly CACHE_TTL_MS = 5_000;
  /** Load currently in flight, shared by concurrent callers so the repository is hit once. */
  private pendingSettingsLoad: Promise<Settings> | null = null;
  /** Bumped on every invalidation so a load started earlier cannot repopulate the cache. */
  private settingsEpoch = 0;

  /**
   * Returns the current settings, served from the cache while it is younger than
   * `CACHE_TTL_MS`. Concurrent callers share one repository read, which also keeps the
   * "auto-create defaults" step from inserting several rows on first access.
   */
  private async getSettings(): Promise<Settings> {
    const cached = this.cachedSettings;
    if (cached && Date.now() - this.settingsCachedAt < this.CACHE_TTL_MS) {
      return cached;
    }
    if (this.pendingSettingsLoad) {
      return this.pendingSettingsLoad;
    }

    const epoch = this.settingsEpoch;
    const load = (async (): Promise<Settings> => {
      const fresh = await this.loadSettings();
      // Skip caching when invalidateSettingsCache() ran while the read was in flight.
      if (epoch === this.settingsEpoch) {
        this.cachedSettings = fresh;
        this.settingsCachedAt = Date.now();
      }
      return fresh;
    })();
    this.pendingSettingsLoad = load;

    try {
      return await load;
    } finally {
      if (this.pendingSettingsLoad === load) {
        this.pendingSettingsLoad = null;
      }
    }
  }

  /** Reads the settings record (creating it with defaults when absent) and maps it to `Settings`. */
  private async loadSettings(): Promise<Settings> {
    const repo = this.getSettingsRepo();
    let record = await repo.findOne({});
    if (!record) {
      // Auto-create defaults on first access
      record = await repo.create({
        values: {
          mode: 'default',
          fallbackToDefault: true,
          imagePassThrough: true,
          includedExtnames: [],
        },
      });
    }

    // Tolerate a missing or malformed whitelist, and normalise entries so "PDF", "pdf"
    // and ".pdf" all match the lower-cased, dot-prefixed extension of an attachment.
    const storedExtnames: unknown = record.get('includedExtnames');
    const includedExtnames = Array.isArray(storedExtnames)
      ? storedExtnames
          .filter((entry): entry is string => typeof entry === 'string')
          .map((entry) => ParseRouter.normalizeExtname(entry))
      : [];

    return {
      mode: record.get('mode') ?? 'default',
      activeProviderId: record.get('activeProviderId') ?? null,
      fallbackToDefault: record.get('fallbackToDefault') ?? true,
      imagePassThrough: record.get('imagePassThrough') ?? true,
      includedExtnames,
      useDocpixie: record.get('useDocpixie') ?? false,
    };
  }

  /** Call after saving settings so the next request reads fresh values */
  invalidateSettingsCache(): void {
    this.cachedSettings = null;
    this.pendingSettingsLoad = null;
    this.settingsEpoch += 1;
  }

  // ── Main entry point ──────────────────────────────────────────────────────

  /**
   * Parses one attachment according to the current settings.
   *
   * @param ctx           Request context; used for logging and passed to the parsers.
   * @param attachment    The attachment to parse.
   * @param defaultParser Thunk that runs the original parser for this attachment.
   * @returns The parsed result from whichever parser handled the attachment.
   * @throws Rethrows internal/external parser errors when `fallbackToDefault` is off.
   */
  async route(
    ctx: Context,
    attachment: AttachmentLike,
    defaultParser: DefaultParserFn,
  ): Promise<ParsedAttachmentResult> {
    const settings = await this.getSettings();

    // 1. Always pass images through if configured
    if (settings.imagePassThrough && attachment.mimetype?.startsWith('image/')) {
      return defaultParser();
    }

    // 2. If an explicit extension whitelist is set and this file isn't in it → default
    if (settings.includedExtnames.length > 0) {
      const ext = ParseRouter.normalizeExtname(resolveExtname(attachment));
      if (!settings.includedExtnames.includes(ext)) {
        return defaultParser();
      }
    }

    // 3. DocPixie indexing — runs BEFORE normal mode routing when enabled.
    //    Returns a metadata reference block so the LLM uses the docpixie:query tool
    //    instead of reading raw file content.
    if (settings.useDocpixie) {
      const docpixieResult = await this.routeDocpixie(ctx, attachment);
      if (docpixieResult) return docpixieResult;
      // docpixie unavailable/failed — fall through to normal routing
    }

    // 4. Route based on mode
    switch (settings.mode) {
      case 'internal':
        return this.routeInternal(ctx, attachment, settings, defaultParser);

      case 'external':
        return this.routeExternal(ctx, attachment, settings, defaultParser);

      case 'default':
      default:
        // Unknown stored modes are treated like 'default'.
        return defaultParser();
    }
  }

  // ── DocPixie routing ─────────────────────────────────────────────────────

  /**
   * Index the attachment into DocPixie and return a metadata reference block.
   *
   * Steps:
   *  1. Get the DocPixie plugin instance; bail out when it or its service is missing
   *     or the service reports it is not ready.
   *  2. Load the file bytes and write them to a temp file.
   *  3. Call `service.processDocument(path, { name })`, which resolves to a document id.
   *  4. Build the reference block for that id.
   *  5. Remove the temp file (best-effort, not awaited).
   *
   * @returns The reference block, or null when DocPixie is unavailable or indexing
   *          failed, so the caller can fall through to normal routing.
   */
  private async routeDocpixie(
    ctx: Context,
    attachment: AttachmentLike,
  ): Promise<ParsedAttachmentResult | null> {
    const service = this.getDocpixiePlugin()?.service;
    if (!service) {
      return null;
    }
    if (!service.isReady()) {
      ctx.log?.warn?.('[DocumentParser] DocPixie service is not ready (not configured) — skipping');
      return null;
    }

    const filename = attachment.filename ?? attachment.name ?? 'document';
    const mimetype = attachment.mimetype ?? 'application/octet-stream';
    let tempPath: string | null = null;

    try {
      const { buffer } = await this.getFileBuffer(ctx, attachment);

      // The extension originates from the uploaded file's metadata, so it is restricted
      // to a plain ".ext" form before being spliced into a filesystem path; anything
      // else (path separators, "..", missing dot) becomes ".bin".
      const ext = ParseRouter.tempFileExtension(resolveExtname(attachment));
      tempPath = join(tmpdir(), `docparser-${Date.now()}-${Math.random().toString(36).slice(2)}${ext}`);
      await writeFile(tempPath, buffer);

      const documentId: number = await service.processDocument(tempPath, { name: filename });

      ctx.log?.info?.(`[DocumentParser] DocPixie indexing started: documentId=${documentId} file="${filename}"`);

      return docpixieReferenceBlock(documentId, filename, mimetype);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] DocPixie indexing failed for "${filename}": ${err}`);
      return null; // fall through to normal routing
    } finally {
      // Best-effort cleanup; failures to delete are deliberately ignored.
      // NOTE(review): the file is removed as soon as processDocument() settles. If
      // DocPixie's background ingestion re-reads the path afterwards this would race —
      // confirm against the DocPixie service.
      if (tempPath) {
        unlink(tempPath).catch(() => {});
      }
    }
  }

  // ── Internal routing ──────────────────────────────────────────────────────

  /**
   * Runs the attachment through the internal parser registry.
   *
   * When no handler claims the file, returns the default parser's result or, with
   * fallback off, a "no parser available" placeholder. When the registry throws,
   * falls back to the default parser or rethrows depending on `fallbackToDefault`.
   */
  private async routeInternal(
    ctx: Context,
    attachment: AttachmentLike,
    settings: Settings,
    defaultParser: DefaultParserFn,
  ): Promise<ParsedAttachmentResult> {
    try {
      const result = await this.internalRegistry.parse(attachment, ctx);

      if (!result.handled) {
        // No handler claimed this file type; fall through
        return settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment);
      }

      return textToContentBlock(result.text, attachment);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] internal parse failed for "${ParseRouter.displayName(attachment)}": ${err}`);
      if (settings.fallbackToDefault) {
        return defaultParser();
      }
      throw err;
    }
  }

  // ── External routing ──────────────────────────────────────────────────────

  /**
   * Sends the attachment to the active external OCR provider.
   *
   * A missing/disabled provider or an unsupported MIME type yields the default parser's
   * result, or a "no parser available" placeholder when fallback is off. Any thrown
   * error — including a failing provider lookup — falls back to the default parser or
   * is rethrown depending on `fallbackToDefault`.
   */
  private async routeExternal(
    ctx: Context,
    attachment: AttachmentLike,
    settings: Settings,
    defaultParser: DefaultParserFn,
  ): Promise<ParsedAttachmentResult> {
    if (!settings.activeProviderId) {
      ctx.log?.warn?.('[DocumentParser] mode=external but no activeProviderId configured');
      return settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment);
    }

    try {
      // The lookup lives inside the try so a repository error honours fallbackToDefault.
      const providerRecord = await this.getProvidersRepo().findById(settings.activeProviderId);
      if (!providerRecord || !providerRecord.get('enabled')) {
        ctx.log?.warn?.(`[DocumentParser] External provider ${settings.activeProviderId} not found or disabled`);
        return settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment);
      }

      const providerConfig = this.recordToProviderConfig(providerRecord);

      // Check MIME type scope if the provider declares one
      const supportedMimetypes = providerConfig.supportedMimetypes ?? [];
      if (supportedMimetypes.length > 0 && attachment.mimetype && !supportedMimetypes.includes(attachment.mimetype)) {
        // This provider doesn't handle this MIME type — fall back
        return settings.fallbackToDefault ? defaultParser() : this.unsupportedResult(attachment);
      }

      const { buffer, url } = await this.getFileBuffer(ctx, attachment);

      const text = await callExternalOcr(providerConfig, {
        fileBuffer: buffer,
        filename: attachment.filename ?? attachment.name ?? 'file',
        mimetype: attachment.mimetype ?? 'application/octet-stream',
        fileUrl: url,
      });

      return textToContentBlock(text, attachment);
    } catch (err) {
      ctx.log?.warn?.(`[DocumentParser] external OCR failed for "${ParseRouter.displayName(attachment)}": ${err}`);
      if (settings.fallbackToDefault) {
        return defaultParser();
      }
      throw err;
    }
  }

  // ── Helpers ───────────────────────────────────────────────────────────────

  /** Maps a provider record to the OCR client configuration, applying defaults for unset fields. */
  private recordToProviderConfig(record: any): OcrProviderConfig & { supportedMimetypes?: string[] } {
    return {
      apiEndpoint: record.get('apiEndpoint'),
      authType: record.get('authType'),
      apiKey: record.get('apiKey'),
      authConfig: record.get('authConfig') ?? {},
      requestFormat: record.get('requestFormat') ?? 'multipart',
      requestConfig: record.get('requestConfig') ?? {},
      responseTextPath: record.get('responseTextPath') ?? 'text',
      timeout: record.get('timeout') ?? 60000,
      supportedMimetypes: record.get('supportedMimetypes') ?? [],
    };
  }

  /** Placeholder text block returned when nothing can parse the attachment and fallback is off. */
  private unsupportedResult(attachment: AttachmentLike): ParsedAttachmentResult {
    return {
      placement: 'contentBlocks',
      content: {
        type: 'text',
        text: `[Attachment: ${ParseRouter.displayName(attachment)} — no parser available]`,
      },
    };
  }

  /** Human-readable attachment name for messages: `filename`, else `name`, else `'file'`. */
  private static displayName(attachment: AttachmentLike): string {
    return attachment.filename ?? attachment.name ?? 'file';
  }

  /** Trims and lower-cases an extension and adds the leading dot when missing; `''` stays `''`. */
  private static normalizeExtname(raw: string): string {
    const ext = raw.trim().toLowerCase();
    if (ext === '' || ext.startsWith('.')) {
      return ext;
    }
    return `.${ext}`;
  }

  /** Returns the extension when it is a plain `.alnum` suffix (max 16 chars), else `'.bin'`. */
  private static tempFileExtension(ext: string): string {
    const normalized = ParseRouter.normalizeExtname(ext);
    return /^\.[a-z0-9]{1,16}$/.test(normalized) ? normalized : '.bin';
  }
}
298
+
299
+ // ─── Pure helpers ─────────────────────────────────────────────────────────────
300
+
301
/**
 * Returns the attachment's extension, lower-cased.
 *
 * An explicit `attachment.extname` wins and is returned as given (lower-cased).
 * Otherwise the extension is taken from `filename`, falling back to `name`: it is the
 * text from the last `.` of the final path segment onward, including the dot.
 * Returns `''` when that segment contains no dot.
 */
function resolveExtname(attachment: AttachmentLike): string {
  if (attachment.extname) return attachment.extname.toLowerCase();

  const name = attachment.filename ?? attachment.name ?? '';
  // Only dots after the last path separator count; otherwise a dotted directory such
  // as "reports.v2/summary" would produce the bogus extension ".v2/summary".
  const baseStart = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\')) + 1;
  const dot = name.lastIndexOf('.');
  return dot >= baseStart ? name.slice(dot).toLowerCase() : '';
}
307
+
308
/**
 * Build a content block that tells the LLM:
 *   "This document is indexed in DocPixie — do NOT try to read it inline,
 *    use the docpixie:query tool with the given documentId instead."
 *
 * The block carries no document text. It contains:
 *   - Document metadata (name, type, DocPixie ID)
 *   - Explicit instructions for tool usage
 *   - Ready-to-use query examples
 *
 * `filename` and `mimetype` are escaped before being placed in the opening tag's
 * attributes, so quotes, angle brackets or line breaks in them cannot close the
 * attribute or the tag early.
 *
 * @param documentId DocPixie document id to reference.
 * @param filename   Display name of the attachment.
 * @param mimetype   MIME type of the attachment.
 * @returns A `contentBlocks` text block containing the reference instructions.
 */
function docpixieReferenceBlock(
  documentId: number,
  filename: string,
  mimetype: string,
): ParsedAttachmentResult {
  // `&` is replaced first so the entities produced below are not escaped twice.
  const escapeAttr = (value: string): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/"/g, '&quot;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/[\r\n]+/g, ' ');

  const text = [
    `<document_indexed filename="${escapeAttr(filename)}" type="${escapeAttr(mimetype)}" docpixie_id="${documentId}">`,
    `This document has been submitted to DocPixie for deep indexing (Document ID: ${documentId}).`,
    ``,
    `IMPORTANT: Do NOT attempt to read the raw file content inline.`,
    `Instead, use the \`docpixie:query\` tool to retrieve information from this document.`,
    ``,
    `Usage examples:`,
    `  - Summarize: docpixie:query { "query": "summarize this document", "documentIds": [${documentId}] }`,
    `  - Find info: docpixie:query { "query": "<user question>", "documentIds": [${documentId}] }`,
    ``,
    `Note: Indexing runs in the background. If you query immediately and get no results,`,
    `wait a moment and retry — complex documents (PDF with many pages) take longer to index.`,
    `</document_indexed>`,
  ].join('\n');

  return {
    placement: 'contentBlocks',
    content: { type: 'text', text },
  };
}
345
+
346
/**
 * Wrap extracted text into the `ParsedAttachmentResult` shape that plugin-ai expects.
 *
 * The text is placed between `<document …>` and `</document>` lines. The attachment's
 * name (`filename`, else `name`, else `'document'`) and MIME type go into the opening
 * tag's attributes and are escaped, so quotes, angle brackets or line breaks in them
 * cannot close the attribute or the tag early. The document text itself is emitted
 * unchanged; a missing text is rendered as an empty body.
 *
 * @param text       Extracted document text.
 * @param attachment Attachment the text was extracted from.
 * @returns A `contentBlocks` text block wrapping the extracted text.
 */
function textToContentBlock(text: string, attachment: AttachmentLike): ParsedAttachmentResult {
  // `&` is replaced first so the entities produced below are not escaped twice.
  const escapeAttr = (value: string): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/"/g, '&quot;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/[\r\n]+/g, ' ');

  const filename = escapeAttr(attachment.filename ?? attachment.name ?? 'document');
  const mimetype = escapeAttr(attachment.mimetype ?? '');
  // Guard against a parser that reports success without text, which would otherwise
  // be rendered as the literal string "undefined".
  const body = text ?? '';

  return {
    placement: 'contentBlocks',
    content: {
      type: 'text',
      text: `<document filename="${filename}" type="${mimetype}">\n${body}\n</document>`,
    },
  };
}