plugin-file-preview-auth 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/client-v2.d.ts +2 -0
  2. package/client-v2.js +1 -0
  3. package/dist/client/713.79a55458f5b67f39.js +30 -0
  4. package/dist/client/823.8b0ab22c181d4523.js +10 -0
  5. package/dist/client/828.ae8e47a2e7a3bc9e.js +49 -0
  6. package/dist/client/892.a568eb42fd6f0047.js +10 -0
  7. package/dist/client/index.js +1 -1
  8. package/dist/client-v2/index.js +10 -0
  9. package/dist/externalVersion.js +8 -7
  10. package/dist/node_modules/@aws-sdk/client-s3/dist-cjs/index.js +3086 -3725
  11. package/dist/node_modules/@aws-sdk/client-s3/node_modules/.bin/fxparser +16 -0
  12. package/dist/node_modules/@aws-sdk/client-s3/node_modules/.bin/fxparser.cmd +17 -0
  13. package/dist/node_modules/@aws-sdk/client-s3/node_modules/.bin/fxparser.ps1 +28 -0
  14. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-cjs/index.js +110 -0
  15. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-es/SignatureV4MultiRegion.js +66 -0
  16. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-es/index.js +2 -0
  17. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-es/signature-v4-crt-container.js +3 -0
  18. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/SignatureV4MultiRegion.d.ts +30 -0
  19. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/index.d.ts +5 -0
  20. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/signature-v4-crt-container.d.ts +28 -0
  21. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/ts3.4/SignatureV4MultiRegion.d.ts +40 -0
  22. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/ts3.4/index.d.ts +2 -0
  23. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/dist-types/ts3.4/signature-v4-crt-container.d.ts +20 -0
  24. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@aws-sdk/signature-v4-multi-region/package.json +57 -0
  25. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/AdaptiveRetryStrategy.js +1 -0
  26. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/ConfiguredRetryStrategy.js +1 -0
  27. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/DefaultRateLimiter.js +1 -0
  28. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/StandardRetryStrategy.js +1 -0
  29. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/config.js +1 -0
  30. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/constants.js +1 -0
  31. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/defaultRetryBackoffStrategy.js +1 -0
  32. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/defaultRetryToken.js +1 -0
  33. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/index.js +358 -0
  34. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-cjs/types.js +1 -0
  35. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/AdaptiveRetryStrategy.js +24 -0
  36. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/ConfiguredRetryStrategy.js +18 -0
  37. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/DefaultRateLimiter.js +100 -0
  38. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/StandardRetryStrategy.js +65 -0
  39. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/config.js +7 -0
  40. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/constants.js +9 -0
  41. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/defaultRetryBackoffStrategy.js +14 -0
  42. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/defaultRetryToken.js +11 -0
  43. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/index.js +7 -0
  44. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-es/types.js +1 -0
  45. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/AdaptiveRetryStrategy.d.ts +33 -0
  46. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ConfiguredRetryStrategy.d.ts +32 -0
  47. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/DefaultRateLimiter.d.ts +49 -0
  48. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/StandardRetryStrategy.d.ts +26 -0
  49. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/config.d.ts +20 -0
  50. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/constants.d.ts +59 -0
  51. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/defaultRetryBackoffStrategy.d.ts +5 -0
  52. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/defaultRetryToken.d.ts +9 -0
  53. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/index.d.ts +7 -0
  54. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/AdaptiveRetryStrategy.d.ts +33 -0
  55. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/ConfiguredRetryStrategy.d.ts +32 -0
  56. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/DefaultRateLimiter.d.ts +49 -0
  57. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/StandardRetryStrategy.d.ts +26 -0
  58. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/config.d.ts +20 -0
  59. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/constants.d.ts +59 -0
  60. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/defaultRetryBackoffStrategy.d.ts +5 -0
  61. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/defaultRetryToken.d.ts +9 -0
  62. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/index.d.ts +7 -0
  63. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/ts3.4/types.d.ts +19 -0
  64. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/dist-types/types.d.ts +19 -0
  65. package/dist/node_modules/@aws-sdk/client-s3/node_modules/@smithy/util-retry/package.json +68 -0
  66. package/dist/node_modules/@aws-sdk/client-s3/package.json +1 -1
  67. package/dist/node_modules/xlsx/package.json +1 -1
  68. package/dist/server/ocr/tesseract-runner.js +3 -1
  69. package/dist/server/plugin.js +22 -4
  70. package/package.json +57 -45
  71. package/src/client/AIFilePreviewAction.tsx +282 -0
  72. package/src/client/__tests__/ocr-utils.test.ts +85 -0
  73. package/src/client/client.d.ts +258 -0
  74. package/src/client/index.tsx +1807 -0
  75. package/src/client/locale.ts +21 -0
  76. package/src/client-v2/index.tsx +1 -0
  77. package/src/client-v2/plugin.tsx +7 -0
  78. package/{dist/index.d.ts → src/index.ts} +11 -10
  79. package/src/locale/en-US.json +14 -0
  80. package/src/locale/vi-VN.json +14 -0
  81. package/src/locale/zh-CN.json +14 -0
  82. package/src/server/__tests__/smoke.test.ts +17 -0
  83. package/src/server/collections/attachment-ocr-results.ts +40 -0
  84. package/{dist/server/collections/file-preview-auth.d.ts → src/server/collections/file-preview-auth.ts} +15 -14
  85. package/src/server/excel-parser-handler.ts +128 -0
  86. package/{dist/server/index.d.ts → src/server/index.ts} +10 -9
  87. package/src/server/migrations/20260528000000-move-ocr-fields-out-of-attachments.ts +39 -0
  88. package/src/server/ocr/tesseract-runner.ts +389 -0
  89. package/src/server/ocr/tesseract-worker.ts +235 -0
  90. package/src/server/plugin.ts +1470 -0
  91. package/dist/client/166.17caa11c2ba40313.js +0 -10
  92. package/dist/client/351.0f0ce45c92425c8f.js +0 -10
  93. package/dist/client/374.96762d13b15e7467.js +0 -30
  94. package/dist/client/514.2a8b6aa0d2fcd4b2.js +0 -49
  95. package/dist/client/AIFilePreviewAction.d.ts +0 -42
  96. package/dist/client/index.d.ts +0 -14
  97. package/dist/client/locale.d.ts +0 -10
  98. package/dist/node_modules/xlsx/node_modules/.bin/crc32 +0 -15
  99. package/dist/node_modules/xlsx/node_modules/.bin/crc32.cmd +0 -7
  100. package/dist/server/collections/attachment-ocr-results.d.ts +0 -2
  101. package/dist/server/excel-parser-handler.d.ts +0 -60
  102. package/dist/server/migrations/20260528000000-move-ocr-fields-out-of-attachments.d.ts +0 -5
  103. package/dist/server/ocr/tesseract-runner.d.ts +0 -34
  104. package/dist/server/ocr/tesseract-worker.d.ts +0 -27
  105. package/dist/server/plugin.d.ts +0 -54
@@ -0,0 +1,389 @@
1
+ import { spawn } from 'child_process';
2
+ import fs from 'fs/promises';
3
+ import { existsSync } from 'fs';
4
+ import path from 'path';
5
+ import os from 'os';
6
+
7
/**
 * Bounding box of a recognized word on a page.
 *
 * The coordinate system is identified by `unit`. The runner in this file
 * always emits `'normalized'`, where every value is a fraction of the page
 * width (x, width) or page height (y, height).
 */
export interface OcrRect {
  /** Distance of the box's left edge from the left edge of the page. */
  x: number;
  /** Distance of the box's top edge from the top edge of the page. */
  y: number;
  /** Horizontal extent of the box. */
  width: number;
  /** Vertical extent of the box. */
  height: number;
  /** Name of the coordinate unit, e.g. `'normalized'`. */
  unit: string;
}
14
+
15
/**
 * One recognized word together with its location and review state.
 */
export interface OcrWordItem {
  /** Identifier of the word within the OCR result. */
  id: string;
  /** Key under which the word is addressed (position-based for real OCR output). */
  key: string;
  /** Recognized text of the word. */
  value: string;
  /** 1-based page number the word was found on. */
  page: number;
  /** Recognition confidence scaled to the 0..1 range. */
  confidence: number;
  /** Bounding box of the word on the page. */
  rect: OcrRect;
  /** Review state of the word; the runner emits `'pending'`. */
  status: string;
}
24
+
25
/**
 * Runs the Tesseract CLI against an image or PDF and converts its TSV output
 * into word-level OCR items whose bounding boxes are normalized to the page.
 *
 * External tools used: `tesseract` (languages `eng+vie`) and, for PDFs,
 * `pdftoppm`. Both are started directly (no shell), so file paths are passed
 * verbatim as arguments.
 *
 * NOTE(review): when OCR cannot be performed, `runOcr` returns built-in mock
 * data instead of failing. The returned value carries no marker that tells
 * mock data apart from real OCR output — confirm this is acceptable outside
 * of UI testing.
 */
export class TesseractRunner {
  private log: any;

  /**
   * @param app Application object; `app.log` is used as logger, `console` when absent.
   */
  constructor(app: any) {
    this.log = app.log || console;
  }

  /**
   * Run Tesseract OCR on a file (PDF or image) and return word-level text
   * with coordinates, grouped by page.
   *
   * A PDF is first rasterized with `pdftoppm`; if that fails, the PDF itself
   * is handed to Tesseract as a last attempt. If Tesseract fails for any page,
   * or no page was produced, mock data is returned (see class note).
   *
   * @param filePath Path of the file to recognize.
   * @param attachmentId Identifier used only to label the temporary directory.
   * @returns Pages in order, each with its 1-based `page_number` and word items.
   * @throws Error when `filePath` does not exist.
   */
  async runOcr(
    filePath: string,
    attachmentId: number | string,
  ): Promise<{ pages: Array<{ page_number: number; items: OcrWordItem[] }> }> {
    if (!existsSync(filePath)) {
      throw new Error(`File not found: ${filePath}`);
    }

    const ext = path.extname(filePath).toLowerCase();

    // The id is reduced to a safe character set so it can never introduce path
    // separators; mkdtemp appends a random suffix, so concurrent runs for the
    // same attachment get distinct directories.
    const safeId = String(attachmentId).replace(/[^A-Za-z0-9_-]/g, '_');
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), `nocobase-ocr-${safeId}-`));

    try {
      let imagePaths: string[];
      if (ext === '.pdf') {
        try {
          imagePaths = await this.convertPdfToImages(filePath, tempDir);
        } catch (err: unknown) {
          this.log.warn(
            `[TesseractOCR] PDF to Image conversion failed: ${TesseractRunner.describeError(
              err,
            )}. Fallback to mock images.`,
          );
          // Treat the PDF as if it were an image and let Tesseract try it.
          imagePaths = [filePath];
        }
      } else {
        imagePaths = [filePath];
      }

      const pages: Array<{ page_number: number; items: OcrWordItem[] }> = [];
      let tesseractAvailable = true;

      for (const [index, imgPath] of imagePaths.entries()) {
        const pageNum = index + 1;
        const outBase = path.join(tempDir, `page_${pageNum}_raw`);

        try {
          await this.executeTesseract(imgPath, outBase);
          // Tesseract appends the extension of the requested output format.
          const tsvFile = `${outBase}.tsv`;
          if (!existsSync(tsvFile)) {
            throw new Error('TSV output file not found');
          }
          const tsvContent = await fs.readFile(tsvFile, 'utf-8');
          pages.push({ page_number: pageNum, items: this.parseTsv(tsvContent, pageNum) });
        } catch (err: unknown) {
          this.log.warn(
            `[TesseractOCR] Tesseract command failed for page ${pageNum}: ${TesseractRunner.describeError(err)}`,
          );
          // One failing page abandons the whole document.
          tesseractAvailable = false;
          break;
        }
      }

      // Fallback: mock data when Tesseract failed or nothing was produced.
      if (!tesseractAvailable || pages.length === 0) {
        this.log.info(
          '[TesseractOCR] Tesseract is not available on this host. Generating high-fidelity mock OCR Word data for testing.',
        );
        return this.generateMockOcrData();
      }

      return { pages };
    } finally {
      // Best-effort cleanup of the temporary directory.
      await fs.rm(tempDir, { recursive: true, force: true }).catch(() => {});
    }
  }

  /** Extract a printable message from an arbitrary thrown value. */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * Start an external program without a shell and wait for it to finish.
   *
   * stdin/stdout are not connected (nothing reads them, and an unread pipe can
   * stall a child that writes a lot); stderr is collected for the error message.
   *
   * @throws Error when the program cannot be started or exits with a non-zero code.
   */
  private runCommand(command: string, args: string[]): Promise<void> {
    return new Promise((resolve, reject) => {
      const child = spawn(command, args, {
        shell: false,
        windowsHide: true,
        stdio: ['ignore', 'ignore', 'pipe'],
      });

      let stderr = '';
      child.stderr?.on('data', (chunk: Buffer | string) => {
        stderr += chunk.toString();
      });

      // Emitted e.g. when the executable is not found on PATH.
      child.on('error', (err) => {
        reject(err);
      });

      child.on('close', (code) => {
        if (code === 0) {
          resolve();
        } else {
          reject(new Error(`${command} exited with code ${code}. Stderr: ${stderr}`));
        }
      });
    });
  }

  /**
   * Run Tesseract on one image with English + Vietnamese language data,
   * producing `<outputPathBase>.tsv`.
   */
  private executeTesseract(imagePath: string, outputPathBase: string): Promise<void> {
    return this.runCommand('tesseract', [imagePath, outputPathBase, '-l', 'eng+vie', 'tsv']);
  }

  /**
   * Convert Tesseract TSV output into word items.
   *
   * The page size is taken from the first level-1 row and used to normalize
   * each word box to fractions of the page. Only level-5 (word) rows with
   * non-empty text and a confidence above 0 are returned.
   *
   * @param tsvContent Raw TSV text; the first line is the header row.
   * @param pageNum 1-based page number stamped on every item.
   */
  private parseTsv(tsvContent: string, pageNum: number): OcrWordItem[] {
    const items: OcrWordItem[] = [];

    // Only the line terminator is removed. Trimming whole lines would also
    // strip the trailing TAB of rows whose text column is empty (page, block,
    // paragraph and line rows) and thereby lose the page-size row.
    const lines = tsvContent.split('\n').map((raw) => raw.replace(/\r$/, ''));
    if (lines.length < 2) return items;

    const headers = (lines[0] ?? '').split('\t').map((h) => h.trim());

    const rows: Array<Record<string, string>> = [];
    for (const line of lines.slice(1)) {
      if (line.trim() === '') continue;
      const cols = line.split('\t');
      const row: Record<string, string> = {};
      headers.forEach((header, idx) => {
        row[header] = cols[idx] ?? '';
      });
      rows.push(row);
    }

    // Pass 1: page dimensions from the first level-1 row; 1 avoids division by zero.
    const pageRow = rows.find((row) => parseInt(row['level'] ?? '', 10) === 1);
    const pageWidth = (pageRow && parseInt(pageRow['width'] ?? '', 10)) || 1;
    const pageHeight = (pageRow && parseInt(pageRow['height'] ?? '', 10)) || 1;

    // Pass 2: every word row becomes one item.
    for (const row of rows) {
      const level = parseInt(row['level'] ?? '', 10);
      const text = (row['text'] ?? '').trim();
      const conf = parseFloat(row['conf'] ?? '');

      // Level 5 is Tesseract's word level.
      if (level !== 5 || !text || !(conf > 0)) continue;

      const left = parseInt(row['left'] ?? '', 10);
      const top = parseInt(row['top'] ?? '', 10);
      const width = parseInt(row['width'] ?? '', 10);
      const height = parseInt(row['height'] ?? '', 10);
      // A malformed row must not leak NaN coordinates into the result.
      if (![left, top, width, height].every((n) => Number.isFinite(n))) continue;

      const blockNum = row['block_num'];
      const lineNum = row['line_num'];
      const wordNum = row['word_num'];

      items.push({
        id: `w_p${pageNum}_b${blockNum}_l${lineNum}_w${wordNum}`,
        key: `P${pageNum}_B${blockNum}_L${lineNum}_W${wordNum}`,
        value: text,
        page: pageNum,
        confidence: conf / 100,
        rect: {
          // Fractions of the page size, so a viewer can scale them to any render size.
          x: left / pageWidth,
          y: top / pageHeight,
          width: width / pageWidth,
          height: height / pageHeight,
          unit: 'normalized',
        },
        status: 'pending',
      });
    }

    return items;
  }

  /**
   * Rasterize a PDF to PNG files (150 dpi) in `outputDir` using `pdftoppm`.
   *
   * @returns Paths of the generated `page-*.png` files in page order.
   * @throws Error when `pdftoppm` cannot be started or exits with a non-zero code.
   */
  private async convertPdfToImages(pdfPath: string, outputDir: string): Promise<string[]> {
    // Equivalent to: pdftoppm -png -r 150 <pdfPath> <outputDir>/page
    await this.runCommand('pdftoppm', ['-png', '-r', '150', pdfPath, path.join(outputDir, 'page')]);

    const files = await fs.readdir(outputDir);
    return files
      .filter((f) => f.startsWith('page-') && f.endsWith('.png'))
      // Numeric-aware ordering keeps page 10 after page 9 even without zero padding.
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
      .map((f) => path.join(outputDir, f));
  }

  /**
   * Build the fixed single-page mock result used when OCR is unavailable.
   *
   * Boxes are laid out on an imaginary 800 x 1000 page and then normalized.
   * Two words deliberately have a confidence below 0.8.
   */
  private generateMockOcrData(): { pages: Array<{ page_number: number; items: OcrWordItem[] }> } {
    // Size of the imaginary page the pixel values below refer to.
    const pw = 800;
    const ph = 1000;

    // [text, confidence, x, y, width, height] in mock-page pixels.
    const words: Array<[string, number, number, number, number, number]> = [
      // Line 1: national title
      ['Cộng', 0.95, 300, 50, 45, 18],
      ['hòa', 0.97, 350, 50, 30, 18],
      ['Xã', 0.92, 388, 50, 22, 18],
      ['hội', 0.94, 415, 50, 25, 18],

      // Line 2: full name
      ['Họ', 0.99, 80, 150, 25, 16],
      ['và', 0.98, 110, 150, 18, 16],
      ['tên:', 0.99, 133, 150, 32, 16],
      ['NGUYỄN', 0.74, 180, 148, 85, 18], // low (< 80%)
      ['VĂN', 0.96, 272, 148, 42, 18],
      ['A', 0.99, 320, 148, 15, 18],

      // Line 3: payment amount
      ['Total', 0.99, 80, 220, 42, 16],
      ['Amount:', 0.97, 128, 220, 68, 16],
      ['1,500,000', 0.79, 210, 218, 95, 18], // low
      ['VND', 0.98, 312, 218, 40, 18],
    ];

    const items: OcrWordItem[] = words.map(([text, confidence, x, y, width, height], idx) => ({
      id: `mock_w${idx + 1}`,
      key: text,
      value: text,
      page: 1,
      confidence,
      rect: { x: x / pw, y: y / ph, width: width / pw, height: height / ph, unit: 'normalized' },
      status: 'pending',
    }));

    return { pages: [{ page_number: 1, items }] };
  }
}
@@ -0,0 +1,235 @@
1
+ import { TesseractRunner } from './tesseract-runner';
2
+ import path from 'path';
3
+
4
/**
 * Background worker that runs OCR jobs for attachments one at a time and
 * stores the outcome in the `attachmentOcrResults` collection.
 *
 * Jobs are taken from a Redis list when a Redis connection is available at
 * start-up; otherwise the worker polls the database every 5 seconds for
 * records with status `pending-ocr`.
 */
export class TesseractWorker {
  private app: any;
  private db: any;
  private log: any;
  private runner: TesseractRunner;
  private isRunning = false;
  /** True while a polling tick is still working, so ticks never overlap. */
  private isPolling = false;
  private pollTimer: NodeJS.Timeout | null = null;
  private redisKey = 'file-preview-auth.ocr.queue';

  /**
   * @param app Application object; `app.db` and `app.log` (or `console`) are used.
   */
  constructor(app: any) {
    this.app = app;
    this.db = app.db;
    this.log = app.log || console;
    this.runner = new TesseractRunner(app);
  }

  /**
   * Start the background worker. Calling it while already running is a no-op.
   */
  async start() {
    if (this.isRunning) return;
    this.isRunning = true;
    this.log.info('[TesseractWorker] OCR background worker started.');

    // 1. Prefer the Redis queue when a connection can be obtained.
    const redis = await this.getRedisClient();
    if (redis) {
      this.log.info('[TesseractWorker] Redis queue is active. Waiting for push jobs.');
      // Runs until stop(); deliberately not awaited so start() returns.
      void this.listenRedisQueue(redis);
    } else {
      // 2. Fallback: poll the database every 5 seconds.
      this.log.info('[TesseractWorker] Redis is unavailable. Falling back to DB polling (every 5s).');
      this.startDbPolling();
    }
  }

  /**
   * Stop the background worker. A job that is already running is not
   * interrupted; the Redis loop ends after its current pop/job completes.
   */
  stop() {
    this.isRunning = false;
    if (this.pollTimer) {
      clearInterval(this.pollTimer);
      this.pollTimer = null;
    }
    this.log.info('[TesseractWorker] OCR background worker stopped.');
  }

  /**
   * Enqueue a new OCR job.
   *
   * @returns `true` when the id was pushed to Redis, `false` otherwise.
   *
   * NOTE(review): on `false` the job relies on DB polling, which `start()`
   * only enables when Redis was unavailable at start-up — confirm that a
   * failed push while the Redis loop is active cannot strand a job.
   */
  async enqueue(attachmentId: number | string) {
    const redis = await this.getRedisClient();
    if (redis) {
      try {
        await redis.sendCommand(['RPUSH', this.redisKey, String(attachmentId)]);
        this.log.debug(`[TesseractWorker] Enqueued attachment ${attachmentId} to Redis`);
        return true;
      } catch (err: any) {
        this.log.warn(`[TesseractWorker] Redis push failed: ${err.message}. Falling back to DB state.`);
      }
    }
    // Fallback: nothing is written here; the caller's DB state is the queue.
    return false;
  }

  /**
   * Obtain the `default` connection from `app.redisConnectionManager`.
   *
   * @returns The client, or `null` when no manager/connection exists or the lookup throws.
   */
  private async getRedisClient(): Promise<any | null> {
    try {
      const manager = (this.app as any).redisConnectionManager;
      if (manager && typeof manager.getConnection === 'function') {
        const client = await manager.getConnection('default');
        if (client) return client;
      }
    } catch {
      // Redis is not installed or not active.
    }
    return null;
  }

  /**
   * Pop attachment ids from the Redis list and process them sequentially
   * until the worker is stopped.
   *
   * NOTE(review): BLPOP is issued on the connection returned for `default`;
   * if that connection is shared, other commands may wait behind the
   * blocking call — confirm or use a dedicated connection.
   */
  private async listenRedisQueue(redis: any) {
    while (this.isRunning) {
      let result: unknown;
      try {
        // Blocking pop with a 5 second timeout, so the loop does not spin.
        result = await redis.sendCommand(['BLPOP', this.redisKey, '5']);
      } catch (err: any) {
        // Back off for 3 seconds so a broken connection does not cause a hot loop.
        this.log.error(`[TesseractWorker] Redis POP error: ${err.message}`);
        await new Promise((resolve) => setTimeout(resolve, 3000));
        continue;
      }

      if (!Array.isArray(result) || result.length < 2) continue;

      const raw = String(result[1]).trim();
      if (!raw) continue;

      // enqueue() accepts string ids as well; only a plain, safely
      // representable integer is converted back to a number. Parsing every
      // value with parseInt would drop or truncate non-numeric ids.
      const asNumber = Number(raw);
      const attachmentId: number | string = /^\d+$/.test(raw) && Number.isSafeInteger(asNumber) ? asNumber : raw;

      try {
        await this.processJob(attachmentId);
      } catch (err: any) {
        // A failing job is not a Redis problem: log it and keep consuming.
        this.log.error(`[TesseractWorker] OCR job error for attachment ${raw}: ${err?.message || String(err)}`);
      }
    }
  }

  /**
   * Poll `attachmentOcrResults` every 5 seconds and process the oldest record
   * with status `pending-ocr`. A tick is skipped while the previous one is
   * still running, so a long OCR job is never started twice.
   */
  private startDbPolling() {
    this.pollTimer = setInterval(async () => {
      if (!this.isRunning || this.isPolling) return;
      this.isPolling = true;
      try {
        const repo = this.db.getRepository('attachmentOcrResults');
        if (!repo) return;

        // One record per tick keeps processing strictly sequential.
        const record = await repo.findOne({
          filter: { status: 'pending-ocr' },
          sort: ['createdAt'],
        });

        if (record) {
          const attachmentId = record.get('attachmentId');
          if (attachmentId != null) {
            await this.processJob(attachmentId);
          }
        }
      } catch (err: any) {
        this.log.error(`[TesseractWorker] DB Polling error: ${err.message}`);
      } finally {
        this.isPolling = false;
      }
    }, 5000);
  }

  /**
   * Run OCR for one attachment and persist the outcome.
   *
   * On success the result is stored with status `waiting-verify`; on any
   * failure (including a missing attachment) the status becomes `failed`
   * with the error message.
   */
  private async processJob(attachmentId: number | string) {
    this.log.info(`[TesseractWorker] Processing OCR Job for attachment ID ${attachmentId}`);
    const repo = this.db.getRepository('attachments');
    const ocrRepo = this.db.getRepository('attachmentOcrResults');
    if (!repo || !ocrRepo) return;

    const attachment = await repo.findOne({ filterByTk: attachmentId });
    if (!attachment) {
      this.log.warn(`[TesseractWorker] Attachment ${attachmentId} not found in DB.`);
      await this.updateOcrResult(attachmentId, {
        status: 'failed',
        error: 'Attachment not found',
      }).catch(() => {});
      return;
    }

    try {
      const fileManager = this.app.pm.get('@nocobase/plugin-file-manager') as any;
      if (!fileManager) {
        throw new Error('File manager plugin is not active.');
      }

      const storageModel = getStorageFromCache(fileManager.storagesCache, attachment.storageId);
      if (!storageModel || storageModel.type !== 'local') {
        // NOTE(review): this branch only logs; processing continues with the
        // same local path resolution below. No stream-based handling for
        // non-local storage is visible here — confirm intended behavior.
        this.log.info(`[TesseractWorker] Non-local storage detected or virtual file. Using fallback.`);
      }

      // NOTE(review): resolves `attachment.path` against the process working
      // directory; presumably this is expected to be the file's location —
      // verify against how the file manager stores path and filename.
      const filePath = path.resolve(process.cwd(), attachment.path || '');

      // Word-level text extraction.
      const result = await this.runner.runOcr(filePath, attachmentId);

      // Stored JSON shape: { pages: [...] }.
      const ocrData = {
        pages: result.pages,
      };

      await this.updateOcrResult(attachmentId, {
        data: ocrData,
        status: 'waiting-verify',
        error: null,
      });

      this.log.info(`[TesseractWorker] Successfully processed OCR for attachment ${attachmentId}`);
    } catch (err: any) {
      this.log.error(
        `[TesseractWorker] Failed to process OCR for attachment ${attachmentId}: ${err.stack || err.message}`,
      );

      // Record the failure (status 'failed') together with the error message.
      await this.updateOcrResult(attachmentId, {
        status: 'failed',
        error: err?.message || String(err),
      }).catch(() => {});
    }
  }

  /**
   * Create or update the `attachmentOcrResults` record of an attachment.
   *
   * @param attachmentId Attachment the record belongs to.
   * @param values Fields to write; `attachmentId` is always included.
   * @returns The pre-existing record when one was updated, the created record
   *          otherwise, or `null` when the repository is unavailable.
   */
  private async updateOcrResult(attachmentId: number | string, values: Record<string, any>) {
    const repo = this.db.getRepository('attachmentOcrResults');
    if (!repo) return null;

    const existing = await repo.findOne({
      filter: {
        attachmentId,
      },
    });
    const nextValues = {
      attachmentId,
      ...values,
    };

    if (existing) {
      await repo.update({
        filterByTk: existing.get('id'),
        values: nextValues,
      });
      return existing;
    }

    return repo.create({
      values: nextValues,
    });
  }
}
216
+
217
/**
 * Look up a storage entry in a cache whose keys may be stored as numbers or
 * as strings.
 *
 * The id is tried as given, then as a string, then as a number; finally the
 * keys are compared by their string form.
 *
 * @param cache Map of storage id to storage entry; may be missing.
 * @param storageId Id to look up.
 * @returns The matching entry, or `undefined` when the id or the cache is
 *          missing or nothing matches.
 */
function getStorageFromCache(cache: Map<any, any> | null | undefined, storageId: any) {
  if (storageId === undefined || storageId === null) return undefined;
  // A missing cache means "not found" rather than a TypeError.
  if (!cache || typeof cache.get !== 'function') return undefined;

  const strId = String(storageId);
  const numId = Number(storageId);
  const candidates: any[] = Number.isNaN(numId) ? [storageId, strId] : [storageId, strId, numId];

  for (const key of candidates) {
    const hit = cache.get(key);
    if (hit) return hit;
  }

  // Last resort: compare every key by its string representation.
  for (const [key, value] of cache.entries()) {
    if (String(key) === strId) {
      return value;
    }
  }
  return undefined;
}