@peopl-health/nexus 2.4.9-fix-pdf-processing → 2.4.9-fix-mime-type
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/helpers/filesHelper.js +93 -45
- package/lib/helpers/llmsHelper.js +24 -6
- package/package.json +1 -1
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
const { PDFDocument } = require('pdf-lib');
|
|
2
2
|
const { execFile } = require('child_process');
|
|
3
3
|
const fs = require('fs').promises;
|
|
4
|
+
const fsSync = require('fs');
|
|
4
5
|
const path = require('path');
|
|
5
6
|
const sharp = require('sharp');
|
|
6
7
|
|
|
@@ -22,28 +23,52 @@ async function convertPdfToImages(pdfName, existingPdfPath = null) {
|
|
|
22
23
|
const args = ['-jpeg', pdfPath, outputPattern];
|
|
23
24
|
logger.info('[convertPdfToImages] Running: pdftoppm', args.join(' '));
|
|
24
25
|
|
|
25
|
-
|
|
26
|
+
const timeout = 30000;
|
|
27
|
+
let timedOut = false;
|
|
28
|
+
|
|
29
|
+
const child = execFile('pdftoppm', args, { timeout, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
|
|
30
|
+
if (timedOut) {
|
|
31
|
+
return;
|
|
32
|
+
}
|
|
33
|
+
|
|
26
34
|
if (error) {
|
|
27
35
|
logger.error('[convertPdfToImages] Error details:', {
|
|
28
36
|
error: error.message,
|
|
29
37
|
stderr,
|
|
30
38
|
pdfPath,
|
|
31
|
-
pdfExists:
|
|
39
|
+
pdfExists: fsSync.existsSync(pdfPath),
|
|
40
|
+
killed: error.killed,
|
|
41
|
+
signal: error.signal
|
|
32
42
|
});
|
|
33
43
|
return reject(new Error(`Error splitting PDF: ${stderr || error.message}`));
|
|
34
44
|
}
|
|
35
45
|
|
|
36
|
-
|
|
37
|
-
if (err) {
|
|
38
|
-
return reject(new Error(`Error reading output directory: ${err.message}`));
|
|
39
|
-
}
|
|
46
|
+
logger.info('[convertPdfToImages] pdftoppm completed successfully');
|
|
40
47
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
48
|
+
fs.readdir(outputDir)
|
|
49
|
+
.then(files => {
|
|
50
|
+
const jpgFiles = files
|
|
51
|
+
.filter(file => file.startsWith(sanitizedName) && file.endsWith('.jpg'))
|
|
52
|
+
.map(file => path.join(outputDir, file));
|
|
44
53
|
|
|
45
|
-
|
|
46
|
-
|
|
54
|
+
logger.info(`[convertPdfToImages] Found ${jpgFiles.length} image files`);
|
|
55
|
+
resolve(jpgFiles);
|
|
56
|
+
})
|
|
57
|
+
.catch(err => {
|
|
58
|
+
logger.error('[convertPdfToImages] Error reading output directory:', { error: err.message });
|
|
59
|
+
reject(new Error(`Error reading output directory: ${err.message}`));
|
|
60
|
+
});
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
const timeoutId = setTimeout(() => {
|
|
64
|
+
timedOut = true;
|
|
65
|
+
child.kill('SIGTERM');
|
|
66
|
+
logger.error('[convertPdfToImages] Process timed out after 30 seconds', { pdfPath });
|
|
67
|
+
reject(new Error('PDF conversion timed out after 30 seconds'));
|
|
68
|
+
}, timeout);
|
|
69
|
+
|
|
70
|
+
child.on('exit', () => {
|
|
71
|
+
clearTimeout(timeoutId);
|
|
47
72
|
});
|
|
48
73
|
});
|
|
49
74
|
}
|
|
@@ -136,46 +161,69 @@ const cleanupFiles = async (files) => {
|
|
|
136
161
|
};
|
|
137
162
|
|
|
138
163
|
/**
 * Downloads the media attached to a stored message and returns local file paths for it.
 *
 * Looks up the Message matching `reply.message_id` / `reply.timestamp` whose `media`
 * is not null, downloads the referenced S3 object into `assets/tmp` next to this module
 * and then either:
 *   - converts it with `convertPdfToImages` (keys whose first segment is `document`
 *     or `application`) and deletes the downloaded source file, or
 *   - returns the downloaded file itself.
 *
 * The function never rejects: every failure is logged and reported as an empty array.
 *
 * @param {string} code - Identifier used as the prefix of the temporary file name.
 * @param {{message_id: *, timestamp: *}} reply - Reference to the message whose media is wanted.
 * @returns {Promise<string[]>} Paths of the resulting local files; `[]` when there is
 *   no usable media or when anything fails.
 */
async function downloadMediaAndCreateFile(code, reply) {
  // Captured up front so the catch block can log safely even when `reply` is missing.
  const messageId = reply ? reply.message_id : undefined;
  // Set once a download target exists, so a failed download can be cleaned up.
  let downloadPath = null;

  try {
    const resultMedia = await Message.findOne({
      message_id: reply.message_id,
      timestamp: reply.timestamp,
      media: { $ne: null }
    });

    if (!resultMedia) return [];

    if (!resultMedia.media || !resultMedia.media.key) {
      logger.info('[downloadMediaAndCreateFile] No valid media found for message:', reply.message_id);
      return [];
    }

    const { bucketName, key } = resultMedia.media;
    if (!bucketName || !key) return [];

    // The first segment is the media sub-type; the LAST non-empty segment is the file name.
    // Taking only the second segment would name the file after an intermediate folder
    // for keys with several '/' separators, dropping the extension and causing collisions.
    const keyParts = key.split('/');
    const subType = keyParts[0];
    const fileName = keyParts.filter(Boolean).pop() || '';

    const sanitizedCode = sanitizeFilename(code, 20);
    const sanitizedSubType = sanitizeFilename(subType, 10);
    const sanitizedFileName = sanitizeFilename(fileName, 50);

    const sourceFile = `${sanitizedCode}-${sanitizedSubType}-${sanitizedFileName}`;
    downloadPath = path.join(__dirname, 'assets', 'tmp', sourceFile);

    logger.info('[downloadMediaAndCreateFile] Downloading file', { sourceFile, downloadPath, bucketName, key });

    await fs.mkdir(path.dirname(downloadPath), { recursive: true });
    await downloadFileFromS3(bucketName, key, downloadPath);

    const { name: baseName } = path.parse(sourceFile);
    let fileNames = [];

    if (subType === 'document' || subType === 'application') {
      try {
        fileNames = await convertPdfToImages(baseName, downloadPath);
        logger.info('[downloadMediaAndCreateFile] PDF converted successfully', { imageCount: fileNames.length });
      } catch (conversionError) {
        // Conversion is best-effort: report no files rather than failing the caller.
        logger.error('[downloadMediaAndCreateFile] PDF conversion failed:', {
          error: conversionError.message,
          sourceFile
        });
        fileNames = [];
      } finally {
        // The source document is no longer needed once conversion has been attempted.
        try {
          await fs.unlink(downloadPath);
        } catch (unlinkError) {
          logger.warn('[downloadMediaAndCreateFile] Failed to delete PDF:', { error: unlinkError.message });
        }
      }
    } else {
      fileNames = [downloadPath];
    }

    return fileNames;
  } catch (error) {
    logger.error('[downloadMediaAndCreateFile] Error processing media:', {
      error: error.message,
      message_id: messageId
    });

    // Nothing is handed back to the caller on this path, so remove any partial download.
    if (downloadPath) {
      try {
        await fs.unlink(downloadPath);
      } catch (unlinkError) {
        // A missing file is the normal case here; only report unexpected failures.
        if (unlinkError.code !== 'ENOENT') {
          logger.warn('[downloadMediaAndCreateFile] Failed to delete PDF:', { error: unlinkError.message });
        }
      }
    }

    return [];
  }
}
|
|
180
228
|
|
|
181
229
|
module.exports = {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
const llmConfig = require('../config/llmConfig.js');
|
|
2
2
|
const { logger } = require('../utils/logger');
|
|
3
3
|
const fs = require('fs');
|
|
4
|
-
const
|
|
4
|
+
const path = require('path');
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
async function analyzeImage(imagePath, isSticker = false, contentType = null) {
|
|
@@ -30,14 +30,30 @@ async function analyzeImage(imagePath, isSticker = false, contentType = null) {
|
|
|
30
30
|
};
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
+
// Determine mime type from file extension
|
|
33
34
|
let mimeType = contentType;
|
|
34
35
|
if (!mimeType) {
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
36
|
+
const ext = path.extname(imagePath).toLowerCase();
|
|
37
|
+
const mimeMap = {
|
|
38
|
+
'.jpg': 'image/jpeg',
|
|
39
|
+
'.jpeg': 'image/jpeg',
|
|
40
|
+
'.png': 'image/png',
|
|
41
|
+
'.gif': 'image/gif',
|
|
42
|
+
'.webp': 'image/webp'
|
|
43
|
+
};
|
|
44
|
+
mimeType = mimeMap[ext] || 'image/jpeg'; // Default to jpeg for pdftoppm output
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Validate that mime type is supported by Claude
|
|
48
|
+
const supportedMimeTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
|
|
49
|
+
if (!supportedMimeTypes.includes(mimeType)) {
|
|
50
|
+
logger.warn('[analyzeImage] Unsupported mime type, defaulting to image/jpeg:', {
|
|
51
|
+
originalMimeType: mimeType,
|
|
52
|
+
imagePath
|
|
53
|
+
});
|
|
54
|
+
mimeType = 'image/jpeg';
|
|
40
55
|
}
|
|
56
|
+
|
|
41
57
|
if (mimeType === 'image/vnd.wap.wbmp') {
|
|
42
58
|
logger.info('Skipping image with MIME type:', mimeType);
|
|
43
59
|
return {
|
|
@@ -49,6 +65,7 @@ async function analyzeImage(imagePath, isSticker = false, contentType = null) {
|
|
|
49
65
|
};
|
|
50
66
|
}
|
|
51
67
|
// Read the image file and convert to base64
|
|
68
|
+
logger.info('[analyzeImage] Reading image file:', { imagePath: imagePath.split('/').pop() });
|
|
52
69
|
const imageBuffer = await fs.promises.readFile(imagePath);
|
|
53
70
|
const base64Image = imageBuffer.toString('base64');
|
|
54
71
|
|
|
@@ -77,6 +94,7 @@ async function analyzeImage(imagePath, isSticker = false, contentType = null) {
|
|
|
77
94
|
},
|
|
78
95
|
],
|
|
79
96
|
});
|
|
97
|
+
logger.info('[analyzeImage] Description received');
|
|
80
98
|
const description = messageDescription.content[0].text;
|
|
81
99
|
|
|
82
100
|
// For stickers, skip medical analysis and table extraction
|