pompelmi 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ # Next.js Integration
2
+
3
+ Scan uploaded files in Next.js API routes before writing them to disk or forwarding them to S3. Covers both the App Router (Next.js 13+) and the Pages Router.
4
+
5
+ ---
6
+
7
+ ## App Router (Next.js 13+)
8
+
9
+ ### Scan from `formData()` — scan buffer, upload to S3 if clean
10
+
11
+ In the App Router, request bodies are parsed via the Web Fetch API. Use `request.formData()` to get the file as a `Blob`, convert it to a Node.js `Buffer`, then call `scanBuffer()`.
12
+
13
+ ```ts
14
+ // app/api/upload/route.ts
15
+ import { NextRequest, NextResponse } from 'next/server';
16
+ import { scanBuffer, Verdict } from 'pompelmi';
17
+ import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3';
18
+
19
+ const s3 = new S3Client({ region: process.env.AWS_REGION });
20
+
21
+ const SCAN_OPTS = {
22
+ host: process.env.CLAMAV_HOST,
23
+ port: Number(process.env.CLAMAV_PORT) || 3310,
24
+ timeout: 30_000,
25
+ };
26
+
27
+ export async function POST(request: NextRequest) {
28
+ const form = await request.formData();
29
+ const file = form.get('file');
30
+
31
+ if (!(file instanceof File)) {
32
+ return NextResponse.json({ error: 'No file uploaded.' }, { status: 400 });
33
+ }
34
+
35
+ const arrayBuffer = await file.arrayBuffer();
36
+ const buffer = Buffer.from(arrayBuffer);
37
+
38
+ let result: symbol;
39
+ try {
40
+ result = await scanBuffer(buffer, SCAN_OPTS);
41
+ } catch (err) {
42
+ return NextResponse.json(
43
+ { error: `Scan failed: ${(err as Error).message}` },
44
+ { status: 500 }
45
+ );
46
+ }
47
+
48
+ if (result === Verdict.Malicious) {
49
+ return NextResponse.json({ error: 'Malicious file rejected.' }, { status: 422 });
50
+ }
51
+
52
+ if (result === Verdict.ScanError) {
53
+ return NextResponse.json(
54
+ { error: 'Scan incomplete — file rejected as precaution.' },
55
+ { status: 422 }
56
+ );
57
+ }
58
+
59
+ // Upload to S3 only after a clean scan
60
+ await s3.send(new PutObjectCommand({
61
+ Bucket: process.env.S3_BUCKET,
62
+ Key: `uploads/${Date.now()}-${file.name}`,
63
+ Body: buffer,
64
+ ContentType: file.type,
65
+ }));
66
+
67
+ return NextResponse.json({ ok: true });
68
+ }
69
+ ```
70
+
71
+ ### Disable Next.js body parsing
72
+
73
+ By default, Next.js App Router does not parse multipart bodies — the `request.formData()` call handles it natively. No special config needed.
74
+
75
+ ---
76
+
77
+ ## Pages Router
78
+
79
+ ### With `formidable` — scan by file path
80
+
81
+ The Pages Router does not handle multipart natively. Use `formidable` to parse the upload, then scan the temp file path.
82
+
83
+ ```bash
84
+ npm install pompelmi formidable
85
+ npm install -D @types/formidable
86
+ ```
87
+
88
+ ```ts
89
+ // pages/api/upload.ts
90
+ import type { NextApiRequest, NextApiResponse } from 'next';
91
+ import formidable, { File as FormidableFile } from 'formidable';
92
+ import fs from 'fs';
93
+ import { scan, Verdict } from 'pompelmi';
94
+
95
+ export const config = {
96
+ api: { bodyParser: false }, // required for multipart
97
+ };
98
+
99
+ const SCAN_OPTS = {
100
+ host: process.env.CLAMAV_HOST,
101
+ port: Number(process.env.CLAMAV_PORT) || 3310,
102
+ };
103
+
104
+ function parseForm(req: NextApiRequest): Promise<{ file: FormidableFile }> {
105
+ return new Promise((resolve, reject) => {
106
+ const form = formidable({ uploadDir: '/tmp', keepExtensions: true });
107
+ form.parse(req, (err, _fields, files) => {
108
+ if (err) return reject(err);
109
+ const file = Array.isArray(files.file) ? files.file[0] : files.file;
110
+ if (!file) return reject(new Error('No file.'));
111
+ resolve({ file });
112
+ });
113
+ });
114
+ }
115
+
116
+ export default async function handler(req: NextApiRequest, res: NextApiResponse) {
117
+ if (req.method !== 'POST') return res.status(405).end();
118
+
119
+ let file: FormidableFile;
120
+ try {
121
+ ({ file } = await parseForm(req));
122
+ } catch (err) {
123
+ return res.status(400).json({ error: 'Upload failed.' });
124
+ }
125
+
126
+ const filePath = file.filepath;
127
+
128
+ try {
129
+ const result = await scan(filePath, SCAN_OPTS);
130
+
131
+ if (result !== Verdict.Clean) {
132
+ fs.unlinkSync(filePath);
133
+ return res.status(422).json({ error: `Upload rejected: ${result.description}` });
134
+ }
135
+
136
+ // Move or store the clean file
137
+ return res.status(200).json({ ok: true });
138
+ } catch (err) {
139
+ try { fs.unlinkSync(filePath); } catch {}
140
+ return res.status(500).json({ error: `Scan failed: ${(err as Error).message}` });
141
+ }
142
+ }
143
+ ```
144
+
145
+ ---
146
+
147
+ ## Client-side upload
148
+
149
+ ```tsx
150
+ // components/UploadForm.tsx
151
+ 'use client';
152
+ import { useState } from 'react';
153
+
154
+ export default function UploadForm() {
155
+ const [status, setStatus] = useState('');
156
+
157
+ async function handleSubmit(e: React.FormEvent<HTMLFormElement>) {
158
+ e.preventDefault();
159
+ const form = e.currentTarget;
160
+ const data = new FormData(form);
161
+
162
+ const res = await fetch('/api/upload', { method: 'POST', body: data });
163
+ const json = await res.json();
164
+
165
+ if (!res.ok) {
166
+ setStatus(`Error: ${json.error}`);
167
+ } else {
168
+ setStatus('File uploaded successfully.');
169
+ }
170
+ }
171
+
172
+ return (
173
+ <form onSubmit={handleSubmit}>
174
+ <input type="file" name="file" required />
175
+ <button type="submit">Upload</button>
176
+ {status && <p>{status}</p>}
177
+ </form>
178
+ );
179
+ }
180
+ ```
181
+
182
+ ---
183
+
184
+ ## Environment variables
185
+
186
+ Add to `.env.local`:
187
+
188
+ ```
189
+ CLAMAV_HOST=127.0.0.1
190
+ CLAMAV_PORT=3310
191
+ S3_BUCKET=my-upload-bucket
192
+ AWS_REGION=us-east-1
193
+ ```
194
+
195
+ In production (Docker), set `CLAMAV_HOST` to the clamd service name (e.g. `clamav`).
196
+
197
+ ---
198
+
199
+ ## Notes
200
+
201
+ - **Vercel / serverless:** ClamAV cannot be installed on Vercel's serverless functions. Use TCP mode pointing to a self-hosted clamd instance (fly.io, Railway, EC2) or switch to a dedicated scan microservice.
202
+ - **File size limits:** Next.js has a default request body size limit (4 MB for Pages Router). Increase it via `export const config = { api: { bodyParser: { sizeLimit: '20mb' } } }` or disable parsing for multipart routes.
203
+ - **App Router streaming:** The App Router supports streaming request bodies via `request.body` (`ReadableStream`). To use `scanStream()`, convert with `Readable.fromWeb(request.body)` (Node.js 18+).
204
+
205
+ ```ts
206
+ import { Readable } from 'stream';
207
+ const nodeStream = Readable.fromWeb(request.body as ReadableStream);
208
+ const result = await scanStream(nodeStream, SCAN_OPTS);
209
+ ```
@@ -0,0 +1,178 @@
1
+ # Performance
2
+
3
+ Understanding pompelmi's performance characteristics helps you choose the right mode, concurrency level, and file handling strategy for your workload.
4
+
5
+ ---
6
+
7
+ ## Latency: local mode vs TCP mode
8
+
9
+ | Scenario | Local mode | TCP mode (LAN) |
10
+ |----------|-----------|----------------|
11
+ | Small file (< 1 MB) | 400–800 ms | 5–20 ms |
12
+ | Medium file (5–10 MB) | 800–1500 ms | 20–80 ms |
13
+ | Large file (50 MB) | 2000–4000 ms | 100–400 ms |
14
+ | ZIP archive (1 MB compressed) | 600–1200 ms | 15–60 ms |
15
+
16
+ Local mode is dominated by the time ClamAV takes to load the virus database (~300 MB) into memory on each invocation. TCP mode reuses a persistent clamd daemon that keeps the database resident.
17
+
18
+ > These are rough estimates. Actual latency depends on disk I/O speed, CPU, ClamAV version, and virus definition size.
19
+
20
+ ---
21
+
22
+ ## Throughput: concurrent scans
23
+
24
+ ### Local mode
25
+
26
+ Each local scan spawns a `clamscan` process that loads the database from disk. On a 4-core machine:
27
+
28
+ ```
29
+ ~2–4 concurrent scans before CPU saturation
30
+ ~1–2 scans/second sustained throughput
31
+ ```
32
+
33
+ Increasing concurrency beyond 4 in local mode degrades performance rather than improving it — processes compete for disk and CPU.
34
+
35
+ ### TCP mode
36
+
37
+ clamd keeps the virus database in memory and handles requests on a single thread. Multiple connections are accepted and queued:
38
+
39
+ ```
40
+ ~5–10 concurrent scans before clamd is saturated (single instance)
41
+ ~50–200 scans/second sustained throughput (single clamd, depends on file size)
42
+ ```
43
+
44
+ Scale horizontally by running multiple clamd instances behind a load balancer.
45
+
46
+ ---
47
+
48
+ ## Memory usage
49
+
50
+ ### `scan()` (file path)
51
+
52
+ Memory usage is minimal in the application process — pompelmi reads a path and delegates. ClamAV allocates memory to load the database and scan the file (especially for archive extraction).
53
+
54
+ ### `scanBuffer()` with large files
55
+
56
+ The full file content is held in memory as a Node.js `Buffer` for the duration of the scan. For a 50 MB upload:
57
+
58
+ - Application process: ~50 MB Buffer
59
+ - clamd (TCP mode): streams the buffer, does not accumulate it all at once
60
+ - Local mode: writes a temp file, so memory usage is minimal in the app process
61
+
62
+ **Avoid `scanBuffer()` for files > 50 MB.** Use `scan()` (disk) or `scanStream()` (streaming) instead.
63
+
64
+ ### `scanStream()` with TCP mode
65
+
66
+ The stream is piped directly to clamd in 64 KB chunks. The application process never holds the full file in memory — peak memory usage is approximately 64 KB for the chunk buffer plus stream buffering overhead. This is the most memory-efficient option for large files.
67
+
68
+ ---
69
+
70
+ ## Temp file cleanup in local mode
71
+
72
+ `scanBuffer()` and `scanStream()` in local mode write a temp file to `os.tmpdir()` before scanning. pompelmi deletes the temp file in a `finally` block — it is always removed regardless of scan outcome.
73
+
74
+ However, if your process is killed with `SIGKILL` (not `SIGTERM`), the `finally` block does not run and the temp file persists. Add a startup cleanup or use a system temp cleaner (Linux `systemd-tmpfiles`, macOS `/tmp` auto-clean) to handle this case.
75
+
76
+ ```js
77
+ const os = require('os');
78
+ const fs = require('fs');
79
+ const path = require('path');
80
+
81
+ function cleanTempFiles() {
82
+ const tmpDir = os.tmpdir();
83
+ const files = fs.readdirSync(tmpDir);
84
+ const stale = files.filter(f => f.startsWith('scan-') && f.endsWith('.tmp'));
85
+
86
+ for (const f of stale) {
87
+ const full = path.join(tmpDir, f);
88
+ const age = Date.now() - fs.statSync(full).mtimeMs;
89
+ if (age > 60_000) { // older than 1 minute
90
+ try { fs.unlinkSync(full); } catch {}
91
+ }
92
+ }
93
+ }
94
+
95
+ // Run at startup
96
+ cleanTempFiles();
97
+ ```
98
+
99
+ ---
100
+
101
+ ## Connection considerations for TCP mode
102
+
103
+ pompelmi opens a new TCP connection per scan call. For sporadic uploads, this is fine — the connection overhead is small (< 1 ms on LAN).
104
+
105
+ For sustained high-throughput workloads (hundreds of scans per second), the connection overhead accumulates. Options:
106
+
107
+ 1. **Keep-alive / connection reuse:** pompelmi does not implement connection pooling. If this becomes a bottleneck, implement a pool using Node.js `net.Socket` that reuses open connections.
108
+
109
+ 2. **Increase clamd connection limit:** Check `MaxConnections` in `clamd.conf` (default: 30). Increase it if you are running many concurrent scans.
110
+
111
+ 3. **Scale horizontally:** Run multiple clamd instances behind a load balancer and distribute scan requests across them.
112
+
113
+ ---
114
+
115
+ ## `scanDirectory()` performance
116
+
117
+ `scanDirectory()` scans all files concurrently (bounded internally). For very large directories (thousands of files), it may open many simultaneous connections to clamd.
118
+
119
+ If you observe clamd connection errors with large directories, use `p-limit` to wrap individual `scan()` calls instead:
120
+
121
+ ```js
122
+ const pLimit = require('p-limit');
123
+ const { scan, Verdict } = require('pompelmi');
124
+ const fs = require('fs');
125
+
126
+ async function scanDirLimited(dirPath, concurrency = 5) {
127
+ const limit = pLimit(concurrency);
128
+ const files = fs.readdirSync(dirPath, { recursive: true })
129
+ .filter(f => !fs.statSync(`${dirPath}/${f}`).isDirectory())
130
+ .map(f => `${dirPath}/${f}`);
131
+
132
+ return Promise.allSettled(
133
+ files.map(f => limit(async () => ({
134
+ path: f,
135
+ verdict: await scan(f, { host: 'clamav', port: 3310 }),
136
+ })))
137
+ );
138
+ }
139
+ ```
140
+
141
+ ---
142
+
143
+ ## Profiling scan latency in production
144
+
145
+ Wrap your scan calls with timing instrumentation:
146
+
147
+ ```js
148
+ async function timedScan(filePath, opts) {
149
+ const start = Date.now();
150
+ const result = await scan(filePath, opts);
151
+ const ms = Date.now() - start;
152
+
153
+ logger.info({
154
+ event: 'scan_complete',
155
+ filePath,
156
+ verdict: result.description,
157
+ ms,
158
+ size: fs.statSync(filePath).size,
159
+ });
160
+
161
+ return result;
162
+ }
163
+ ```
164
+
165
+ Track the `ms` metric in your observability system. Sudden increases indicate clamd overload, disk I/O contention, or stale virus definitions.
166
+
167
+ ---
168
+
169
+ ## Choosing the right function for your workload
170
+
171
+ | Scenario | Recommended function | Reason |
172
+ |----------|---------------------|--------|
173
+ | File uploaded to disk | `scan(filePath)` | Zero buffer overhead |
174
+ | multer memoryStorage, small files (< 10 MB) | `scanBuffer(buffer)` | Simple, no temp file in TCP |
175
+ | multer memoryStorage, large files | `scanStream(stream)` | No full buffer in memory |
176
+ | S3 getObject | `scanStream(response.Body)` | No disk, no full buffer |
177
+ | Batch of files in a folder | `scanDirectory(dirPath)` | Single call, concurrent |
178
+ | High-throughput uploads | TCP mode + `scanStream()` | Lowest latency, no disk |
@@ -0,0 +1,260 @@
1
+ # Quarantine Workflow
2
+
3
+ Deleting malicious files immediately is the simplest response, but a quarantine folder lets you retain infected files for forensic review, audit logging, and pattern analysis before permanent deletion.
4
+
5
+ ---
6
+
7
+ ## Basic quarantine: move instead of delete
8
+
9
+ ```js
10
+ const fs = require('fs');
11
+ const path = require('path');
12
+ const { scan, Verdict } = require('pompelmi');
13
+
14
+ const QUARANTINE_DIR = path.join(__dirname, 'quarantine');
15
+ fs.mkdirSync(QUARANTINE_DIR, { recursive: true });
16
+
17
+ async function scanAndQuarantine(filePath) {
18
+ const result = await scan(filePath, { host: process.env.CLAMAV_HOST, port: 3310 });
19
+
20
+ if (result === Verdict.Malicious) {
21
+ const filename = path.basename(filePath);
22
+ const dest = path.join(QUARANTINE_DIR, `${Date.now()}-${filename}`);
23
+
24
+ fs.renameSync(filePath, dest);
25
+
26
+ console.warn({
27
+ event: 'quarantined',
28
+ original: filePath,
29
+ dest,
30
+ verdict: result.description,
31
+ });
32
+
33
+ return { quarantined: true, dest };
34
+ }
35
+
36
+ if (result === Verdict.ScanError) {
37
+ fs.unlinkSync(filePath);
38
+ return { quarantined: false, deleted: true, reason: 'scan_error' };
39
+ }
40
+
41
+ return { quarantined: false, verdict: result.description };
42
+ }
43
+ ```
44
+
45
+ `fs.renameSync` is atomic on the same filesystem. If `filePath` and `QUARANTINE_DIR` are on different filesystems, copy then delete:
46
+
47
+ ```js
48
+ fs.copyFileSync(filePath, dest);
49
+ fs.unlinkSync(filePath);
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Quarantine folder structure
55
+
56
+ Organise quarantine files for easy review. A date-based hierarchy keeps any single directory manageable:
57
+
58
+ ```
59
+ quarantine/
60
+ 2024/
61
+ 04/
62
+ 28/
63
+ 1714300800000-invoice.pdf
64
+ 1714301200000-resume.doc
65
+ ```
66
+
67
+ ```js
68
+ function quarantinePath(originalPath) {
69
+ const now = new Date();
70
+ const year = now.getFullYear();
71
+ const month = String(now.getMonth() + 1).padStart(2, '0');
72
+ const day = String(now.getDate()).padStart(2, '0');
73
+ const dir = path.join(QUARANTINE_DIR, String(year), month, day);
74
+ const filename = `${Date.now()}-${path.basename(originalPath)}`;
75
+
76
+ fs.mkdirSync(dir, { recursive: true });
77
+ return path.join(dir, filename);
78
+ }
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Logging quarantined files to a database
84
+
85
+ Store a record of every quarantined file for audit and reporting:
86
+
87
+ ```js
88
+ const { scan, Verdict } = require('pompelmi');
89
+
90
+ async function scanAndLog(filePath, db, userId) {
91
+ let result;
92
+ try {
93
+ result = await scan(filePath, { host: 'clamav', port: 3310 });
94
+ } catch (err) {
95
+ await db.scanEvents.insert({
96
+ filePath,
97
+ userId,
98
+ event: 'scan_error',
99
+ error: err.message,
100
+ createdAt: new Date(),
101
+ });
102
+ throw err;
103
+ }
104
+
105
+ if (result === Verdict.Malicious) {
106
+ const dest = quarantinePath(filePath);
107
+ fs.renameSync(filePath, dest);
108
+
109
+ await db.scanEvents.insert({
110
+ originalPath: filePath,
111
+ quarantinePath: dest,
112
+ userId,
113
+ event: 'quarantined',
114
+ verdict: 'malicious',
115
+ createdAt: new Date(),
116
+ });
117
+
118
+ return { quarantined: true, dest };
119
+ }
120
+
121
+ await db.scanEvents.insert({
122
+ filePath,
123
+ userId,
124
+ event: 'clean',
125
+ verdict: 'clean',
126
+ createdAt: new Date(),
127
+ });
128
+
129
+ return { quarantined: false };
130
+ }
131
+ ```
132
+
133
+ ---
134
+
135
+ ## Alerting on quarantine events
136
+
137
+ Send a notification when malware is detected. Use any alerting mechanism — email, Slack, PagerDuty, a webhook:
138
+
139
+ ```js
140
+ async function notifyAdmin(event) {
141
+ const message = [
142
+ `Malicious file quarantined`,
143
+ `Original path: ${event.originalPath}`,
144
+ `Quarantine path: ${event.quarantinePath}`,
145
+ `User: ${event.userId}`,
146
+ `Time: ${event.createdAt.toISOString()}`,
147
+ ].join('\n');
148
+
149
+ await fetch(process.env.SLACK_WEBHOOK_URL, {
150
+ method: 'POST',
151
+ headers: { 'Content-Type': 'application/json' },
152
+ body: JSON.stringify({ text: message }),
153
+ });
154
+ }
155
+ ```
156
+
157
+ ---
158
+
159
+ ## Express integration with quarantine
160
+
161
+ ```js
162
+ const express = require('express');
163
+ const multer = require('multer');
164
+ const { scan, Verdict } = require('pompelmi');
165
+
166
+ const app = express();
167
+ const upload = multer({ dest: './uploads' });
168
+
169
+ app.post('/upload', upload.single('file'), async (req, res) => {
170
+ if (!req.file) return res.status(400).json({ error: 'No file.' });
171
+
172
+ const filePath = req.file.path;
173
+ const result = await scan(filePath, { host: 'clamav', port: 3310 }).catch(err => {
174
+ try { fs.unlinkSync(filePath); } catch {}
175
+ throw err;
176
+ });
177
+
178
+ if (result === Verdict.Malicious) {
179
+ const dest = quarantinePath(filePath);
180
+ fs.renameSync(filePath, dest);
181
+ logger.warn({ event: 'quarantined', dest, userId: req.user?.id });
182
+ return res.status(422).json({ error: 'Malicious file rejected.' });
183
+ }
184
+
185
+ if (result === Verdict.ScanError) {
186
+ fs.unlinkSync(filePath);
187
+ return res.status(422).json({ error: 'Scan incomplete — file rejected.' });
188
+ }
189
+
190
+ return res.json({ ok: true, filename: req.file.filename });
191
+ });
192
+ ```
193
+
194
+ ---
195
+
196
+ ## Reviewing quarantined files
197
+
198
+ To review what was quarantined:
199
+
200
+ ```bash
201
+ # List quarantined files with sizes
202
+ find quarantine/ -type f -exec ls -lh {} \;
203
+
204
+ # Count by day
205
+ find quarantine/ -type f | cut -d/ -f2-4 | sort | uniq -c
206
+ ```
207
+
208
+ From a Node.js admin script:
209
+
210
+ ```js
211
+ const { scanDirectory } = require('pompelmi');
212
+
213
+ // Re-scan the quarantine folder to verify signatures (optional)
214
+ const results = await scanDirectory('./quarantine', { host: 'clamav', port: 3310 });
215
+ console.log(`Quarantine: ${results.malicious.length} confirmed malicious, ${results.clean.length} clean`);
216
+ ```
217
+
218
+ ---
219
+
220
+ ## Cleanup policy
221
+
222
+ Quarantined files should not accumulate indefinitely. Implement a retention policy:
223
+
224
+ ```js
225
+ const fs = require('fs');
226
+ const path = require('path');
227
+
228
+ const RETENTION_DAYS = 30;
229
+
230
+ function pruneQuarantine(dir) {
231
+ const cutoff = Date.now() - RETENTION_DAYS * 24 * 60 * 60 * 1000;
232
+
233
+ for (const file of fs.readdirSync(dir, { recursive: true })) {
234
+ const fullPath = path.join(dir, file);
235
+ const stat = fs.statSync(fullPath);
236
+
237
+ if (stat.isFile() && stat.mtimeMs < cutoff) {
238
+ fs.unlinkSync(fullPath);
239
+ console.log(`Deleted expired quarantine file: ${fullPath}`);
240
+ }
241
+ }
242
+ }
243
+
244
+ pruneQuarantine('./quarantine');
245
+ ```
246
+
247
+ Run this as a daily cron job. Adjust `RETENTION_DAYS` based on your audit or compliance requirements.
248
+
249
+ ---
250
+
251
+ ## Permissions
252
+
253
+ Ensure the quarantine directory is not web-accessible. Never serve files from the quarantine folder through your web server. Set restrictive filesystem permissions:
254
+
255
+ ```bash
256
+ mkdir -p quarantine
257
+ chmod 700 quarantine
258
+ ```
259
+
260
+ On Linux, assign ownership to the user running your Node.js process and deny access to all others.