pompelmi 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -59
- package/llms.txt +22 -99
- package/package.json +1 -1
- package/release-notes-v1.4.0.md +25 -0
- package/release-notes-v1.5.0.md +37 -0
- package/src/BufferScanner.js +20 -17
- package/src/ClamAVScanner.js +4 -4
- package/src/ClamdScanner.js +18 -15
- package/src/StreamScanner.js +20 -17
- package/wiki/api-reference.md +268 -0
- package/wiki/cli-usage.md +263 -0
- package/wiki/concurrent-scanning.md +199 -0
- package/wiki/docker-compose-production.md +190 -0
- package/wiki/docker-setup.md +178 -0
- package/wiki/error-handling.md +242 -0
- package/wiki/express-integration.md +227 -0
- package/wiki/fastify-integration.md +207 -0
- package/wiki/home.md +0 -0
- package/wiki/local-vs-tcp-mode.md +179 -0
- package/wiki/multer-memory-storage.md +166 -0
- package/wiki/nestjs-integration.md +228 -0
- package/wiki/nextjs-integration.md +209 -0
- package/wiki/performance.md +178 -0
- package/wiki/quarantine-workflow.md +260 -0
- package/wiki/rest-api-server.md +297 -0
- package/wiki/s3-integration.md +233 -0
- package/wiki/security-considerations.md +192 -0
- package/wiki/typescript-usage.md +239 -0
- package/wiki/verdicts.md +192 -0
- package/wiki/virus-definitions.md +194 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# Next.js Integration
|
|
2
|
+
|
|
3
|
+
Scan uploaded files in Next.js API routes before writing them to disk or forwarding them to S3. Covers both the App Router (Next.js 13+) and the Pages Router.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## App Router (Next.js 13+)
|
|
8
|
+
|
|
9
|
+
### Scan from `formData()` — scan buffer, upload to S3 if clean
|
|
10
|
+
|
|
11
|
+
In the App Router, request bodies are parsed via the Web Fetch API. Use `request.formData()` to get the file as a `Blob`, convert it to a Node.js `Buffer`, then call `scanBuffer()`.
|
|
12
|
+
|
|
13
|
+
```ts
|
|
14
|
+
// app/api/upload/route.ts
|
|
15
|
+
import { NextRequest, NextResponse } from 'next/server';
|
|
16
|
+
import { scanBuffer, Verdict } from 'pompelmi';
|
|
17
|
+
import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3';
|
|
18
|
+
|
|
19
|
+
const s3 = new S3Client({ region: process.env.AWS_REGION });
|
|
20
|
+
|
|
21
|
+
const SCAN_OPTS = {
|
|
22
|
+
host: process.env.CLAMAV_HOST,
|
|
23
|
+
port: Number(process.env.CLAMAV_PORT) || 3310,
|
|
24
|
+
timeout: 30_000,
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
export async function POST(request: NextRequest) {
|
|
28
|
+
const form = await request.formData();
|
|
29
|
+
const file = form.get('file');
|
|
30
|
+
|
|
31
|
+
if (!(file instanceof File)) {
|
|
32
|
+
return NextResponse.json({ error: 'No file uploaded.' }, { status: 400 });
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
const arrayBuffer = await file.arrayBuffer();
|
|
36
|
+
const buffer = Buffer.from(arrayBuffer);
|
|
37
|
+
|
|
38
|
+
let result: symbol;
|
|
39
|
+
try {
|
|
40
|
+
result = await scanBuffer(buffer, SCAN_OPTS);
|
|
41
|
+
} catch (err) {
|
|
42
|
+
return NextResponse.json(
|
|
43
|
+
{ error: `Scan failed: ${(err as Error).message}` },
|
|
44
|
+
{ status: 500 }
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
if (result === Verdict.Malicious) {
|
|
49
|
+
return NextResponse.json({ error: 'Malicious file rejected.' }, { status: 422 });
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if (result === Verdict.ScanError) {
|
|
53
|
+
return NextResponse.json(
|
|
54
|
+
{ error: 'Scan incomplete — file rejected as precaution.' },
|
|
55
|
+
{ status: 422 }
|
|
56
|
+
);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Upload to S3 only after a clean scan
|
|
60
|
+
await s3.send(new PutObjectCommand({
|
|
61
|
+
Bucket: process.env.S3_BUCKET,
|
|
62
|
+
Key: `uploads/${Date.now()}-${file.name}`,
|
|
63
|
+
Body: buffer,
|
|
64
|
+
ContentType: file.type,
|
|
65
|
+
}));
|
|
66
|
+
|
|
67
|
+
return NextResponse.json({ ok: true });
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Body parsing in the App Router (no extra config needed)
|
|
72
|
+
|
|
73
|
+
By default, Next.js App Router does not parse multipart bodies — the `request.formData()` call handles it natively. No special config needed.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Pages Router
|
|
78
|
+
|
|
79
|
+
### With `formidable` — scan by file path
|
|
80
|
+
|
|
81
|
+
The Pages Router does not handle multipart natively. Use `formidable` to parse the upload, then scan the temp file path.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
npm install pompelmi formidable
|
|
85
|
+
npm install -D @types/formidable
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
```ts
|
|
89
|
+
// pages/api/upload.ts
|
|
90
|
+
import type { NextApiRequest, NextApiResponse } from 'next';
|
|
91
|
+
import formidable, { File as FormidableFile } from 'formidable';
|
|
92
|
+
import fs from 'fs';
|
|
93
|
+
import { scan, Verdict } from 'pompelmi';
|
|
94
|
+
|
|
95
|
+
export const config = {
|
|
96
|
+
api: { bodyParser: false }, // required for multipart
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
const SCAN_OPTS = {
|
|
100
|
+
host: process.env.CLAMAV_HOST,
|
|
101
|
+
port: Number(process.env.CLAMAV_PORT) || 3310,
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
function parseForm(req: NextApiRequest): Promise<{ file: FormidableFile }> {
|
|
105
|
+
return new Promise((resolve, reject) => {
|
|
106
|
+
const form = formidable({ uploadDir: '/tmp', keepExtensions: true });
|
|
107
|
+
form.parse(req, (err, _fields, files) => {
|
|
108
|
+
if (err) return reject(err);
|
|
109
|
+
const file = Array.isArray(files.file) ? files.file[0] : files.file;
|
|
110
|
+
if (!file) return reject(new Error('No file.'));
|
|
111
|
+
resolve({ file });
|
|
112
|
+
});
|
|
113
|
+
});
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
|
|
117
|
+
if (req.method !== 'POST') return res.status(405).end();
|
|
118
|
+
|
|
119
|
+
let file: FormidableFile;
|
|
120
|
+
try {
|
|
121
|
+
({ file } = await parseForm(req));
|
|
122
|
+
} catch (err) {
|
|
123
|
+
return res.status(400).json({ error: 'Upload failed.' });
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
const filePath = file.filepath;
|
|
127
|
+
|
|
128
|
+
try {
|
|
129
|
+
const result = await scan(filePath, SCAN_OPTS);
|
|
130
|
+
|
|
131
|
+
if (result !== Verdict.Clean) {
|
|
132
|
+
fs.unlinkSync(filePath);
|
|
133
|
+
return res.status(422).json({ error: `Upload rejected: ${result.description}` });
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Move or store the clean file
|
|
137
|
+
return res.status(200).json({ ok: true });
|
|
138
|
+
} catch (err) {
|
|
139
|
+
try { fs.unlinkSync(filePath); } catch {}
|
|
140
|
+
return res.status(500).json({ error: `Scan failed: ${(err as Error).message}` });
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Client-side upload
|
|
148
|
+
|
|
149
|
+
```tsx
|
|
150
|
+
// components/UploadForm.tsx
|
|
151
|
+
'use client';
|
|
152
|
+
import { useState } from 'react';
|
|
153
|
+
|
|
154
|
+
export default function UploadForm() {
|
|
155
|
+
const [status, setStatus] = useState('');
|
|
156
|
+
|
|
157
|
+
async function handleSubmit(e: React.FormEvent<HTMLFormElement>) {
|
|
158
|
+
e.preventDefault();
|
|
159
|
+
const form = e.currentTarget;
|
|
160
|
+
const data = new FormData(form);
|
|
161
|
+
|
|
162
|
+
const res = await fetch('/api/upload', { method: 'POST', body: data });
|
|
163
|
+
const json = await res.json();
|
|
164
|
+
|
|
165
|
+
if (!res.ok) {
|
|
166
|
+
setStatus(`Error: ${json.error}`);
|
|
167
|
+
} else {
|
|
168
|
+
setStatus('File uploaded successfully.');
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
return (
|
|
173
|
+
<form onSubmit={handleSubmit}>
|
|
174
|
+
<input type="file" name="file" required />
|
|
175
|
+
<button type="submit">Upload</button>
|
|
176
|
+
{status && <p>{status}</p>}
|
|
177
|
+
</form>
|
|
178
|
+
);
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Environment variables
|
|
185
|
+
|
|
186
|
+
Add to `.env.local`:
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
CLAMAV_HOST=127.0.0.1
|
|
190
|
+
CLAMAV_PORT=3310
|
|
191
|
+
S3_BUCKET=my-upload-bucket
|
|
192
|
+
AWS_REGION=us-east-1
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
In production (Docker), set `CLAMAV_HOST` to the clamd service name (e.g. `clamav`).
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Notes
|
|
200
|
+
|
|
201
|
+
- **Vercel / serverless:** ClamAV cannot be installed on Vercel's serverless functions. Use TCP mode pointing to a self-hosted clamd instance (fly.io, Railway, EC2) or switch to a dedicated scan microservice.
|
|
202
|
+
- **File size limits:** Next.js has a default request body size limit (4 MB for Pages Router). Increase it via `export const config = { api: { bodyParser: { sizeLimit: '20mb' } } }` or disable parsing for multipart routes.
|
|
203
|
+
- **App Router streaming:** The App Router supports streaming request bodies via `request.body` (`ReadableStream`). To use `scanStream()`, convert with `Readable.fromWeb(request.body)` (Node.js 18+).
|
|
204
|
+
|
|
205
|
+
```ts
|
|
206
|
+
import { Readable } from 'stream';
|
|
207
|
+
const nodeStream = Readable.fromWeb(request.body as ReadableStream);
|
|
208
|
+
const result = await scanStream(nodeStream, SCAN_OPTS);
|
|
209
|
+
```
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Performance
|
|
2
|
+
|
|
3
|
+
Understanding pompelmi's performance characteristics helps you choose the right mode, concurrency level, and file handling strategy for your workload.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Latency: local mode vs TCP mode
|
|
8
|
+
|
|
9
|
+
| Scenario | Local mode | TCP mode (LAN) |
|
|
10
|
+
|----------|-----------|----------------|
|
|
11
|
+
| Small file (< 1 MB) | 400–800 ms | 5–20 ms |
|
|
12
|
+
| Medium file (5–10 MB) | 800–1500 ms | 20–80 ms |
|
|
13
|
+
| Large file (50 MB) | 2000–4000 ms | 100–400 ms |
|
|
14
|
+
| ZIP archive (1 MB compressed) | 600–1200 ms | 15–60 ms |
|
|
15
|
+
|
|
16
|
+
Local mode is dominated by the time ClamAV takes to load the virus database (~300 MB) into memory on each invocation. TCP mode reuses a persistent clamd daemon that keeps the database resident.
|
|
17
|
+
|
|
18
|
+
> These are rough estimates. Actual latency depends on disk I/O speed, CPU, ClamAV version, and virus definition size.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Throughput: concurrent scans
|
|
23
|
+
|
|
24
|
+
### Local mode
|
|
25
|
+
|
|
26
|
+
Each local scan spawns a `clamscan` process that loads the database from disk. On a 4-core machine:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
~2–4 concurrent scans before CPU saturation
|
|
30
|
+
~1–2 scans/second sustained throughput
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Increasing concurrency beyond 4 in local mode degrades performance rather than improving it — processes compete for disk and CPU.
|
|
34
|
+
|
|
35
|
+
### TCP mode
|
|
36
|
+
|
|
37
|
+
clamd keeps the virus database in memory and handles requests on a single thread. Multiple connections are accepted and queued:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
~5–10 concurrent scans before clamd is saturated (single instance)
|
|
41
|
+
~50–200 scans/second sustained throughput (single clamd, depends on file size)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Scale horizontally by running multiple clamd instances behind a load balancer.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Memory usage
|
|
49
|
+
|
|
50
|
+
### `scan()` (file path)
|
|
51
|
+
|
|
52
|
+
Memory usage is minimal in the application process — pompelmi reads a path and delegates. ClamAV allocates memory to load the database and scan the file (especially for archive extraction).
|
|
53
|
+
|
|
54
|
+
### `scanBuffer()` with large files
|
|
55
|
+
|
|
56
|
+
The full file content is held in memory as a Node.js `Buffer` for the duration of the scan. For a 50 MB upload:
|
|
57
|
+
|
|
58
|
+
- Application process: ~50 MB Buffer
|
|
59
|
+
- clamd (TCP mode): streams the buffer, does not accumulate it all at once
|
|
60
|
+
- Local mode: writes a temp file, so memory usage is minimal in the app process
|
|
61
|
+
|
|
62
|
+
**Avoid `scanBuffer()` for files > 50 MB.** Use `scan()` (disk) or `scanStream()` (streaming) instead.
|
|
63
|
+
|
|
64
|
+
### `scanStream()` with TCP mode
|
|
65
|
+
|
|
66
|
+
The stream is piped directly to clamd in 64 KB chunks. The application process never holds the full file in memory — peak memory usage is approximately 64 KB for the chunk buffer plus stream buffering overhead. This is the most memory-efficient option for large files.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Temp file cleanup in local mode
|
|
71
|
+
|
|
72
|
+
`scanBuffer()` and `scanStream()` in local mode write a temp file to `os.tmpdir()` before scanning. pompelmi deletes the temp file in a `finally` block — it is always removed regardless of scan outcome.
|
|
73
|
+
|
|
74
|
+
However, if your process is killed with `SIGKILL` (not `SIGTERM`), the `finally` block does not run and the temp file persists. Add a startup cleanup or use a system temp cleaner (Linux `systemd-tmpfiles`, macOS `/tmp` auto-clean) to handle this case.
|
|
75
|
+
|
|
76
|
+
```js
|
|
77
|
+
const os = require('os');
|
|
78
|
+
const fs = require('fs');
|
|
79
|
+
const path = require('path');
|
|
80
|
+
|
|
81
|
+
function cleanTempFiles() {
|
|
82
|
+
const tmpDir = os.tmpdir();
|
|
83
|
+
const files = fs.readdirSync(tmpDir);
|
|
84
|
+
const stale = files.filter(f => f.startsWith('scan-') && f.endsWith('.tmp'));
|
|
85
|
+
|
|
86
|
+
for (const f of stale) {
|
|
87
|
+
const full = path.join(tmpDir, f);
|
|
88
|
+
const age = Date.now() - fs.statSync(full).mtimeMs;
|
|
89
|
+
if (age > 60_000) { // older than 1 minute
|
|
90
|
+
try { fs.unlinkSync(full); } catch {}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// Run at startup
|
|
96
|
+
cleanTempFiles();
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Connection considerations for TCP mode
|
|
102
|
+
|
|
103
|
+
pompelmi opens a new TCP connection per scan call. For sporadic uploads, this is fine — the connection overhead is small (< 1 ms on LAN).
|
|
104
|
+
|
|
105
|
+
For sustained high-throughput workloads (hundreds of scans per second), the connection overhead accumulates. Options:
|
|
106
|
+
|
|
107
|
+
1. **Keep-alive / connection reuse:** pompelmi does not implement connection pooling. If this becomes a bottleneck, implement a pool using Node.js `net.Socket` that reuses open connections.
|
|
108
|
+
|
|
109
|
+
2. **Increase clamd connection limit:** Check `MaxConnections` in `clamd.conf` (default: 30). Increase it if you are running many concurrent scans.
|
|
110
|
+
|
|
111
|
+
3. **Scale horizontally:** Run multiple clamd instances behind a load balancer and distribute scan requests across them.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## `scanDirectory()` performance
|
|
116
|
+
|
|
117
|
+
`scanDirectory()` scans all files concurrently (bounded internally). For very large directories (thousands of files), it may open many simultaneous connections to clamd.
|
|
118
|
+
|
|
119
|
+
If you observe clamd connection errors with large directories, use `p-limit` to wrap individual `scan()` calls instead:
|
|
120
|
+
|
|
121
|
+
```js
|
|
122
|
+
const pLimit = require('p-limit');
|
|
123
|
+
const { scan, Verdict } = require('pompelmi');
|
|
124
|
+
const fs = require('fs');
|
|
125
|
+
|
|
126
|
+
async function scanDirLimited(dirPath, concurrency = 5) {
|
|
127
|
+
const limit = pLimit(concurrency);
|
|
128
|
+
const files = fs.readdirSync(dirPath, { recursive: true })
|
|
129
|
+
.filter(f => !fs.statSync(`${dirPath}/${f}`).isDirectory())
|
|
130
|
+
.map(f => `${dirPath}/${f}`);
|
|
131
|
+
|
|
132
|
+
return Promise.allSettled(
|
|
133
|
+
files.map(f => limit(async () => ({
|
|
134
|
+
path: f,
|
|
135
|
+
verdict: await scan(f, { host: 'clamav', port: 3310 }),
|
|
136
|
+
})))
|
|
137
|
+
);
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Profiling scan latency in production
|
|
144
|
+
|
|
145
|
+
Wrap your scan calls with timing instrumentation:
|
|
146
|
+
|
|
147
|
+
```js
|
|
148
|
+
async function timedScan(filePath, opts) {
|
|
149
|
+
const start = Date.now();
|
|
150
|
+
const result = await scan(filePath, opts);
|
|
151
|
+
const ms = Date.now() - start;
|
|
152
|
+
|
|
153
|
+
logger.info({
|
|
154
|
+
event: 'scan_complete',
|
|
155
|
+
filePath,
|
|
156
|
+
verdict: result.description,
|
|
157
|
+
ms,
|
|
158
|
+
size: fs.statSync(filePath).size,
|
|
159
|
+
});
|
|
160
|
+
|
|
161
|
+
return result;
|
|
162
|
+
}
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Track the `ms` metric in your observability system. Sudden increases indicate clamd overload, disk I/O contention, or stale virus definitions.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Choosing the right function for your workload
|
|
170
|
+
|
|
171
|
+
| Scenario | Recommended function | Reason |
|
|
172
|
+
|----------|---------------------|--------|
|
|
173
|
+
| File uploaded to disk | `scan(filePath)` | Zero buffer overhead |
|
|
174
|
+
| multer memoryStorage, small files (< 10 MB) | `scanBuffer(buffer)` | Simple, no temp file in TCP |
|
|
175
|
+
| multer memoryStorage, large files | `scanStream(stream)` | No full buffer in memory |
|
|
176
|
+
| S3 getObject | `scanStream(response.Body)` | No disk, no full buffer |
|
|
177
|
+
| Batch of files in a folder | `scanDirectory(dirPath)` | Single call, concurrent |
|
|
178
|
+
| High-throughput uploads | TCP mode + `scanStream()` | Lowest latency, no disk |
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# Quarantine Workflow
|
|
2
|
+
|
|
3
|
+
Deleting malicious files immediately is the simplest response, but a quarantine folder lets you retain infected files for forensic review, audit logging, and pattern analysis before permanent deletion.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Basic quarantine: move instead of delete
|
|
8
|
+
|
|
9
|
+
```js
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
const path = require('path');
|
|
12
|
+
const { scan, Verdict } = require('pompelmi');
|
|
13
|
+
|
|
14
|
+
const QUARANTINE_DIR = path.join(__dirname, 'quarantine');
|
|
15
|
+
fs.mkdirSync(QUARANTINE_DIR, { recursive: true });
|
|
16
|
+
|
|
17
|
+
async function scanAndQuarantine(filePath) {
|
|
18
|
+
const result = await scan(filePath, { host: process.env.CLAMAV_HOST, port: 3310 });
|
|
19
|
+
|
|
20
|
+
if (result === Verdict.Malicious) {
|
|
21
|
+
const filename = path.basename(filePath);
|
|
22
|
+
const dest = path.join(QUARANTINE_DIR, `${Date.now()}-${filename}`);
|
|
23
|
+
|
|
24
|
+
fs.renameSync(filePath, dest);
|
|
25
|
+
|
|
26
|
+
console.warn({
|
|
27
|
+
event: 'quarantined',
|
|
28
|
+
original: filePath,
|
|
29
|
+
dest,
|
|
30
|
+
verdict: result.description,
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
return { quarantined: true, dest };
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
if (result === Verdict.ScanError) {
|
|
37
|
+
fs.unlinkSync(filePath);
|
|
38
|
+
return { quarantined: false, deleted: true, reason: 'scan_error' };
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
return { quarantined: false, verdict: result.description };
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
`fs.renameSync` is atomic on the same filesystem. If `filePath` and `QUARANTINE_DIR` are on different filesystems, copy then delete:
|
|
46
|
+
|
|
47
|
+
```js
|
|
48
|
+
fs.copyFileSync(filePath, dest);
|
|
49
|
+
fs.unlinkSync(filePath);
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Quarantine folder structure
|
|
55
|
+
|
|
56
|
+
Organise quarantine files for easy review. A date-based hierarchy keeps any single directory manageable:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
quarantine/
|
|
60
|
+
2024/
|
|
61
|
+
04/
|
|
62
|
+
28/
|
|
63
|
+
1714300800000-invoice.pdf
|
|
64
|
+
1714301200000-resume.doc
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
```js
|
|
68
|
+
function quarantinePath(originalPath) {
|
|
69
|
+
const now = new Date();
|
|
70
|
+
const year = now.getFullYear();
|
|
71
|
+
const month = String(now.getMonth() + 1).padStart(2, '0');
|
|
72
|
+
const day = String(now.getDate()).padStart(2, '0');
|
|
73
|
+
const dir = path.join(QUARANTINE_DIR, String(year), month, day);
|
|
74
|
+
const filename = `${Date.now()}-${path.basename(originalPath)}`;
|
|
75
|
+
|
|
76
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
77
|
+
return path.join(dir, filename);
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Logging quarantined files to a database
|
|
84
|
+
|
|
85
|
+
Store a record of every quarantined file for audit and reporting:
|
|
86
|
+
|
|
87
|
+
```js
|
|
88
|
+
const { scan, Verdict } = require('pompelmi');
|
|
89
|
+
|
|
90
|
+
async function scanAndLog(filePath, db, userId) {
|
|
91
|
+
let result;
|
|
92
|
+
try {
|
|
93
|
+
result = await scan(filePath, { host: 'clamav', port: 3310 });
|
|
94
|
+
} catch (err) {
|
|
95
|
+
await db.scanEvents.insert({
|
|
96
|
+
filePath,
|
|
97
|
+
userId,
|
|
98
|
+
event: 'scan_error',
|
|
99
|
+
error: err.message,
|
|
100
|
+
createdAt: new Date(),
|
|
101
|
+
});
|
|
102
|
+
throw err;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
if (result === Verdict.Malicious) {
|
|
106
|
+
const dest = quarantinePath(filePath);
|
|
107
|
+
fs.renameSync(filePath, dest);
|
|
108
|
+
|
|
109
|
+
await db.scanEvents.insert({
|
|
110
|
+
originalPath: filePath,
|
|
111
|
+
quarantinePath: dest,
|
|
112
|
+
userId,
|
|
113
|
+
event: 'quarantined',
|
|
114
|
+
verdict: 'malicious',
|
|
115
|
+
createdAt: new Date(),
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
return { quarantined: true, dest };
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
await db.scanEvents.insert({
|
|
122
|
+
filePath,
|
|
123
|
+
userId,
|
|
124
|
+
event: 'clean',
|
|
125
|
+
verdict: 'clean',
|
|
126
|
+
createdAt: new Date(),
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
return { quarantined: false };
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Alerting on quarantine events
|
|
136
|
+
|
|
137
|
+
Send a notification when malware is detected. Use any alerting mechanism — email, Slack, PagerDuty, a webhook:
|
|
138
|
+
|
|
139
|
+
```js
|
|
140
|
+
async function notifyAdmin(event) {
|
|
141
|
+
const message = [
|
|
142
|
+
`Malicious file quarantined`,
|
|
143
|
+
`Original path: ${event.originalPath}`,
|
|
144
|
+
`Quarantine path: ${event.quarantinePath}`,
|
|
145
|
+
`User: ${event.userId}`,
|
|
146
|
+
`Time: ${event.createdAt.toISOString()}`,
|
|
147
|
+
].join('\n');
|
|
148
|
+
|
|
149
|
+
await fetch(process.env.SLACK_WEBHOOK_URL, {
|
|
150
|
+
method: 'POST',
|
|
151
|
+
headers: { 'Content-Type': 'application/json' },
|
|
152
|
+
body: JSON.stringify({ text: message }),
|
|
153
|
+
});
|
|
154
|
+
}
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Express integration with quarantine
|
|
160
|
+
|
|
161
|
+
```js
|
|
162
|
+
const express = require('express');
|
|
163
|
+
const multer = require('multer');
|
|
164
|
+
const { scan, Verdict } = require('pompelmi');
|
|
165
|
+
|
|
166
|
+
const app = express();
|
|
167
|
+
const upload = multer({ dest: './uploads' });
|
|
168
|
+
|
|
169
|
+
app.post('/upload', upload.single('file'), async (req, res) => {
|
|
170
|
+
if (!req.file) return res.status(400).json({ error: 'No file.' });
|
|
171
|
+
|
|
172
|
+
const filePath = req.file.path;
|
|
173
|
+
const result = await scan(filePath, { host: 'clamav', port: 3310 }).catch(err => {
|
|
174
|
+
try { fs.unlinkSync(filePath); } catch {}
|
|
175
|
+
throw err;
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
if (result === Verdict.Malicious) {
|
|
179
|
+
const dest = quarantinePath(filePath);
|
|
180
|
+
fs.renameSync(filePath, dest);
|
|
181
|
+
logger.warn({ event: 'quarantined', dest, userId: req.user?.id });
|
|
182
|
+
return res.status(422).json({ error: 'Malicious file rejected.' });
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
if (result === Verdict.ScanError) {
|
|
186
|
+
fs.unlinkSync(filePath);
|
|
187
|
+
return res.status(422).json({ error: 'Scan incomplete — file rejected.' });
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return res.json({ ok: true, filename: req.file.filename });
|
|
191
|
+
});
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Reviewing quarantined files
|
|
197
|
+
|
|
198
|
+
To review what was quarantined:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# List quarantined files with sizes
|
|
202
|
+
find quarantine/ -type f -exec ls -lh {} \;
|
|
203
|
+
|
|
204
|
+
# Count by day
|
|
205
|
+
find quarantine/ -type f | cut -d/ -f2-4 | sort | uniq -c
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
From a Node.js admin script:
|
|
209
|
+
|
|
210
|
+
```js
|
|
211
|
+
const { scanDirectory } = require('pompelmi');
|
|
212
|
+
|
|
213
|
+
// Re-scan the quarantine folder to verify signatures (optional)
|
|
214
|
+
const results = await scanDirectory('./quarantine', { host: 'clamav', port: 3310 });
|
|
215
|
+
console.log(`Quarantine: ${results.malicious.length} confirmed malicious, ${results.clean.length} clean`);
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Cleanup policy
|
|
221
|
+
|
|
222
|
+
Quarantined files should not accumulate indefinitely. Implement a retention policy:
|
|
223
|
+
|
|
224
|
+
```js
|
|
225
|
+
const fs = require('fs');
|
|
226
|
+
const path = require('path');
|
|
227
|
+
|
|
228
|
+
const RETENTION_DAYS = 30;
|
|
229
|
+
|
|
230
|
+
function pruneQuarantine(dir) {
|
|
231
|
+
const cutoff = Date.now() - RETENTION_DAYS * 24 * 60 * 60 * 1000;
|
|
232
|
+
|
|
233
|
+
for (const file of fs.readdirSync(dir, { recursive: true })) {
|
|
234
|
+
const fullPath = path.join(dir, file);
|
|
235
|
+
const stat = fs.statSync(fullPath);
|
|
236
|
+
|
|
237
|
+
if (stat.isFile() && stat.mtimeMs < cutoff) {
|
|
238
|
+
fs.unlinkSync(fullPath);
|
|
239
|
+
console.log(`Deleted expired quarantine file: ${fullPath}`);
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
pruneQuarantine('./quarantine');
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Run this as a daily cron job. Adjust `RETENTION_DAYS` based on your audit or compliance requirements.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Permissions
|
|
252
|
+
|
|
253
|
+
Ensure the quarantine directory is not web-accessible. Never serve files from the quarantine folder through your web server. Set restrictive filesystem permissions:
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
mkdir -p quarantine
|
|
257
|
+
chmod 700 quarantine
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
On Linux, assign ownership to the user running your Node.js process and deny access to all others.
|