qpdf-compress 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,610 @@
1
+ #include <napi.h>
2
+
3
+ #include <cerrno>
4
+ #include <csetjmp>
5
+ #include <cstdio>
6
+ #include <cstring>
7
+ #include <filesystem>
8
+ #include <limits>
9
+ #include <map>
10
+ #include <memory>
11
+ #include <set>
12
+ #include <string>
13
+ #include <unordered_map>
14
+ #include <vector>
15
+
16
+ #include <jpeglib.h>
17
+
18
+ #include <qpdf/Buffer.hh>
19
+ #include <qpdf/Pl_Flate.hh>
20
+ #include <qpdf/QPDF.hh>
21
+ #include <qpdf/QPDFObjectHandle.hh>
22
+ #include <qpdf/QPDFPageDocumentHelper.hh>
23
+ #include <qpdf/QPDFWriter.hh>
24
+
25
+ #include "stb_image_write.h"
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // stb_image_write callback — writes JPEG data to a vector
29
+ // ---------------------------------------------------------------------------
30
+
31
// Output callback for stb_image_write: appends one chunk of encoded bytes to
// the std::vector<uint8_t> passed as `context`.
//
// context  address of the destination std::vector<uint8_t>
// data     first byte of the chunk
// size     number of bytes in the chunk
//
// A null context, null data pointer or non-positive size is ignored.
static void stbi_write_to_vector(void *context, void *data, int size) {
  const bool usable = context != nullptr && data != nullptr && size > 0;
  if (usable) {
    auto &sink = *static_cast<std::vector<uint8_t> *>(context);
    auto *first = static_cast<uint8_t *>(data);
    auto *last = first + size;
    sink.insert(sink.end(), first, last);
  }
}
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // JPEG error handler — prevents libjpeg from calling exit() on errors
41
+ // ---------------------------------------------------------------------------
42
+
43
+ struct JpegErrorMgr {
44
+ struct jpeg_error_mgr pub;
45
+ std::jmp_buf jmpbuf;
46
+ };
47
+
48
+ static void jpegErrorExit(j_common_ptr cinfo) {
49
+ auto *myerr = reinterpret_cast<JpegErrorMgr *>(cinfo->err);
50
+ std::longjmp(myerr->jmpbuf, 1);
51
+ }
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // Lossless JPEG optimization — rewrites Huffman tables at the DCT coefficient
55
+ // level without touching pixel data. Typically saves 2–15%.
56
+ // ---------------------------------------------------------------------------
57
+
58
+ // isolated setjmp scope — no C++ objects with non-trivial destructors
59
+ // may be live when longjmp fires, avoiding undefined behavior
60
+ static bool losslessJpegOptimizeImpl(const unsigned char *data, size_t size,
61
+ unsigned char **outbuf,
62
+ unsigned long *outsize) {
63
+ struct jpeg_decompress_struct srcinfo = {};
64
+ struct jpeg_compress_struct dstinfo = {};
65
+ JpegErrorMgr jerr = {};
66
+
67
+ srcinfo.err = jpeg_std_error(&jerr.pub);
68
+ jerr.pub.error_exit = jpegErrorExit;
69
+ dstinfo.err = &jerr.pub;
70
+
71
+ jpeg_create_decompress(&srcinfo);
72
+ jpeg_create_compress(&dstinfo);
73
+
74
+ if (setjmp(jerr.jmpbuf)) {
75
+ jpeg_destroy_decompress(&srcinfo);
76
+ jpeg_destroy_compress(&dstinfo);
77
+ return false;
78
+ }
79
+
80
+ jpeg_mem_src(&srcinfo, data, static_cast<unsigned long>(size));
81
+
82
+ if (jpeg_read_header(&srcinfo, TRUE) != JPEG_HEADER_OK) {
83
+ jpeg_destroy_decompress(&srcinfo);
84
+ jpeg_destroy_compress(&dstinfo);
85
+ return false;
86
+ }
87
+
88
+ // read DCT coefficients — zero quality loss
89
+ jvirt_barray_ptr *coef_arrays = jpeg_read_coefficients(&srcinfo);
90
+ if (!coef_arrays) {
91
+ jpeg_destroy_decompress(&srcinfo);
92
+ jpeg_destroy_compress(&dstinfo);
93
+ return false;
94
+ }
95
+
96
+ *outsize = 0;
97
+ jpeg_mem_dest(&dstinfo, outbuf, outsize);
98
+
99
+ jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
100
+ dstinfo.optimize_coding = TRUE;
101
+
102
+ jpeg_write_coefficients(&dstinfo, coef_arrays);
103
+ jpeg_finish_compress(&dstinfo);
104
+ jpeg_finish_decompress(&srcinfo);
105
+
106
+ jpeg_destroy_compress(&dstinfo);
107
+ jpeg_destroy_decompress(&srcinfo);
108
+
109
+ return *outbuf != nullptr && *outsize > 0;
110
+ }
111
+
112
+ static bool losslessJpegOptimize(const unsigned char *data, size_t size,
113
+ std::vector<uint8_t> &out) {
114
+ unsigned char *outbuf = nullptr;
115
+ unsigned long outsize = 0;
116
+
117
+ bool ok = losslessJpegOptimizeImpl(data, size, &outbuf, &outsize);
118
+ if (ok && outbuf && outsize > 0) {
119
+ out.assign(outbuf, outbuf + outsize);
120
+ }
121
+ free(outbuf);
122
+ return ok;
123
+ }
124
+
125
+ // ---------------------------------------------------------------------------
126
+ // Image recompression for lossy mode
127
+ // ---------------------------------------------------------------------------
128
+
129
+ static void optimizeImages(QPDF &qpdf, int quality) {
130
+ for (auto &page : QPDFPageDocumentHelper(qpdf).getAllPages()) {
131
+ auto pageObj = page.getObjectHandle();
132
+ auto resources = pageObj.getKey("/Resources");
133
+ if (!resources.isDictionary())
134
+ continue;
135
+ auto xobjects = resources.getKey("/XObject");
136
+ if (!xobjects.isDictionary())
137
+ continue;
138
+
139
+ for (auto &key : xobjects.getKeys()) {
140
+ auto xobj = xobjects.getKey(key);
141
+ if (!xobj.isStream())
142
+ continue;
143
+
144
+ auto dict = xobj.getDict();
145
+ if (!dict.getKey("/Subtype").isName() ||
146
+ dict.getKey("/Subtype").getName() != "/Image")
147
+ continue;
148
+
149
+ // only handle 8-bit images
150
+ if (!dict.getKey("/BitsPerComponent").isInteger() ||
151
+ dict.getKey("/BitsPerComponent").getIntValue() != 8)
152
+ continue;
153
+
154
+ int width = 0, height = 0, components = 0;
155
+ if (dict.getKey("/Width").isInteger())
156
+ width = static_cast<int>(dict.getKey("/Width").getIntValue());
157
+ if (dict.getKey("/Height").isInteger())
158
+ height = static_cast<int>(dict.getKey("/Height").getIntValue());
159
+
160
+ if (width <= 0 || height <= 0 || width > 16384 || height > 16384)
161
+ continue;
162
+
163
+ // determine color components
164
+ auto cs = dict.getKey("/ColorSpace");
165
+ if (cs.isName()) {
166
+ if (cs.getName() == "/DeviceRGB")
167
+ components = 3;
168
+ else if (cs.getName() == "/DeviceGray")
169
+ components = 1;
170
+ else
171
+ continue; // skip CMYK, Lab, etc. for now
172
+ } else {
173
+ continue; // skip indexed, ICCBased, etc.
174
+ }
175
+
176
+ // skip tiny images (logos, icons) — not worth recompressing
177
+ if (width * height < 2500)
178
+ continue;
179
+
180
+ // get fully decoded stream data (raw pixels)
181
+ std::shared_ptr<Buffer> streamData;
182
+ try {
183
+ streamData = xobj.getStreamData(qpdf_dl_all);
184
+ } catch (...) {
185
+ continue; // can't decode — skip
186
+ }
187
+
188
+ // overflow-safe size calculation
189
+ auto w = static_cast<size_t>(width);
190
+ auto h = static_cast<size_t>(height);
191
+ auto c = static_cast<size_t>(components);
192
+ if (h > 0 && w > std::numeric_limits<size_t>::max() / h)
193
+ continue;
194
+ if (c > 0 && (w * h) > std::numeric_limits<size_t>::max() / c)
195
+ continue;
196
+ size_t expectedSize = w * h * c;
197
+ if (streamData->getSize() != expectedSize)
198
+ continue;
199
+
200
+ // check if recompression would actually help:
201
+ // skip if already a small JPEG
202
+ auto currentFilter = dict.getKey("/Filter");
203
+ bool isCurrentlyJpeg =
204
+ currentFilter.isName() && currentFilter.getName() == "/DCTDecode";
205
+
206
+ // encode as JPEG
207
+ std::vector<uint8_t> jpegData;
208
+ jpegData.reserve(expectedSize / 4); // estimate
209
+ int writeOk =
210
+ stbi_write_jpg_to_func(stbi_write_to_vector, &jpegData, width, height,
211
+ components, streamData->getBuffer(), quality);
212
+
213
+ if (!writeOk || jpegData.empty())
214
+ continue;
215
+
216
+ // only replace if we actually reduced size
217
+ if (isCurrentlyJpeg) {
218
+ auto rawData = xobj.getRawStreamData();
219
+ if (jpegData.size() >= rawData->getSize())
220
+ continue; // new JPEG is larger, keep original
221
+ }
222
+
223
+ // replace stream data with JPEG
224
+ std::string jpegStr(reinterpret_cast<char *>(jpegData.data()),
225
+ jpegData.size());
226
+ xobj.replaceStreamData(jpegStr, QPDFObjectHandle::newName("/DCTDecode"),
227
+ QPDFObjectHandle::newNull());
228
+
229
+ // update dictionary — remove FlateDecode-specific params
230
+ if (dict.hasKey("/DecodeParms"))
231
+ dict.removeKey("/DecodeParms");
232
+ if (dict.hasKey("/Predictor"))
233
+ dict.removeKey("/Predictor");
234
+ }
235
+ }
236
+ }
237
+
238
+ // ---------------------------------------------------------------------------
239
+ // Duplicate image detection — replaces identical image objects with
240
+ // references to a single canonical copy. Dropped duplicates become
241
+ // unreferenced and are omitted from the output.
242
+ // ---------------------------------------------------------------------------
243
+
244
+ static void deduplicateImages(QPDF &qpdf) {
245
+ struct ImageEntry {
246
+ QPDFObjGen og;
247
+ size_t dataSize;
248
+ QPDFObjectHandle handle;
249
+ };
250
+
251
+ std::unordered_map<size_t, std::vector<ImageEntry>> hashGroups;
252
+ std::set<QPDFObjGen> seen;
253
+
254
+ // first pass: collect all image objects and hash their raw data
255
+ for (auto &page : QPDFPageDocumentHelper(qpdf).getAllPages()) {
256
+ auto resources = page.getObjectHandle().getKey("/Resources");
257
+ if (!resources.isDictionary())
258
+ continue;
259
+ auto xobjects = resources.getKey("/XObject");
260
+ if (!xobjects.isDictionary())
261
+ continue;
262
+
263
+ for (auto &key : xobjects.getKeys()) {
264
+ auto xobj = xobjects.getKey(key);
265
+ if (!xobj.isStream())
266
+ continue;
267
+ auto og = xobj.getObjGen();
268
+ if (seen.count(og))
269
+ continue;
270
+ seen.insert(og);
271
+
272
+ auto dict = xobj.getDict();
273
+ if (!dict.getKey("/Subtype").isName() ||
274
+ dict.getKey("/Subtype").getName() != "/Image")
275
+ continue;
276
+
277
+ try {
278
+ auto rawData = xobj.getRawStreamData();
279
+ size_t size = rawData->getSize();
280
+
281
+ // FNV-1a hash
282
+ size_t hash = 14695981039346656037ULL;
283
+ auto *p = rawData->getBuffer();
284
+ for (size_t i = 0; i < size; ++i) {
285
+ hash ^= static_cast<size_t>(p[i]);
286
+ hash *= 1099511628211ULL;
287
+ }
288
+
289
+ hashGroups[hash].push_back({og, size, xobj});
290
+ } catch (...) {
291
+ continue;
292
+ }
293
+ }
294
+ }
295
+
296
+ // second pass: verify hash collisions with full byte comparison
297
+ std::map<QPDFObjGen, QPDFObjectHandle> replacements;
298
+
299
+ for (auto &[hash, group] : hashGroups) {
300
+ if (group.size() < 2)
301
+ continue;
302
+
303
+ for (size_t i = 0; i < group.size(); ++i) {
304
+ if (replacements.count(group[i].og))
305
+ continue;
306
+
307
+ auto rawI = group[i].handle.getRawStreamData();
308
+ for (size_t j = i + 1; j < group.size(); ++j) {
309
+ if (replacements.count(group[j].og))
310
+ continue;
311
+
312
+ auto rawJ = group[j].handle.getRawStreamData();
313
+ if (rawI->getSize() != rawJ->getSize())
314
+ continue;
315
+
316
+ if (memcmp(rawI->getBuffer(), rawJ->getBuffer(), rawI->getSize()) ==
317
+ 0) {
318
+ replacements[group[j].og] = group[i].handle;
319
+ }
320
+ }
321
+ }
322
+ }
323
+
324
+ if (replacements.empty())
325
+ return;
326
+
327
+ // third pass: rewrite XObject references to point to canonical objects
328
+ for (auto &page : QPDFPageDocumentHelper(qpdf).getAllPages()) {
329
+ auto resources = page.getObjectHandle().getKey("/Resources");
330
+ if (!resources.isDictionary())
331
+ continue;
332
+ auto xobjects = resources.getKey("/XObject");
333
+ if (!xobjects.isDictionary())
334
+ continue;
335
+
336
+ for (auto &key : xobjects.getKeys()) {
337
+ auto xobj = xobjects.getKey(key);
338
+ auto it = replacements.find(xobj.getObjGen());
339
+ if (it != replacements.end()) {
340
+ xobjects.replaceKey(key, it->second);
341
+ }
342
+ }
343
+ }
344
+ }
345
+
346
+ // ---------------------------------------------------------------------------
347
+ // Lossless optimization of existing embedded JPEG images — optimizes Huffman
348
+ // tables at the DCT coefficient level without any quality loss.
349
+ // ---------------------------------------------------------------------------
350
+
351
+ static void optimizeExistingJpegs(QPDF &qpdf) {
352
+ std::set<QPDFObjGen> processed;
353
+
354
+ for (auto &page : QPDFPageDocumentHelper(qpdf).getAllPages()) {
355
+ auto resources = page.getObjectHandle().getKey("/Resources");
356
+ if (!resources.isDictionary())
357
+ continue;
358
+ auto xobjects = resources.getKey("/XObject");
359
+ if (!xobjects.isDictionary())
360
+ continue;
361
+
362
+ for (auto &key : xobjects.getKeys()) {
363
+ auto xobj = xobjects.getKey(key);
364
+ if (!xobj.isStream())
365
+ continue;
366
+
367
+ auto og = xobj.getObjGen();
368
+ if (processed.count(og))
369
+ continue;
370
+ processed.insert(og);
371
+
372
+ auto dict = xobj.getDict();
373
+ auto filter = dict.getKey("/Filter");
374
+ if (!filter.isName() || filter.getName() != "/DCTDecode")
375
+ continue;
376
+
377
+ try {
378
+ auto rawData = xobj.getRawStreamData();
379
+
380
+ std::vector<uint8_t> optimized;
381
+ if (!losslessJpegOptimize(rawData->getBuffer(), rawData->getSize(),
382
+ optimized))
383
+ continue;
384
+
385
+ // only replace if strictly smaller
386
+ if (optimized.size() >= rawData->getSize())
387
+ continue;
388
+
389
+ std::string jpegStr(reinterpret_cast<char *>(optimized.data()),
390
+ optimized.size());
391
+ xobj.replaceStreamData(jpegStr, QPDFObjectHandle::newName("/DCTDecode"),
392
+ QPDFObjectHandle::newNull());
393
+ } catch (...) {
394
+ continue;
395
+ }
396
+ }
397
+ }
398
+ }
399
+
400
+ // ---------------------------------------------------------------------------
401
+ // CompressWorker — async PDF compression
402
+ // ---------------------------------------------------------------------------
403
+
404
+ class CompressWorker : public Napi::AsyncWorker {
405
+ public:
406
+ // buffer variant
407
+ CompressWorker(Napi::Env env, std::vector<uint8_t> data, bool lossy,
408
+ int quality, std::string outputPath)
409
+ : Napi::AsyncWorker(env), deferred_(Napi::Promise::Deferred::New(env)),
410
+ bufferData_(std::move(data)), lossy_(lossy), quality_(quality),
411
+ useFile_(false), outputPath_(std::move(outputPath)) {}
412
+
413
+ // file path variant
414
+ CompressWorker(Napi::Env env, std::string path, bool lossy, int quality,
415
+ std::string outputPath)
416
+ : Napi::AsyncWorker(env), deferred_(Napi::Promise::Deferred::New(env)),
417
+ filePath_(std::move(path)), lossy_(lossy), quality_(quality),
418
+ useFile_(true), outputPath_(std::move(outputPath)) {}
419
+
420
+ Napi::Promise Promise() { return deferred_.Promise(); }
421
+
422
+ protected:
423
+ void Execute() override {
424
+ try {
425
+ QPDF qpdf;
426
+ qpdf.setAttemptRecovery(true);
427
+
428
+ if (useFile_) {
429
+ if (!std::filesystem::exists(filePath_)) {
430
+ SetError("Input file not found: " + filePath_);
431
+ return;
432
+ }
433
+ qpdf.processFile(filePath_.c_str());
434
+ } else {
435
+ // validate PDF header to prevent QPDF from aborting on garbage input
436
+ if (bufferData_.size() < 5 ||
437
+ memcmp(bufferData_.data(), "%PDF-", 5) != 0) {
438
+ SetError("Input is not a valid PDF (missing %PDF- header)");
439
+ return;
440
+ }
441
+ qpdf.processMemoryFile(
442
+ "input.pdf", reinterpret_cast<const char *>(bufferData_.data()),
443
+ bufferData_.size());
444
+ }
445
+
446
+ // deduplicate identical images across pages
447
+ deduplicateImages(qpdf);
448
+
449
+ // lossy: recompress embedded images as JPEG
450
+ if (lossy_) {
451
+ optimizeImages(qpdf, quality_);
452
+ }
453
+
454
+ // lossless JPEG Huffman table optimization
455
+ optimizeExistingJpegs(qpdf);
456
+
457
+ // maximum Flate compression level
458
+ Pl_Flate::setCompressionLevel(9);
459
+
460
+ QPDFWriter writer(qpdf);
461
+ writer.setOutputMemory();
462
+ writer.setStreamDataMode(qpdf_s_compress);
463
+ writer.setRecompressFlate(true);
464
+ writer.setObjectStreamMode(qpdf_o_generate);
465
+ writer.setCompressStreams(true);
466
+ // only decode generalized streams (Flate, LZW, etc.)
467
+ // this preserves DCTDecode (our recompressed JPEG images)
468
+ writer.setDecodeLevel(qpdf_dl_generalized);
469
+ writer.setPreserveUnreferencedObjects(false);
470
+ writer.write();
471
+
472
+ auto buf = writer.getBufferSharedPointer();
473
+ result_.assign(buf->getBuffer(), buf->getBuffer() + buf->getSize());
474
+
475
+ // write to file if output path was specified
476
+ if (!outputPath_.empty()) {
477
+ auto closer = [](FILE *fp) {
478
+ if (fp)
479
+ fclose(fp);
480
+ };
481
+ std::unique_ptr<FILE, decltype(closer)> f(
482
+ fopen(outputPath_.c_str(), "wb"), closer);
483
+ if (!f) {
484
+ auto parentDir = std::filesystem::path(outputPath_).parent_path();
485
+ if (!parentDir.empty() && !std::filesystem::is_directory(parentDir)) {
486
+ SetError("Parent directory does not exist: " + parentDir.string());
487
+ } else {
488
+ SetError("Failed to open output file: " + outputPath_ + " (" +
489
+ std::strerror(errno) + ")");
490
+ }
491
+ return;
492
+ }
493
+ size_t written = fwrite(result_.data(), 1, result_.size(), f.get());
494
+ if (written != result_.size()) {
495
+ SetError("Failed to write output file: " + outputPath_ + " (" +
496
+ std::strerror(errno) + ")");
497
+ return;
498
+ }
499
+ if (fflush(f.get()) != 0) {
500
+ SetError("Failed to flush output file: " + outputPath_ + " (" +
501
+ std::strerror(errno) + ")");
502
+ return;
503
+ }
504
+ result_.clear();
505
+ }
506
+ } catch (std::exception &e) {
507
+ SetError(e.what());
508
+ }
509
+ }
510
+
511
+ void OnOK() override {
512
+ if (outputPath_.empty()) {
513
+ auto buffer =
514
+ Napi::Buffer<uint8_t>::Copy(Env(), result_.data(), result_.size());
515
+ deferred_.Resolve(buffer);
516
+ } else {
517
+ deferred_.Resolve(Env().Undefined());
518
+ }
519
+ }
520
+
521
+ void OnError(Napi::Error const &error) override {
522
+ deferred_.Reject(error.Value());
523
+ }
524
+
525
+ private:
526
+ Napi::Promise::Deferred deferred_;
527
+ std::vector<uint8_t> bufferData_;
528
+ std::string filePath_;
529
+ bool lossy_;
530
+ int quality_;
531
+ bool useFile_;
532
+ std::string outputPath_;
533
+ std::vector<uint8_t> result_;
534
+ };
535
+
536
+ // ---------------------------------------------------------------------------
537
+ // JS API: compress(input, options)
538
+ // ---------------------------------------------------------------------------
539
+
540
+ static Napi::Value Compress(const Napi::CallbackInfo &info) {
541
+ Napi::Env env = info.Env();
542
+
543
+ if (info.Length() < 1) {
544
+ Napi::TypeError::New(env, "Expected input (Buffer or string)")
545
+ .ThrowAsJavaScriptException();
546
+ return env.Undefined();
547
+ }
548
+
549
+ // parse options
550
+ bool lossy = false;
551
+ int quality = 75;
552
+ std::string outputPath;
553
+
554
+ if (info.Length() >= 2 && info[1].IsObject()) {
555
+ auto options = info[1].As<Napi::Object>();
556
+
557
+ if (options.Has("mode")) {
558
+ auto mode = options.Get("mode").As<Napi::String>().Utf8Value();
559
+ if (mode != "lossy" && mode != "lossless") {
560
+ Napi::TypeError::New(env, "Mode must be 'lossy' or 'lossless'")
561
+ .ThrowAsJavaScriptException();
562
+ return env.Undefined();
563
+ }
564
+ lossy = (mode == "lossy");
565
+ }
566
+
567
+ if (options.Has("quality")) {
568
+ quality = options.Get("quality").As<Napi::Number>().Int32Value();
569
+ if (quality < 1)
570
+ quality = 1;
571
+ if (quality > 100)
572
+ quality = 100;
573
+ }
574
+
575
+ if (options.Has("output"))
576
+ outputPath = options.Get("output").As<Napi::String>().Utf8Value();
577
+ }
578
+
579
+ if (info[0].IsBuffer()) {
580
+ auto buf = info[0].As<Napi::Buffer<uint8_t>>();
581
+ std::vector<uint8_t> data(buf.Data(), buf.Data() + buf.Length());
582
+ auto worker = new CompressWorker(env, std::move(data), lossy, quality,
583
+ std::move(outputPath));
584
+ worker->Queue();
585
+ return worker->Promise();
586
+ }
587
+
588
+ if (info[0].IsString()) {
589
+ auto path = info[0].As<Napi::String>().Utf8Value();
590
+ auto worker = new CompressWorker(env, std::move(path), lossy, quality,
591
+ std::move(outputPath));
592
+ worker->Queue();
593
+ return worker->Promise();
594
+ }
595
+
596
+ Napi::TypeError::New(env, "Input must be a Buffer or file path string")
597
+ .ThrowAsJavaScriptException();
598
+ return env.Undefined();
599
+ }
600
+
601
+ // ---------------------------------------------------------------------------
602
+ // Module init
603
+ // ---------------------------------------------------------------------------
604
+
605
+ static Napi::Object Init(Napi::Env env, Napi::Object exports) {
606
+ exports.Set("compress", Napi::Function::New(env, Compress));
607
+ return exports;
608
+ }
609
+
610
+ NODE_API_MODULE(qpdf_compress, Init)