RubyGems - pdfshaver - Versions diffs - 0.0.1.alpha - Mend

pdfshaver 0.0.1.alpha

Files changed (27) hide show

checksums.yaml +7 -0
data/Gemfile +3 -0
data/Rakefile +10 -0
data/Readme.md +14 -0
data/ext/pdfium_ruby/document.cpp +131 -0
data/ext/pdfium_ruby/document.h +53 -0
data/ext/pdfium_ruby/extconf.rb +68 -0
data/ext/pdfium_ruby/page.cpp +190 -0
data/ext/pdfium_ruby/page.h +36 -0
data/ext/pdfium_ruby/pdfium_ruby.cpp +17 -0
data/ext/pdfium_ruby/pdfium_ruby.h +20 -0
data/lib/pdfshaver.rb +13 -0
data/lib/pdfshaver/document.rb +21 -0
data/lib/pdfshaver/page.rb +117 -0
data/lib/pdfshaver/page_set.rb +83 -0
data/lib/pdfshaver/version.rb +3 -0
data/pdfshaver.gemspec +29 -0
data/test/document_spec.rb +36 -0
data/test/fixtures/completely_encrypted.pdf +0 -0
data/test/fixtures/encrypted.pdf +0 -0
data/test/fixtures/letter-to-canadians-from-jack-layton.pdf +0 -0
data/test/fixtures/uncharter.pdf +0 -0
data/test/gm_compatability_spec.rb +92 -0
data/test/page_set_spec.rb +62 -0
data/test/page_spec.rb +133 -0
data/test/spec_helper.rb +13 -0
metadata +140 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 5a34c73145fff7cd843f8770b4cdb14648b0d8e8
+  data.tar.gz: 543a70e710dc09646580d1bb0fff7bce6f41fe5e
+SHA512:
+  metadata.gz: 3bca072b82f7710883dec16c6be56fcd696ef074c40b55ca1139f3671985ea7690fad1f2e7cfb0333f4ec480d1b92b2c722e3fa990a1e7a8b19f6b2bbc814ac0
+  data.tar.gz: 3ec48a4b055c0b985567f5922194dbd853a489e392f64cf5ee9b641dd3e45ce9641b2028b4b93d6d802c10516b8cfb4e37575e73c29d27d6a831c31f0a551bf1

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+# Resolve gems from the public RubyGems index.
+source 'https://rubygems.org'
+# Take the dependency list from this gem's own .gemspec file.
+gemspec

data/Rakefile ADDED Viewed

@@ -0,0 +1,10 @@
+require "bundler/gem_tasks"
+require 'rake/extensiontask'
+Rake::ExtensionTask.new('pdfium_ruby')
+task :test => :compile do
+  Dir.glob(File.join File.dirname(__FILE__), "test", "**", "*_spec.rb").each{ |test| require test }
+end
+task(default: :test)

data/Readme.md ADDED Viewed

@@ -0,0 +1,14 @@
+# PDFShaver
+Shave pages off of PDFs as images
+### N.B. This is a work in progress
+### Examples
+    require 'pdfshaver'
+    document = PDFShaver::Document.new('./path/to/document.pdf')
+    landscape_pages = document.pages.select{ |page| page.aspect > 1 }
+    landscape_pages.each{ |page| page.render("./page_#{page.number}.gif") }
+copyright 2015 Ted Han, Nathan Stitt & DocumentCloud

data/ext/pdfium_ruby/document.cpp ADDED Viewed

@@ -0,0 +1,131 @@
+#include "document.h"
+/********************************************
+* C++ Document definition
+*********************************************/
+// Construct an empty wrapper; no PDFium document is attached until load() runs.
+Document::Document() {
+  // Start from a null handle so isValid() is well defined before load()
+  // has been called (the handle was previously left uninitialized).
+  this->fpdf_document     = NULL;
+  // Initialize state variables
+  // to mark whether document has been used
+  // and whether document is freeable.
+  this->opened            = false;
+  this->ready_to_be_freed = true;
+}
+// Release the PDFium handle, but only when load() actually produced one.
+Document::~Document() {
+  if (!this->opened) { return; }
+  FPDF_CloseDocument(this->fpdf_document);
+}
+// Open the PDF at `path` through PDFium and record whether that worked.
+//   path - Ruby String (or an object convertible to one) naming the file.
+// Returns 0 when the document was opened, otherwise the code reported by
+// FPDF_GetLastError(). Raises (via the Ruby C API) if `path` is not
+// string-like or contains a NUL byte.
+int Document::load(VALUE path) {
+  // StringValueCStr guarantees a NUL-terminated buffer and rejects embedded
+  // NUL bytes; StringValuePtr gives neither guarantee.
+  this->fpdf_document = FPDF_LoadDocument(StringValueCStr(path), NULL);
+  this->opened = this->isValid();
+  // PDFium leaves the last-error value undefined after a successful call,
+  // so only consult it when loading failed.
+  int parse_status = this->opened ? 0 : (int)FPDF_GetLastError();
+  // indicate that Ruby is still using this document.
+  this->ready_to_be_freed = false;
+  return parse_status;
+}
+// Number of pages PDFium reports for the loaded document.
+int Document::length() {
+  return FPDF_GetPageCount(this->fpdf_document);
+}
+// True when a PDFium document handle is present (non-null).
+bool Document::isValid() {
+  return this->fpdf_document != NULL;
+}
+// Record that the wrapper may be deleted once no pages remain open.
+void Document::flagDocumentAsReadyForRelease() {
+  this->ready_to_be_freed = true;
+}
+// Track `page` as open; the document is not destroyed while this set is non-empty.
+void Document::notifyPageOpened(Page* page) {
+  open_pages.insert(page);
+}
+// Forget `page`, then destroy the document if nothing else keeps it alive.
+void Document::notifyPageClosed(Page* page) {
+  open_pages.erase(page);
+  // The call below may `delete this`; nothing may touch the object afterwards.
+  destroyUnlessPagesAreOpen();
+}
+// Delete this document when it is safe to do so: either it was never opened,
+// or it has been flagged for release and none of its pages are still open.
+void Document::destroyUnlessPagesAreOpen() {
+  bool released_and_idle = this->ready_to_be_freed && this->open_pages.empty();
+  // An opened document that is still referenced, or still has open pages, must stay alive.
+  if (this->opened && !released_and_idle) { return; }
+  delete this;
+}
+/********************************************
+* Ruby class definition and initialization
+*********************************************/
+// Attach the native allocator and the private `open_document_with_pdfium`
+// method to the already-defined Ruby class PDFShaver::Document.
+void Define_Document() {
+  VALUE pdfshaver_module = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
+  VALUE document_class   = rb_const_get(pdfshaver_module, rb_intern("Document"));
+  rb_define_alloc_func(document_class, document_allocate);
+  // Arity -1: the C function receives (argc, argv, self).
+  rb_define_private_method(
+    document_class,
+    "open_document_with_pdfium",
+    CPP_RUBY_METHOD_FUNC(initialize_document_internals),
+    -1);
+}
+// Allocator for PDFShaver::Document: wraps a fresh C++ Document in a Ruby object.
+// No mark function is registered; destroy_document_when_safe is the free function.
+VALUE document_allocate(VALUE rb_PDFShaver_Document) {
+  Document* native_document = new Document();
+  VALUE wrapped = Data_Wrap_Struct(rb_PDFShaver_Document, NULL, destroy_document_when_safe, native_document);
+  return wrapped;
+}
+// Entry point for PDFShaver::Document's ruby initializer into C++ land.
+// Ruby-side arguments: a required `path` and an optional `options` value
+// (scanned for arity checking, but not otherwise used here).
+// Stores the page count in @length and returns self.
+// Raises ArgumentError when PDFium cannot open the file.
+VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
+  // use Ruby's argument scanner to pull out a required
+  // `path` argument and an optional `options` hash.
+  VALUE path, options;
+  rb_scan_args(arg_count, args, "11", &path, &options);
+  // attempt to open document.
+  // path should at this point be validated & known to exist.
+  Document* document;
+  Data_Get_Struct(self, Document, document);
+  // The status code returned by load() is not inspected; success is judged
+  // from the document handle itself.
+  document->load(path);
+  if (!document->isValid()) { rb_raise(rb_eArgError, "failed to open file (%" PRIsVALUE")", path); }
+  // get the document length and store it as an instance variable on the class.
+  rb_ivar_set(self, rb_intern("@length"), INT2FIX(document->length()));
+  return self;
+}
+// Intended to translate a PDFium parse `status` into a Ruby exception that
+// mentions `path`.
+// NOTE(review): the dispatching switch below is commented out, so at present
+// this function only looks up the three PDFShaver error classes and raises
+// nothing itself. rb_const_get raises if a constant is missing — confirm the
+// classes are defined on the Ruby side before re-enabling the call.
+void document_handle_parse_status(int status, VALUE path) {
+  //printf("\nSTATUS: %d\n", status);
+  VALUE rb_PDFShaver            = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
+  VALUE rb_eEncryptionError     = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
+  VALUE rb_eInvalidFormatError  = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
+  VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
+  // Disabled mapping from PDFPARSE_ERROR_* codes to the exception classes above.
+  //switch (status) {
+  //  case PDFPARSE_ERROR_SUCCESS:
+  //    break;
+  //  case PDFPARSE_ERROR_FILE:
+  //    rb_raise(rb_eArgError, "unable to open file (%" PRIsVALUE")", path);
+  //    break;
+  //  case PDFPARSE_ERROR_FORMAT:
+  //    rb_raise(rb_eInvalidFormatError, "file (%" PRIsVALUE") is not a valid PDF", path);
+  //    break;
+  //  case PDFPARSE_ERROR_PASSWORD:
+  //    rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
+  //    break;
+  //  case PDFPARSE_ERROR_CERT:
+  //    rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
+  //    break;
+  //  case PDFPARSE_ERROR_HANDLER:
+  //    rb_raise(rb_eMissingHandlerError, "could not find handler for media objects in file (%" PRIsVALUE")", path);
+  //    break;
+  //}
+}
+// Free function passed to Data_Wrap_Struct by document_allocate.
+// It does not delete the Document unconditionally: it first marks it as
+// releasable, then lets the document decide whether open pages still need it.
+static void destroy_document_when_safe(Document* document) {
+  // Order matters: the flag must be set before the check, and the check may
+  // delete `document`.
+  document->flagDocumentAsReadyForRelease();
+  document->destroyUnlessPagesAreOpen();
+}

data/ext/pdfium_ruby/document.h ADDED Viewed

@@ -0,0 +1,53 @@
+#ifndef __DOCUMENT_H__
+#define __DOCUMENT_H__
+// forward declaration since Page/Document classes are interdependent
+class Page;
+#include "pdfium_ruby.h"
+#include "fpdf_ext.h"
+//#include "core/include/fpdfapi/fpdf_parser.h"
+#include "page.h"
+#include <unordered_set>
+// C++ Class to wrap lifecycle of
+// PDF Documents opened through PDFium.
+// A Document deletes itself (see destroyUnlessPagesAreOpen) once it has been
+// flagged for release and no Page opened from it remains registered.
+class Document {
+  public:
+    // Raw PDFium handle; public because Page::load reads it to load pages.
+    FPDF_DOCUMENT fpdf_document;
+    // constructor
+    Document();
+    // Opens the file named by the Ruby value `path`; returns an int status code.
+    int load(VALUE path);
+    // wrapper for PDFium's pageCount
+    int length();
+    // True when fpdf_document is non-null.
+    bool isValid();
+    // flag to set instances as ready to be disposed of
+    // pending ensuring all its pages have been first closed.
+    void flagDocumentAsReadyForRelease();
+    // a guard for the destructor.
+    // May `delete this`; callers must not use the object afterwards.
+    void destroyUnlessPagesAreOpen();
+    // Removes `page` from the open set and then re-checks whether to destroy.
+    void notifyPageClosed(Page* page);
+    // Adds `page` to the open set.
+    void notifyPageOpened(Page* page);
+    // destructor
+    ~Document();
+  private:
+    //bool subscribeToPage(Page* page);
+    // Set by load(): true when a PDFium handle was obtained.
+    bool opened;
+    // True when the wrapper may be deleted as soon as open_pages is empty.
+    bool ready_to_be_freed;
+    // Pages currently loaded from this document.
+    std::unordered_set<Page*> open_pages;
+};
+// Free function for wrapped Document objects.
+// NOTE(review): declared `static` in a header, so every file including this
+// header gets its own internal-linkage declaration; only document.cpp defines it.
+static void destroy_document_when_safe(Document* document);
+// Ruby-callable (argc, argv, self) initializer that opens the PDF for `self`.
+VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self);
+// Allocator registered for the Ruby Document class.
+VALUE document_allocate(VALUE rb_PDFShaver_Document);
+// Helper meant to map a parse status to a Ruby exception.
+void document_handle_parse_status(int status, VALUE path);
+#endif // __DOCUMENT_H__

data/ext/pdfium_ruby/extconf.rb ADDED Viewed

@@ -0,0 +1,68 @@
+require "mkmf"
+require 'rbconfig'
+# List directories to search for PDFium headers and library files to link against
+def append_pdfium_directory_to paths
+  paths.map do |dir|
+    [
+      File.join(dir, 'pdfium'),
+      File.join(dir, 'pdfium', 'fpdfsdk', 'include'),
+      File.join(dir, 'pdfium', 'third_party', 'base', 'numerics')
+    ]
+  end.flatten + paths
+end
+# Library search path: PDFium sub-directories of the system lib directories,
+# then the system lib directories themselves.
+LIB_DIRS    = append_pdfium_directory_to(['/usr/local/lib/', '/usr/lib/'])
+# Header search path, built the same way from the system include directories.
+HEADER_DIRS = append_pdfium_directory_to(['/usr/local/include/', '/usr/include/'])
+# Hand both search paths to mkmf under the "pdfium" configuration name.
+dir_config("pdfium", HEADER_DIRS, LIB_DIRS)
+LIB_FILES= %w[
+  javascript
+  bigint
+  freetype
+  fpdfdoc
+  fpdftext
+  formfiller
+  icudata
+  icuuc
+  icui18n
+  v8_libbase
+  v8_base
+  v8_snapshot
+  v8_libplatform
+  jsapi
+  pdfwindow
+  fxedit
+  fxcrt
+  fxcodec
+  fpdfdoc
+  fdrm
+  fxge
+  fpdfapi
+  freetype
+  pdfium
+  pthread
+  freeimage
+]
+LIB_FILES.each do | lib |
+  have_library(lib) or abort "Couldn't find library lib#{lib} in #{LIB_DIRS.join(', ')}"
+end
+$CPPFLAGS += " -fPIC -std=c++11"
+if RUBY_PLATFORM =~ /darwin/
+  have_library('objc')
+  FRAMEWORKS = %w{AppKit CoreFoundation}
+  $LDFLAGS << FRAMEWORKS.map { |f| " -framework #{f}" }.join
+else
+  #$CPPFLAGS += " -fPIC -std=c++11"
+end
+create_makefile "pdfium_ruby"

data/ext/pdfium_ruby/page.cpp ADDED Viewed

@@ -0,0 +1,190 @@
+#include "page.h"
+#include <FreeImage.h>
+/********************************************
+* C++ Page definition
+*********************************************/
+// Construct an unloaded page; no document or PDFium page is attached until load() runs.
+Page::Page() {
+  // Null the handles so they never hold indeterminate values before load().
+  this->document  = NULL;
+  this->fpdf_page = NULL;
+  this->opened    = false;
+}
+// Load page `page_index` of `document` through FPDF_LoadPage.
+// Returns true when PDFium produced a page handle, false otherwise.
+// On success the page registers itself with the document so the document
+// stays alive while the page is open.
+bool Page::load(Document* document, int page_index) {
+  this->document = document;
+  this->page_index = page_index;
+  this->fpdf_page = FPDF_LoadPage(document->fpdf_document, page_index);
+  // FPDF_LoadPage returns NULL on failure (for example an out-of-range index).
+  // Leave the page unopened in that case so the destructor neither closes a
+  // null handle nor notifies the document about a page it never tracked.
+  if (!this->fpdf_page) { return false; }
+  document->notifyPageOpened(this);
+  this->opened = true;
+  return true;
+}
+// Page width as reported by PDFium.
+double Page::width() {
+  return FPDF_GetPageWidth(this->fpdf_page);
+}
+// Page height as reported by PDFium.
+double Page::height() {
+  return FPDF_GetPageHeight(this->fpdf_page);
+}
+// Width divided by height.
+double Page::aspect() {
+  double page_width  = this->width();
+  double page_height = this->height();
+  return page_width / page_height;
+}
+// Render this page to an image file.
+//   path   - output file name; the image format is chosen from its extension.
+//   width  - target width in pixels, or 0 to derive it.
+//   height - target height in pixels, or 0 to derive it.
+// With both dimensions 0 the page's own width/height (truncated to int) are
+// used; with only one given, the other follows from the page aspect ratio.
+// Returns true when the file was saved, false on any failure (unusable
+// dimensions, bitmap allocation/conversion failure, unknown format, save error).
+bool Page::render(char* path, int width, int height) {
+  // If no height or width is supplied, render at natural dimensions.
+  if (!width && !height) {
+    width  = (int)(this->width());
+    height = (int)(this->height());
+  }
+  // When given only a height or a width,
+  // infer the other by preserving page aspect ratio.
+  if ( width && !height) { height = width  / this->aspect(); }
+  if (!width &&  height) { width  = height * this->aspect(); }
+  // Reject unusable sizes before allocating or rendering anything. Checking
+  // for positive values first also keeps the division below from dividing by zero.
+  if (width <= 0 || height <= 0) { return false; }
+  if (width > INT_MAX / height) { return false; }
+  // Create bitmap.  width, height, alpha 1=enabled,0=disabled
+  bool alpha = false;
+  FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, alpha);
+  if (!bitmap) { return false; }
+  // fill all pixels with white for the background color
+  FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF);
+  // Render the page into the bitmap.
+  // args are: *buffer, page, start_x, start_y, size_x, size_y, rotation, and flags
+  // flags are:
+  //      0 for normal display, or combination of flags defined below
+  //   0x01 Set if annotations are to be rendered
+  //   0x02 Set if using text rendering optimized for LCD display
+  //   0x04 Set if you don't want to use GDI+
+  int start_x = 0;
+  int start_y = 0;
+  int rotation = 0;
+  int flags = FPDF_PRINTING; // A flag defined in PDFium's codebase.
+  FPDF_RenderPageBitmap(bitmap, this->fpdf_page, start_x, start_y, width, height, rotation, flags);
+  // The stride holds the width of one row in bytes.  It may not be an exact
+  // multiple of the pixel width because the data may be packed to always end on a byte boundary
+  int stride = FPDFBitmap_GetStride(bitmap);
+  // Safety check on the buffer size. Written as a division so that
+  // stride * height is never computed and therefore cannot overflow.
+  bool bitmapIsntValid = (
+    (stride < 0) ||
+    (stride > (INT_MAX / 3) / height)
+  );
+  if (bitmapIsntValid){
+      FPDFBitmap_Destroy(bitmap);
+      return false;
+  }
+  // Read the FPDF bitmap into a FreeImage bitmap.
+  unsigned bpp        = 32;
+  unsigned red_mask   = 0xFF0000;
+  unsigned green_mask = 0x00FF00;
+  unsigned blue_mask  = 0x0000FF;
+  bool     topdown    = true;
+  FIBITMAP *raw = FreeImage_ConvertFromRawBits(
+    (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp, red_mask, green_mask, blue_mask, topdown);
+  // at this point we're done with the FPDF bitmap and can destroy it.
+  FPDFBitmap_Destroy(bitmap);
+  if (!raw) { return false; }
+  // Conversion to jpg or gif require that the bpp be set to 24
+  // since we're not exporting using alpha transparency above in FPDFBitmap_Create
+  FIBITMAP *image = FreeImage_ConvertTo24Bits(raw);
+  FreeImage_Unload(raw);
+  if (!image) { return false; }
+  // figure out the desired format from the file extension
+  FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(path);
+  bool success = false;
+  if ( FIF_GIF == format ){
+      // Gif requires quantization to drop to 8bpp
+      FIBITMAP *gif = FreeImage_ColorQuantize(image, FIQ_WUQUANT);
+      if (gif) {
+        success = FreeImage_Save(FIF_GIF, gif, path, GIF_DEFAULT);
+        FreeImage_Unload(gif);
+      }
+  } else if (format != FIF_UNKNOWN) {
+      // All other recognized formats should be just a save call
+      success = FreeImage_Save(format, image, path, 0);
+  }
+  // unload the image
+  FreeImage_Unload(image);
+  return success;
+}
+// Close the PDFium page and tell the owning document, but only if load() opened it.
+Page::~Page() {
+  if (!this->opened) { return; }
+  FPDF_ClosePage(this->fpdf_page);
+  // Informs the document that this page is gone; the document may destroy itself here.
+  this->document->notifyPageClosed(this);
+}
+/********************************************
+* Ruby class definition and initialization
+*********************************************/
+// Attach the native allocator, the public `render` method and the private
+// `initialize_page_internals` method to the already-defined Ruby class PDFShaver::Page.
+void Define_Page() {
+  VALUE pdfshaver_module = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
+  VALUE page_class       = rb_const_get(pdfshaver_module, rb_intern("Page"));
+  rb_define_alloc_func(page_class, page_allocate);
+  // Both methods use arity -1, i.e. the C functions receive (argc, argv, self).
+  rb_define_method(page_class, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
+  rb_define_private_method(
+    page_class,
+    "initialize_page_internals",
+    CPP_RUBY_METHOD_FUNC(initialize_page_internals),
+    -1);
+}
+// Allocator for PDFShaver::Page: wraps a fresh, unloaded C++ Page in a Ruby object.
+// No mark function is registered; destroy_page is the free function.
+VALUE page_allocate(VALUE rb_PDFShaver_Page) {
+  Page* native_page = new Page();
+  VALUE wrapped = Data_Wrap_Struct(rb_PDFShaver_Page, NULL, destroy_page, native_page);
+  return wrapped;
+}
+// Ruby method PDFShaver::Page#render(path, width: ..., height: ...).
+//   path    - required output file name; its extension selects the image format.
+//   :width  - optional Fixnum target width.
+//   :height - optional Fixnum target height.
+// Returns true or false according to Page::render.
+// Raises ArgumentError when :width/:height is not a Fixnum or when the file
+// extension does not map to a known FreeImage format.
+VALUE page_render(int arg_count, VALUE* args, VALUE self) {
+  VALUE path, options;
+  int width = 0, height = 0;
+  rb_scan_args(arg_count, args, "1:", &path, &options);
+  // `options` is nil when no keywords were supplied; test that directly
+  // instead of inferring it from the raw argument count.
+  if (!NIL_P(options)) {
+    VALUE rb_width  = rb_hash_aref(options, ID2SYM(rb_intern("width")));
+    VALUE rb_height = rb_hash_aref(options, ID2SYM(rb_intern("height")));
+    if (!(NIL_P(rb_width))) {
+      if (FIXNUM_P(rb_width)) { width = FIX2INT(rb_width); }
+      else { rb_raise(rb_eArgError, ":width must be a integer"); }
+    }
+    if (!(NIL_P(rb_height))) {
+      if (FIXNUM_P(rb_height)) { height = FIX2INT(rb_height); }
+      else { rb_raise(rb_eArgError, ":height must be a integer"); }
+    }
+  }
+  // StringValueCStr guarantees a NUL-terminated buffer and rejects embedded
+  // NUL bytes; StringValuePtr gives neither guarantee.
+  char* c_path = StringValueCStr(path);
+  FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(c_path);
+  if (format == FIF_UNKNOWN) { rb_raise(rb_eArgError, "can't save to unrecognized image format"); }
+  Page* page;
+  Data_Get_Struct(self, Page, page);
+  return (page->render(c_path, width, height) ? Qtrue : Qfalse);
+}
+// Ruby-side private initializer for PDFShaver::Page.
+// Ruby arguments: the owning document object, the page index, and an optional
+// `options` value (scanned for arity checking, but not otherwise used here).
+// Loads the page and stores @width, @height (both truncated to integers) and
+// @aspect (Float) on `self`. Returns self.
+// Raises TypeError if the index cannot be converted to an integer, and
+// ArgumentError if the page cannot be loaded.
+VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
+  // use Ruby's argument scanner to pull out two required arguments and one optional one.
+  VALUE rb_document, page_index, options;
+  rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
+  // NOTE(review): rb_document is assumed to wrap a C++ Document; nothing here
+  // verifies its class — confirm callers always pass a PDFShaver::Document.
+  Document* document;
+  Data_Get_Struct(rb_document, Document, document);
+  Page* page;
+  Data_Get_Struct(self, Page, page);
+  // NUM2INT type-checks the value; FIX2INT would reinterpret whatever VALUE it is given.
+  int index = NUM2INT(page_index);
+  // Do not go on to query dimensions of a page that failed to load.
+  if (!page->load(document, index)) {
+    rb_raise(rb_eArgError, "failed to load page at index %d", index);
+  }
+  rb_ivar_set(self, rb_intern("@width"),  INT2FIX(page->width()));
+  rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
+  rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
+  return self;
+}
+// Free function passed to Data_Wrap_Struct by page_allocate; deleting the Page runs ~Page().
+static void destroy_page(Page* page) { delete page; }