RubyGems - pdfshaver - Versions diffs - 0.0.1.alpha1 → 0.0.1 - Mend

pdfshaver 0.0.1.alpha1 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/.gitignore +8 -0
data/Gemfile.lock +26 -0
data/Readme.md +56 -3
data/bench/data_loading_speed.rb +13 -0
data/bench/memory_stress.rb +20 -0
data/bench/setup.rb +53 -0
data/ext/pdfium_ruby/document.cpp +6 -9
data/ext/pdfium_ruby/extconf.rb +9 -4
data/ext/pdfium_ruby/page.cpp +97 -62
data/ext/pdfium_ruby/page.h +5 -1
data/lib/pdfshaver/page.rb +41 -3
data/pdfshaver.gemspec +14 -16
data/test/gc_spec.rb +23 -0
data/test/gm_compatability_spec.rb +2 -2
data/test/page_spec.rb +39 -15
metadata +38 -25
data/test/fixtures/completely_encrypted.pdf +0 -0
data/test/fixtures/encrypted.pdf +0 -0

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 47c0de381cc44eaea49ef37b21928598d0c905cf
-  data.tar.gz: 08cdd9fb20961c659c24667652c1fe07dc4c29bc
+  metadata.gz: 179892daf5c810a3516fef72ded8233423ebb6e7
+  data.tar.gz: ee7d17c718fff0ce34cec44de9f3abed6a86e2c2
 SHA512:
-  metadata.gz: 85f7c759a3ebd29a922db660626872f207579bd4706e2e1c4b14eee71d9b550ebca5e17d579ce30850f201ba63a418da0f3171850f0d6291a54af176f6a72801
-  data.tar.gz: 1c72fd8441a91e4b4245ec1e55894b4fd7693ac4b58a60d2178e00f8560ef952bcefcd2c5c562f1e8dcaf24be9d1c18a4740d3bd78c8828a28969a959b0d3dd4
+  metadata.gz: 97b911682b430e6f8d4314e39563f1f9d23332143d9b0170465b35f26a5caf0466b71543cba257329f9016c7fc304b7d769956f87008beed5bd0f39024fe377e
+  data.tar.gz: 78f35366a38991906e6097f79a1b13751bcd1f6dc75caada0825588407246422c01ed59550eb45709b5e2480cce8febb683a8b8954af2bc0332b404830270251

data/.gitignore ADDED

@@ -0,0 +1,8 @@
+tmp
+pkg
+mkmf.log
+*.bundle
+*.so
+test/output
+.DS_Store
+output

data/Gemfile.lock ADDED

@@ -0,0 +1,26 @@
+PATH
+  remote: .
+  specs:
+    pdfium (0.0.1)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    addressable (2.3.7)
+    fastimage (1.6.6)
+      addressable (~> 2.3, >= 2.3.5)
+    minitest (5.5.1)
+    rake (10.4.2)
+    rake-compiler (0.9.5)
+      rake
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.5)
+  fastimage
+  minitest
+  pdfium!
+  rake
+  rake-compiler

data/Readme.md CHANGED

@@ -1,14 +1,67 @@
 # PDFShaver
-Shave pages off of PDFs as images
+# N.B. THIS IS A WORK IN PROGRESS
-### N.B. This is a work in process
+Shave pages off of PDFs as images
 ### Examples
     require 'pdfshaver'
+    # open a document!
     document = PDFShaver::Document.new('./path/to/document.pdf')
+    # Iterate through its pages
     landscape_pages = document.pages.select{ |page| page.aspect > 1 }
     landscape_pages.each{ |page| page.render("./page_#{page.number}.gif") }
-copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
+copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
+## Installation
+PDFShaver is distributed as a Ruby gem.  Once you have its dependencies installed, all you have to do is type `gem install pdfshaver` (although in some cases you'll need to stick a `sudo` before the command).
+PDFShaver depends on [Google Chrome's `PDFium` library][pdfium], and for now, installing `PDFium` takes a little bit of doing.
+[pdfium]: https://code.google.com/p/pdfium/
+In order to install PDFium, you'll need Python, a C++ compiler, FreeImage and `git`.  All of these tools should be available for your operating system.
+### OSX
+#### C++ compiler
+Check whether you have the xcode command line tools installed by typing `xcode-select -p`.  If this command returns something like `/Applications/Xcode.app/Contents/Developer` then you have the command line tools installed already.
+If you do not already have the xcode command line tools installed, running `xcode-select --install` will start you off down the correct path.
+-------------------
+At this point, it may be convenient to install Homebrew.
+#### Python
+If you're using a recent Mac, you should already have Python 2.7 installed on your machine.  You can check what version of Python you're running by typing `python --version` into your terminal.  If you don't have a recent version of Python (version 2.7 or greater) installed, you'll need to install one.
+#### `git`
+If you have Homebrew installed, simply type `brew install git`.
+### Linux (we'll assume ubuntu or debian)
+#### C++ Compiler
+`sudo apt-get install build-essential`
+#### `git`
+`sudo apt-get install git`
+#### FreeImage
+`sudo apt-get install libfreeimage-dev`
+### Getting PDFium's dependencies
+If you have any trouble check [PDFium's build instructions](https://code.google.com/p/pdfium/wiki/Build) for the most up to date instructions.
+### Getting the PDFium code
+`git clone https://pdfium.googlesource.com/pdfium`

data/bench/data_loading_speed.rb ADDED

@@ -0,0 +1,13 @@
+require_relative 'setup'
+require 'benchmark'
+here = File.dirname(__FILE__)
+path = File.join(here, '..', 'test', 'fixtures', 'uncharter.pdf')
+doc = PDFShaver::Document.new(path)
+out_dir = File.join(here, 'output', 'speed')
+count = 10
+Benchmark.bm do |test|
+  test.report("naive"){ count.times{ doc.pages.each{ |page| full_naive_render(page, out_dir) } } }
+  test.report("smart"){ count.times{ doc.pages.each{ |page| full_smart_render(page, out_dir) } } }
+end

data/bench/memory_stress.rb ADDED

@@ -0,0 +1,20 @@
+require_relative 'setup'
+some_number_of = ARGV.pop.to_i
+some_number_of = 5 if some_number_of <= 0
+puts "firing up #{some_number_of} forks"
+some_number_of.times do |n|
+  fork do
+    here = File.dirname(__FILE__)
+    path = File.join(here, '..', 'test', 'fixtures', 'uncharter.pdf')
+    extract(path, n)
+  end
+end
+=begin
+Questions:
+* What can/should trigger Ruby's GC?
+* What's the stack size look like?
+* Is Ruby accurately reporting the amount of memory allocated? (how do we compare?) no!
+* Can we notify Ruby about memory allocated in C/C++? No! \weep
+=end

data/bench/setup.rb ADDED

@@ -0,0 +1,53 @@
+require_relative '../lib/pdfshaver'
+require 'fileutils'
+require 'pp'
+def extract(doc_path, prefix=rand(10**10))
+  out_dir = File.join(".", "output", prefix.to_s)
+  FileUtils.mkdir_p(out_dir)
+  log = File.open(File.join(out_dir, "log.txt"), 'w')
+  log.sync = true
+  doc = PDFShaver::Document.new(doc_path)
+  doc.pages.each do |page|
+    log.puts("#{Time.now}: rendering page #{page.number}")
+    # shamelessly stolen from http://samsaffron.com/archive/2014/04/08/ruby-2-1-garbage-collection-ready-for-production
+    log.puts "RSS: #{`ps -eo rss,pid | grep #{Process.pid} | grep -v grep | awk '{ print $1;  }'`}"
+    #GC.start
+    #log.puts(GC.stat)
+    easy_render(page, out_dir)
+  end
+  log.puts ("#{Time.now}: Done!")
+end
+# A method to test basic rendering
+def easy_render(page, dir)
+  out_path = File.join(dir,"#{page.number}.gif")
+  page.render(out_path)
+end
+# A method for testing rendering a variety of pages
+# but as it turns out rendering isn't the problem so
+# this method isn't any heavier in memory usage than
+# the easy render!
+def full_naive_render(page, dir)
+  sizes = %w[1000x 700x 180x 60x75!]
+  sizes.each do |size_string|
+    dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
+    out_path = File.join(dir,"#{page.number}_#{size_string}.gif")
+    #puts out_path
+    page.render(out_path, dimensions)
+  end
+end
+def full_smart_render(p, dir)
+  p.with_data_loaded do |page|
+    sizes = %w[1000x 700x 180x 60x75!]
+    sizes.each do |size_string|
+      dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
+      out_path = File.join(dir,"#{page.number}_#{size_string}.gif")
+      #puts out_path
+      page.render(out_path, dimensions)
+    end
+  end
+end

data/ext/pdfium_ruby/document.cpp CHANGED

@@ -75,10 +75,6 @@ VALUE document_allocate(VALUE rb_PDFShaver_Document) {
 // Entry point for PDFShaver::Document's ruby initializer into C++ land
 VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
-  // Get the PDFShaver namespace and get the `Document` class inside it.
-  VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
-  VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
   // use Ruby's argument scanner to pull out a required
   // `path` argument and an optional `options` hash.
   VALUE path, options;
@@ -88,7 +84,8 @@ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
   // path should at this point be validated & known to exist.
   Document* document;
   Data_Get_Struct(self, Document, document);
-  int parse_status = document->load(path);
+  //int parse_status =
+  document->load(path);
   //document_handle_parse_status(parse_status, path);
   if (!document->isValid()) { rb_raise(rb_eArgError, "failed to open file (%" PRIsVALUE")", path); }
@@ -99,10 +96,10 @@ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
 void document_handle_parse_status(int status, VALUE path) {
   //printf("\nSTATUS: %d\n", status);
-  VALUE rb_PDFShaver            = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
-  VALUE rb_eEncryptionError     = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
-  VALUE rb_eInvalidFormatError  = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
-  VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
+  //VALUE rb_PDFShaver            = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
+  //VALUE rb_eEncryptionError     = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
+  //VALUE rb_eInvalidFormatError  = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
+  //VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
   //switch (status) {
   //  case PDFPARSE_ERROR_SUCCESS:

data/ext/pdfium_ruby/extconf.rb CHANGED

@@ -2,9 +2,9 @@ require "mkmf"
 require 'rbconfig'
 # List directories to search for PDFium headers and library files to link against
 def append_pdfium_directory_to paths
-  paths.map do |dir|
+  paths.map do |dir|
     [
-      File.join(dir, 'pdfium'),
+      File.join(dir, 'pdfium'),
       File.join(dir, 'pdfium', 'fpdfsdk', 'include'),
       File.join(dir, 'pdfium', 'third_party', 'base', 'numerics')
     ]
@@ -56,13 +56,18 @@ LIB_FILES.each do | lib |
   have_library(lib) or abort "Couldn't find library lib#{lib} in #{LIB_DIRS.join(', ')}"
 end
-$CPPFLAGS += " -fPIC -std=c++11"
+$CPPFLAGS += " -fPIC -std=c++11 -Wall"
 if RUBY_PLATFORM =~ /darwin/
   have_library('objc')
   FRAMEWORKS = %w{AppKit CoreFoundation}
   $LDFLAGS << FRAMEWORKS.map { |f| " -framework #{f}" }.join
+end
+if ENV['DEBUG'] == '1'
+  $defs.push "-DDEBUG=1"
+  $CPPFLAGS += " -g"
 else
-  #$CPPFLAGS += " -fPIC -std=c++11"
+  $CPPFLAGS += " -O2"
 end
 create_makefile "pdfium_ruby"

data/ext/pdfium_ruby/page.cpp CHANGED

@@ -5,22 +5,49 @@
 * C++ Page definition
 *********************************************/
+// When created make sure C++ pages are marked as unopened.
 Page::Page() { this->opened = false; }
-bool Page::load(Document* document, int page_index) {
+// When destroying a C++ Page, make sure to dispose of the internals properly.
+// And notify the parent document that this page is no longer going to be used.
+Page::~Page() {
+  if (this->opened) {
+    this->unload();
+    this->document->notifyPageClosed(this);
+  }
+}
+// When the page is initialized through the Ruby lifecycle, store a reference
+// to its parent Document, the page number and notify the Document that this page
+// is available to be loaded.
+void Page::initialize(Document* document, int page_index) {
   this->document = document;
   this->page_index = page_index;
-  this->fpdf_page = FPDF_LoadPage(document->fpdf_document, page_index);
-  document->notifyPageOpened(this);
-  this->opened = true;
+  this->document->notifyPageOpened(this);
+}
+// Load the page through PDFium and flag the page as currently open.
+bool Page::load() {
+  if (!this->opened) {
+    this->fpdf_page = FPDF_LoadPage(this->document->fpdf_document, this->page_index);
+    this->opened = true;
+  }
   return this->opened;
 }
+// Unload the page (freeing the page's memory) and mark it as not open.
+void Page::unload() {
+  if (this->opened){ FPDF_ClosePage(this->fpdf_page); }
+  this->opened = false;
+}
+// readers for the page's dimensions.
 double Page::width(){ return FPDF_GetPageWidth(this->fpdf_page); }
 double Page::height(){ return FPDF_GetPageHeight(this->fpdf_page); }
 double Page::aspect() { return width() / height(); }
+// Render the page to a destination path with the dimensions
+// specified by width & height (or appropriate defaults).
 bool Page::render(char* path, int width, int height) {
   // If no height or width is supplied, render at natural dimensions.
   if (!width && !height) {
@@ -31,21 +58,16 @@ bool Page::render(char* path, int width, int height) {
   // infer the other by preserving page aspect ratio.
   if ( width && !height) { height = width  / this->aspect(); }
   if (!width &&  height) { width  = height * this->aspect(); }
-  //printf("Derp? %d, %d\n", width, height);
   // Create bitmap.  width, height, alpha 1=enabled,0=disabled
   bool alpha = false;
-  printf("just about to allocate bitmap w:%d, h:%d\n", width, height);
   FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, alpha);
-  printf("BITMAP CREATED\n");
-  if (!bitmap) { printf("ALLOCATING BITMAP FAILED"); return false; }
+  if (!bitmap) { return false; }
-  printf("FILLING BITMAP");
-  // fill all pixels with white for the background color
+  // and fill all pixels with white for the background color
   FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF);
-  printf("BITMAP FILLED");
-  // Render a page to a bitmap in RGBA format
-  // args are: *buffer, page, start_x, start_y, size_x, size_y, rotation, and flags
+  // Render a page into the bitmap in RGBA format
   // flags are:
   //      0 for normal display, or combination of flags defined below
   //   0x01 Set if annotations are to be rendered
@@ -56,7 +78,8 @@ bool Page::render(char* path, int width, int height) {
   int rotation = 0;
   int flags = FPDF_PRINTING; // A flag defined in PDFium's codebase.
   FPDF_RenderPageBitmap(bitmap, this->fpdf_page, start_x, start_y, width, height, rotation, flags);
-  printf("RENDERED BITMAP");
+  // Calculate the page's stride.
   // The stride holds the width of one row in bytes.  It may not be an exact
   // multiple of the pixel width because the data may be packed to always end on a byte boundary
   int stride = FPDFBitmap_GetStride(bitmap);
@@ -69,34 +92,30 @@ bool Page::render(char* path, int width, int height) {
     ((stride * height) > (INT_MAX / 3))
   );
   if (bitmapIsntValid){
-      printf("BITMAP ISN'T VALID");
       FPDFBitmap_Destroy(bitmap);
       return false;
   }
-  // Read the FPDF bitmap into a FreeImage bitmap.
+  // Hand off the PDFium bitmap data to FreeImage for additional processing.
   unsigned bpp        = 32;
   unsigned red_mask   = 0xFF0000;
   unsigned green_mask = 0x00FF00;
   unsigned blue_mask  = 0x0000FF;
   bool     topdown    = true;
   FIBITMAP *raw = FreeImage_ConvertFromRawBits(
-    (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp, red_mask, green_mask, blue_mask, topdown);
+    (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp,
+    red_mask, green_mask, blue_mask, topdown);
-  printf("ALLOCATED MAP");
-  // at this point we're done with the FPDF bitmap and can destroy it.
+  // With bitmap handoff complete the FPDF bitmap can be destroyed.
   FPDFBitmap_Destroy(bitmap);
-  printf("FREE BITMAP");
   // Conversion to jpg or gif require that the bpp be set to 24
   // since we're not exporting using alpha transparency above in FPDFBitmap_Create
   FIBITMAP *image = FreeImage_ConvertTo24Bits(raw);
-  printf("CONVERT TO 24BITS2");
   FreeImage_Unload(raw);
-  printf("DEALLOCATE RAW");
   // figure out the desired format from the file extension
   FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(path);
-  printf("DEDUCE FORMAT");
   bool success = false;
   if ( FIF_GIF == format ){
@@ -108,19 +127,11 @@ bool Page::render(char* path, int width, int height) {
       // All other formats should be just a save call
       success = FreeImage_Save(format, image, path, 0);
   }
-  printf("SAVED IMAGE");
   // unload the image
   FreeImage_Unload(image);
-  printf("UNLOADED IMAGE");
-  return success;
-}
-Page::~Page() {
-  if (this->opened) {
-    FPDF_ClosePage(this->fpdf_page);
-    this->document->notifyPageClosed(this);
-  }
+  return success;
 }
 /********************************************
@@ -132,17 +143,64 @@ void Define_Page() {
   VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
   VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
+  // Define the C allocator function so that when a new PDFShaver::Page instance
+  // is created, our C/C++ data structures are initialized into the Ruby lifecycle.
   rb_define_alloc_func(rb_PDFShaver_Page, *page_allocate);
-  rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
+  // Wire the C functions we need/want into Ruby land.
+  // We're using the CPP_RUBY_METHOD_FUNC to wrap functions for C++'s comfort.
   rb_define_private_method(rb_PDFShaver_Page, "initialize_page_internals",
                             CPP_RUBY_METHOD_FUNC(initialize_page_internals),-1);
+  rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
+  rb_define_private_method(rb_PDFShaver_Page, "load_data",   CPP_RUBY_METHOD_FUNC(page_load_data), 0);
+  rb_define_private_method(rb_PDFShaver_Page, "unload_data", CPP_RUBY_METHOD_FUNC(page_unload_data), 0);
 }
+// Create a new C++ Page object and store it in any newly created
+// Ruby page instances.
 VALUE page_allocate(VALUE rb_PDFShaver_Page) {
   Page* page = new Page();
   return Data_Wrap_Struct(rb_PDFShaver_Page, NULL, destroy_page, page);
 }
+// And delete the C++ page when we're done with the Ruby page.
+static void destroy_page(Page* page) { delete page; }
+// This function does the actual initialization of the C++ page's internals
+// defining which page of the document will be opened when `load_data` is called.
+VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
+  // use Ruby's argument scanner to pull out the required document and page index arguments, plus an optional `options` value.
+  VALUE rb_document, page_index, options;
+  int number_of_args = rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
+  // fetch the C++ document from the Ruby document the page has been initialized with
+  Document* document;
+  Data_Get_Struct(rb_document, Document, document);
+  // And fetch the C++ page
+  Page* page;
+  Data_Get_Struct(self, Page, page);
+  // and associate them by initializing the C++ page.
+  page->initialize(document, FIX2INT(page_index));
+  return self;
+}
+VALUE page_load_data(VALUE self) {
+  Page* page;
+  Data_Get_Struct(self, Page, page);
+  if (! page->load() ) { rb_raise(rb_eRuntimeError, "Unable to load page data"); }
+  rb_ivar_set(self, rb_intern("@extension_data_is_loaded"),  Qtrue);
+  rb_ivar_set(self, rb_intern("@width"),  INT2FIX(page->width()));
+  rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
+  rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
+  return Qtrue;
+}
+VALUE page_unload_data(VALUE self) {
+  Page* page;
+  Data_Get_Struct(self, Page, page);
+  page->unload();
+  rb_ivar_set(self, rb_intern("@extension_data_is_loaded"),  Qfalse);
+  return Qtrue;
+}
 //bool page_render(int arg_count, VALUE* args, VALUE self) {
 VALUE page_render(int arg_count, VALUE* args, VALUE self) {
@@ -169,31 +227,8 @@ VALUE page_render(int arg_count, VALUE* args, VALUE self) {
   Page* page;
   Data_Get_Struct(self, Page, page);
-  return (page->render(StringValuePtr(path), width, height) ? Qtrue : Qfalse);
-}
-VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
-  // use Ruby's argument scanner to pull out a required
-  VALUE rb_document, page_index, options;
-  int number_of_args = rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
-  // Get the PDFShaver namespace and get the `Page` class inside it.
-  VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
-  VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
-  Document* document;
-  Data_Get_Struct(rb_document, Document, document);
-  Page* page;
-  Data_Get_Struct(self, Page, page);
-  page->load(document, FIX2INT(page_index));
-  rb_ivar_set(self, rb_intern("@width"),  INT2FIX(page->width()));
-  rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
-  rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
-  return self;
-}
-static void destroy_page(Page* page) { delete page; }
+  page_load_data(self);
+  VALUE output = (page->render(StringValuePtr(path), width, height) ? Qtrue : Qfalse);
+  page_unload_data(self);
+  return output;
+}

data/ext/pdfium_ruby/page.h CHANGED

@@ -10,7 +10,9 @@ class Page {
   public:
     Page();
-    bool load(Document* document, int page_number);
+    void initialize(Document* document, int page_number);
+    bool load();
+    void unload();
     double width();
     double height();
@@ -31,6 +33,8 @@ void Define_Page();
 VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
 VALUE page_render(int arg_count, VALUE* args, VALUE self);
 VALUE page_allocate(VALUE rb_PDFShaver_Page);
+VALUE page_load_data(VALUE rb_PDFShaver_Page);
+VALUE page_unload_data(VALUE rb_PDFShaver_Page);
 static void destroy_page(Page* page);
 #endif

data/lib/pdfshaver/page.rb CHANGED

@@ -1,7 +1,7 @@
 module PDFShaver
   class Page
     GM_MATCHER = /^\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*$/
-    attr_reader :document, :width, :height, :aspect, :number, :index
+    attr_reader :document, :number, :index
     def initialize document, number, options={}
       raise ArgumentError unless document.kind_of? PDFShaver::Document
@@ -11,6 +11,7 @@ module PDFShaver
       @number   = number
       @index    = number - 1
       @document = document
+      @extension_data_is_loaded = false
       initialize_page_internals document, @index
     end
@@ -24,6 +25,40 @@ module PDFShaver
       self.index <=> other.index
     end
+    def height
+      load_dimensions unless @height
+      @height
+    end
+    def width
+      load_dimensions unless @width
+      @width
+    end
+    def aspect
+      load_dimensions unless @aspect
+      @aspect
+    end
+    def with_data_loaded &block
+      load_data
+      output = yield self
+      unload_data
+      output
+    end
+    private
+    def load_dimensions
+      with_data_loaded do
+        # don't have to do anything, because loading/unloading page data
+        # will populate our dimensions.
+      end
+    end
+    public
+    # This code was written with the GraphicsMagick geometry argument parser
+    # as a direct reference.  Its intent is to provide a compatibility layer
+    # for specifying page geometry that functions identically to graphicsmagick's.
     def extract_dimensions_from_gm_geometry_string(arg)
       dimensions = {}
       return dimensions if arg.nil? or arg.empty?
@@ -44,6 +79,8 @@ module PDFShaver
           current_area = self.width * self.height
           target_area  = (requested_width || 1) * (requested_height || 1)
+          # if upper or lower bounds are supplied
+          # check whether the target_area size adheres to that constraint.
           resize = if modifier.include? '>'
             current_area > target_area
           elsif modifier.include? '<'
@@ -52,6 +89,7 @@ module PDFShaver
             true
           end
+          # Calculate page dimensions based on area
           if resize
             scale = 1.0 / Math.sqrt(current_area/target_area)
             dimensions[:width]  = (self.width*scale+0.25).floor
@@ -69,8 +107,8 @@ module PDFShaver
             width  = (self.width.to_f/self.height*height+0.5).floor
           end
-          # If proportional mode is requested
-          #
+          # For proportional mode, scales are specified by percent.
+          # Sizes are recalculated and stored as the target width in place for further processing
           if modifier.include? '%'
             x_scale = width
             y_scale = height

data/pdfshaver.gemspec CHANGED

@@ -4,26 +4,24 @@ require 'pdfshaver/version'
 Gem::Specification.new do |s|
   s.name        = 'pdfshaver'
-  s.version     = PDFShaver::VERSION + ".alpha1"
+  s.version     = PDFShaver::VERSION
   s.licenses    = ['MIT']
   s.summary     = "Shave pages off of PDFs as images"
+  s.description = <<-DESCRIPTION
+  Shave pages off of PDFs as images.  PDFShaver makes iterating PDF pages easy
+  by wrapping Google Chrome's PDFium library in an enumerable interface.
+  DESCRIPTION
+  s.homepage    = 'https://www.documentcloud.org/opensource'
   s.authors     = ["Ted Han", "Nathan Stitt"]
   s.email       = 'opensource@documentcloud.org'
   s.extensions = 'ext/pdfium_ruby/extconf.rb'
-  s.files       = Dir.glob %w[
-    lib/pdfshaver.rb
-    lib/*/**/*
-    ext/**/*
-    test/**/*
-    Gemfile
-    pdfshaver.gemspec
-    Rakefile
-    Readme.md
-  ]
+  s.files       = `git ls-files -z`.split("\x0")
+  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]
-  s.add_development_dependency "bundler", "~> 1.5"
-  s.add_development_dependency 'rake'
-  s.add_development_dependency 'rake-compiler'
-  s.add_development_dependency 'minitest'
-  s.add_development_dependency 'fastimage'
+  s.add_development_dependency "bundler",       "~> 1.5"
+  s.add_development_dependency 'rake',          "~>10.4"
+  s.add_development_dependency 'rake-compiler', "~>0.9"
+  s.add_development_dependency 'minitest',      "~>5.5"
+  s.add_development_dependency 'fastimage',     "~>1.6"
 end

data/test/gc_spec.rb ADDED

@@ -0,0 +1,23 @@
+require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
+describe GC do
+  it "won't segfault if when a document is GCed" do
+    doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
+    doc = nil
+    GC.start
+  end
+  it "won't segfault if when an invalid document is GCed" do
+    Proc.new{ PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
+    GC.start
+  end
+  it "won't segfault if document falls out of scope before pages" do
+    doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
+    p1 = PDFShaver::Page.new(doc, 1)
+    doc = nil
+    GC.start
+    p1 = nil
+    GC.start
+  end
+end

data/test/gm_compatability_spec.rb CHANGED

@@ -81,8 +81,8 @@ describe "Resize arguments" do
         "200x200@"         => Size.new(176, 227),
         "1000>"            => base,
         #"1000<"            => Size.new(773, 1000),
-        "500>"             => Size.new(386, 500),
-        "500x>"            => Size.new(500, 647)
+        "500>"             => Size.new(390, 500),
+        "500x>"            => Size.new(500, 640)
       }.each do |input, expected|
         #puts "#{input} : #{expected.inspect}"
         output = @page.extract_dimensions_from_gm_geometry_string(input)

data/test/page_spec.rb CHANGED

@@ -109,25 +109,49 @@ describe PDFShaver::Page do
     end
   end
-  describe "GC" do
-    it "won't segfault if when a document is GCed" do
-      doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
-      doc = nil
-      GC.start
+  describe "lazy loading" do
+    before do
+      @page = PDFShaver::Page.new(@document, 1)
+      @output_path = File.join OUTPUT, 'image_render_test.gif'
+    end
+    it "should be safe to reuse pages" do
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.render(@output_path)
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.render(@output_path)
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
     end
-    it "won't segfault if when an invalid document is GCed" do
-      Proc.new{ PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
-      GC.start
+    it "should not load data until requested" do
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.instance_variable_get("@height").must_equal nil
+      @page.instance_variable_get("@width").must_equal nil
+      @page.instance_variable_get("@aspect").must_equal nil
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.send(:load_dimensions)
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.height.wont_equal nil
+      @page.width.wont_equal nil
+      @page.aspect.wont_equal nil
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
     end
-    it "won't segfault if document falls out of scope before pages" do
-      doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
-      p1 = PDFShaver::Page.new(doc, 1)
-      doc = nil
-      GC.start
-      p1 = nil
-      GC.start
+    it "should provide a scope where data is kept loaded" do
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+      @page.with_data_loaded do
+        @page.instance_variable_get("@extension_data_is_loaded").must_equal true
+      end
+      @page.instance_variable_get("@extension_data_is_loaded").must_equal false
+    end
+    it "shouldn't blow up if nested twice" do
+      @page.with_data_loaded do |p|
+        p.with_data_loaded do |lol|
+          lol
+        end
+      end
     end
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdfshaver
 version: !ruby/object:Gem::Version
-  version: 0.0.1.alpha1
+  version: 0.0.1
 platform: ruby
 authors:
 - Ted Han
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-18 00:00:00.000000000 Z
+date: 2015-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -29,68 +29,74 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '10.4'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '10.4'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.9'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.9'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '5.5'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '5.5'
 - !ruby/object:Gem::Dependency
   name: fastimage
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
-description:
+        version: '1.6'
+description: "  Shave pages off of PDFs as images.  PDFShaver makes iterating PDF
+  pages easy \n  by wrapping Google Chrome's PDFium library in an enumerable interface.\n"
 email: opensource@documentcloud.org
 executables: []
 extensions:
 - ext/pdfium_ruby/extconf.rb
 extra_rdoc_files: []
 files:
+- ".gitignore"
 - Gemfile
+- Gemfile.lock
 - Rakefile
 - Readme.md
+- bench/data_loading_speed.rb
+- bench/memory_stress.rb
+- bench/setup.rb
 - ext/pdfium_ruby/document.cpp
 - ext/pdfium_ruby/document.h
 - ext/pdfium_ruby/extconf.rb
@@ -105,15 +111,14 @@ files:
 - lib/pdfshaver/version.rb
 - pdfshaver.gemspec
 - test/document_spec.rb
-- test/fixtures/completely_encrypted.pdf
-- test/fixtures/encrypted.pdf
 - test/fixtures/letter-to-canadians-from-jack-layton.pdf
 - test/fixtures/uncharter.pdf
+- test/gc_spec.rb
 - test/gm_compatability_spec.rb
 - test/page_set_spec.rb
 - test/page_spec.rb
 - test/spec_helper.rb
-homepage:
+homepage: https://www.documentcloud.org/opensource
 licenses:
 - MIT
 metadata: {}
@@ -128,13 +133,21 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">"
+  - - ">="
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubyforge_project:
 rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Shave pages off of PDFs as images
-test_files: []
+test_files:
+- test/document_spec.rb
+- test/fixtures/letter-to-canadians-from-jack-layton.pdf
+- test/fixtures/uncharter.pdf
+- test/gc_spec.rb
+- test/gm_compatability_spec.rb
+- test/page_set_spec.rb
+- test/page_spec.rb
+- test/spec_helper.rb

data/test/fixtures/completely_encrypted.pdf DELETED

Binary file

data/test/fixtures/encrypted.pdf DELETED

Binary file