pdfshaver 0.0.1.alpha1 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 47c0de381cc44eaea49ef37b21928598d0c905cf
4
- data.tar.gz: 08cdd9fb20961c659c24667652c1fe07dc4c29bc
3
+ metadata.gz: 179892daf5c810a3516fef72ded8233423ebb6e7
4
+ data.tar.gz: ee7d17c718fff0ce34cec44de9f3abed6a86e2c2
5
5
  SHA512:
6
- metadata.gz: 85f7c759a3ebd29a922db660626872f207579bd4706e2e1c4b14eee71d9b550ebca5e17d579ce30850f201ba63a418da0f3171850f0d6291a54af176f6a72801
7
- data.tar.gz: 1c72fd8441a91e4b4245ec1e55894b4fd7693ac4b58a60d2178e00f8560ef952bcefcd2c5c562f1e8dcaf24be9d1c18a4740d3bd78c8828a28969a959b0d3dd4
6
+ metadata.gz: 97b911682b430e6f8d4314e39563f1f9d23332143d9b0170465b35f26a5caf0466b71543cba257329f9016c7fc304b7d769956f87008beed5bd0f39024fe377e
7
+ data.tar.gz: 78f35366a38991906e6097f79a1b13751bcd1f6dc75caada0825588407246422c01ed59550eb45709b5e2480cce8febb683a8b8954af2bc0332b404830270251
@@ -0,0 +1,8 @@
1
+ tmp
2
+ pkg
3
+ mkmf.log
4
+ *.bundle
5
+ *.so
6
+ test/output
7
+ .DS_Store
8
+ output
@@ -0,0 +1,26 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdfium (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ addressable (2.3.7)
10
+ fastimage (1.6.6)
11
+ addressable (~> 2.3, >= 2.3.5)
12
+ minitest (5.5.1)
13
+ rake (10.4.2)
14
+ rake-compiler (0.9.5)
15
+ rake
16
+
17
+ PLATFORMS
18
+ ruby
19
+
20
+ DEPENDENCIES
21
+ bundler (~> 1.5)
22
+ fastimage
23
+ minitest
24
+ pdfium!
25
+ rake
26
+ rake-compiler
data/Readme.md CHANGED
@@ -1,14 +1,67 @@
1
1
  # PDFShaver
2
2
 
3
- Shave pages off of PDFs as images
3
+ # N.B. THIS IS A WORK IN PROGRESS
4
4
 
5
- ### N.B. This is a work in process
5
+ Shave pages off of PDFs as images
6
6
 
7
7
  ### Examples
8
8
 
9
9
  require 'pdfshaver'
10
+ # open a document!
10
11
  document = PDFShaver::Document.new('./path/to/document.pdf')
12
+ # Iterate through its pages
11
13
  landscape_pages = document.pages.select{ |page| page.aspect > 1 }
12
14
  landscape_pages.each{ |page| page.render("./page_#{page.number}.gif") }
13
15
 
14
- copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
16
+ copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
17
+
18
+ ## Installation
19
+
20
+ PDFShaver is distributed as a Ruby gem. Once you have its dependencies installed, all you have to do is type `gem install pdfshaver` (although in some cases you'll need to stick a `sudo` before the command).
21
+
22
+ PDFShaver depends on [Google Chrome's `PDFium` library][pdfium], and for now, installing `PDFium` takes a little bit of doing.
23
+
24
+ [pdfium]: https://code.google.com/p/pdfium/
25
+
26
+ In order to install PDFium, you'll need Python, a C++ compiler, FreeImage and `git`. All of these tools should be available for your operating system.
27
+
28
+ ### OSX
29
+
30
+
31
+ #### C++ compiler
32
+ Check whether you have the xcode command line tools installed by typing `xcode-select -p`. If this command returns something like `/Applications/Xcode.app/Contents/Developer` then you have the command line tools installed already.
33
+
34
+ If you do not already have the Xcode command line tools installed, running `xcode-select --install` will start you off down the correct path.
35
+
36
+ -------------------
37
+
38
+ At this point, it may be convenient to install Homebrew.
39
+
40
+ #### Python
41
+
42
+ If you're using a recent Mac, you should already have Python 2.7 installed on your machine. You can check what version of Python you're running by typing `python --version` into your terminal. If you don't have a recent version of Python (version 2.7 or greater) installed, you'll need to install one before continuing.
43
+
44
+ #### `git`
45
+
46
+ If you have Homebrew installed, simply type `brew install git`.
47
+
48
+ ### Linux (we'll assume ubuntu or debian)
49
+
50
+ #### C++ Compiler
51
+ `sudo apt-get install build-essential`
52
+ #### `git`
53
+ `sudo apt-get install git`
54
+ #### FreeImage
55
+ `sudo apt-get install libfreeimage-dev`
56
+
57
+ ### Getting PDFium's dependencies
58
+
59
+ If you have any trouble, check [PDFium's build instructions](https://code.google.com/p/pdfium/wiki/Build) for the most up-to-date instructions.
60
+
61
+
62
+
63
+ ### Getting the PDFium code
64
+
65
+ `git clone https://pdfium.googlesource.com/pdfium`
66
+
67
+
@@ -0,0 +1,13 @@
1
+ require_relative 'setup'
2
+ require 'benchmark'
3
+
4
+ here = File.dirname(__FILE__)
5
+ path = File.join(here, '..', 'test', 'fixtures', 'uncharter.pdf')
6
+ doc = PDFShaver::Document.new(path)
7
+ out_dir = File.join(here, 'output', 'speed')
8
+
9
+ count = 10
10
+ Benchmark.bm do |test|
11
+ test.report("naive"){ count.times{ doc.pages.each{ |page| full_naive_render(page, out_dir) } } }
12
+ test.report("smart"){ count.times{ doc.pages.each{ |page| full_smart_render(page, out_dir) } } }
13
+ end
@@ -0,0 +1,20 @@
1
+ require_relative 'setup'
2
+
3
+ some_number_of = ARGV.pop.to_i
4
+ some_number_of = 5 if some_number_of <= 0
5
+ puts "firing up #{some_number_of} forks"
6
+ some_number_of.times do |n|
7
+ fork do
8
+ here = File.dirname(__FILE__)
9
+ path = File.join(here, '..', 'test', 'fixtures', 'uncharter.pdf')
10
+ extract(path, n)
11
+ end
12
+ end
13
+
14
+ =begin
15
+ Questions:
16
+ * What can/should trigger Ruby's GC?
17
+ * What's the stack size look like?
18
+ * Is Ruby accurately reporting the amount of memory allocated? (how do we compare?) no!
19
+ * Can we notify Ruby about memory allocated in C/C++? No! \weep
20
+ =end
@@ -0,0 +1,53 @@
1
+ require_relative '../lib/pdfshaver'
2
+ require 'fileutils'
3
+ require 'pp'
4
+
5
+ def extract(doc_path, prefix=rand(10**10))
6
+ out_dir = File.join(".", "output", prefix.to_s)
7
+ FileUtils.mkdir_p(out_dir)
8
+ log = File.open(File.join(out_dir, "log.txt"), 'w')
9
+ log.sync = true
10
+ doc = PDFShaver::Document.new(doc_path)
11
+ doc.pages.each do |page|
12
+ log.puts("#{Time.now}: rendering page #{page.number}")
13
+ # shamelessly stolen from http://samsaffron.com/archive/2014/04/08/ruby-2-1-garbage-collection-ready-for-production
14
+ log.puts "RSS: #{`ps -eo rss,pid | grep #{Process.pid} | grep -v grep | awk '{ print $1; }'`}"
15
+ #GC.start
16
+ #log.puts(GC.stat)
17
+ easy_render(page, out_dir)
18
+ end
19
+ log.puts ("#{Time.now}: Done!")
20
+ end
21
+
22
+ # A method to test basic rendering
23
+ def easy_render(page, dir)
24
+ out_path = File.join(dir,"#{page.number}.gif")
25
+ page.render(out_path)
26
+ end
27
+
28
+ # A method for testing rendering a variety of pages
29
+ # but as it turns out rendering isn't the problem so
30
+ # this method isn't any heavier in memory usage than
31
+ # the easy render!
32
+ def full_naive_render(page, dir)
33
+ sizes = %w[1000x 700x 180x 60x75!]
34
+ sizes.each do |size_string|
35
+ dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
36
+ out_path = File.join(dir,"#{page.number}_#{size_string}.gif")
37
+ #puts out_path
38
+ page.render(out_path, dimensions)
39
+ end
40
+ end
41
+
42
+ def full_smart_render(p, dir)
43
+ p.with_data_loaded do |page|
44
+ sizes = %w[1000x 700x 180x 60x75!]
45
+ sizes.each do |size_string|
46
+ dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
47
+ out_path = File.join(dir,"#{page.number}_#{size_string}.gif")
48
+ #puts out_path
49
+ page.render(out_path, dimensions)
50
+ end
51
+ end
52
+ end
53
+
@@ -75,10 +75,6 @@ VALUE document_allocate(VALUE rb_PDFShaver_Document) {
75
75
 
76
76
  // Entry point for PDFShaver::Document's ruby initializer into C++ land
77
77
  VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
78
- // Get the PDFShaver namespace and get the `Document` class inside it.
79
- VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
80
- VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
81
-
82
78
  // use Ruby's argument scanner to pull out a required
83
79
  // `path` argument and an optional `options` hash.
84
80
  VALUE path, options;
@@ -88,7 +84,8 @@ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
88
84
  // path should at this point be validated & known to exist.
89
85
  Document* document;
90
86
  Data_Get_Struct(self, Document, document);
91
- int parse_status = document->load(path);
87
+ //int parse_status =
88
+ document->load(path);
92
89
  //document_handle_parse_status(parse_status, path);
93
90
  if (!document->isValid()) { rb_raise(rb_eArgError, "failed to open file (%" PRIsVALUE")", path); }
94
91
 
@@ -99,10 +96,10 @@ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
99
96
 
100
97
  void document_handle_parse_status(int status, VALUE path) {
101
98
  //printf("\nSTATUS: %d\n", status);
102
- VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
103
- VALUE rb_eEncryptionError = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
104
- VALUE rb_eInvalidFormatError = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
105
- VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
99
+ //VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
100
+ //VALUE rb_eEncryptionError = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
101
+ //VALUE rb_eInvalidFormatError = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
102
+ //VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
106
103
 
107
104
  //switch (status) {
108
105
  // case PDFPARSE_ERROR_SUCCESS:
@@ -2,9 +2,9 @@ require "mkmf"
2
2
  require 'rbconfig'
3
3
  # List directories to search for PDFium headers and library files to link against
4
4
  def append_pdfium_directory_to paths
5
- paths.map do |dir|
5
+ paths.map do |dir|
6
6
  [
7
- File.join(dir, 'pdfium'),
7
+ File.join(dir, 'pdfium'),
8
8
  File.join(dir, 'pdfium', 'fpdfsdk', 'include'),
9
9
  File.join(dir, 'pdfium', 'third_party', 'base', 'numerics')
10
10
  ]
@@ -56,13 +56,18 @@ LIB_FILES.each do | lib |
56
56
  have_library(lib) or abort "Couldn't find library lib#{lib} in #{LIB_DIRS.join(', ')}"
57
57
  end
58
58
 
59
- $CPPFLAGS += " -fPIC -std=c++11"
59
+ $CPPFLAGS += " -fPIC -std=c++11 -Wall"
60
60
  if RUBY_PLATFORM =~ /darwin/
61
61
  have_library('objc')
62
62
  FRAMEWORKS = %w{AppKit CoreFoundation}
63
63
  $LDFLAGS << FRAMEWORKS.map { |f| " -framework #{f}" }.join
64
+ end
65
+
66
+ if ENV['DEBUG'] == '1'
67
+ $defs.push "-DDEBUG=1"
68
+ $CPPFLAGS += " -g"
64
69
  else
65
- #$CPPFLAGS += " -fPIC -std=c++11"
70
+ $CPPFLAGS += " -O2"
66
71
  end
67
72
 
68
73
  create_makefile "pdfium_ruby"
@@ -5,22 +5,49 @@
5
5
  * C++ Page definition
6
6
  *********************************************/
7
7
 
8
+ // When created make sure C++ pages are marked as unopened.
8
9
  Page::Page() { this->opened = false; }
9
10
 
10
- bool Page::load(Document* document, int page_index) {
11
+ // When destroying a C++ Page, make sure to dispose of the internals properly.
12
+ // And notify the parent document that this page is no longer going to be used.
13
+ Page::~Page() {
14
+ if (this->opened) {
15
+ this->unload();
16
+ this->document->notifyPageClosed(this);
17
+ }
18
+ }
19
+
20
+ // When the page is initialized through the Ruby lifecycle, store a reference
21
+ // to its parent Document, the page number and notify the Document that this page
22
+ // is available to be loaded.
23
+ void Page::initialize(Document* document, int page_index) {
11
24
  this->document = document;
12
25
  this->page_index = page_index;
13
-
14
- this->fpdf_page = FPDF_LoadPage(document->fpdf_document, page_index);
15
- document->notifyPageOpened(this);
16
- this->opened = true;
26
+ this->document->notifyPageOpened(this);
27
+ }
28
+
29
+ // Load the page through PDFium and flag the page as currently open.
30
+ bool Page::load() {
31
+ if (!this->opened) {
32
+ this->fpdf_page = FPDF_LoadPage(this->document->fpdf_document, this->page_index);
33
+ this->opened = true;
34
+ }
17
35
  return this->opened;
18
36
  }
19
37
 
38
+ // Unload the page (freeing the page's memory) and mark it as not open.
39
+ void Page::unload() {
40
+ if (this->opened){ FPDF_ClosePage(this->fpdf_page); }
41
+ this->opened = false;
42
+ }
43
+
44
+ // readers for the page's dimensions.
20
45
  double Page::width(){ return FPDF_GetPageWidth(this->fpdf_page); }
21
46
  double Page::height(){ return FPDF_GetPageHeight(this->fpdf_page); }
22
47
  double Page::aspect() { return width() / height(); }
23
48
 
49
+ // Render the page to a destination path with the dimensions
50
+ // specified by width & height (or appropriate defaults).
24
51
  bool Page::render(char* path, int width, int height) {
25
52
  // If no height or width is supplied, render at natural dimensions.
26
53
  if (!width && !height) {
@@ -31,21 +58,16 @@ bool Page::render(char* path, int width, int height) {
31
58
  // infer the other by preserving page aspect ratio.
32
59
  if ( width && !height) { height = width / this->aspect(); }
33
60
  if (!width && height) { width = height * this->aspect(); }
34
- //printf("Derp? %d, %d\n", width, height);
35
61
 
36
62
  // Create bitmap. width, height, alpha 1=enabled,0=disabled
37
63
  bool alpha = false;
38
- printf("just about to allocate bitmap w:%d, h:%d\n", width, height);
39
64
  FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, alpha);
40
- printf("BITMAP CREATED\n");
41
- if (!bitmap) { printf("ALLOCATING BITMAP FAILED"); return false; }
65
+ if (!bitmap) { return false; }
42
66
 
43
- printf("FILLING BITMAP");
44
- // fill all pixels with white for the background color
67
+ // and fill all pixels with white for the background color
45
68
  FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF);
46
- printf("BITMAP FILLED");
47
- // Render a page to a bitmap in RGBA format
48
- // args are: *buffer, page, start_x, start_y, size_x, size_y, rotation, and flags
69
+
70
+ // Render a page into the bitmap in RGBA format
49
71
  // flags are:
50
72
  // 0 for normal display, or combination of flags defined below
51
73
  // 0x01 Set if annotations are to be rendered
@@ -56,7 +78,8 @@ bool Page::render(char* path, int width, int height) {
56
78
  int rotation = 0;
57
79
  int flags = FPDF_PRINTING; // A flag defined in PDFium's codebase.
58
80
  FPDF_RenderPageBitmap(bitmap, this->fpdf_page, start_x, start_y, width, height, rotation, flags);
59
- printf("RENDERED BITMAP");
81
+
82
+ // Calculate the page's stride.
60
83
  // The stride holds the width of one row in bytes. It may not be an exact
61
84
  // multiple of the pixel width because the data may be packed to always end on a byte boundary
62
85
  int stride = FPDFBitmap_GetStride(bitmap);
@@ -69,34 +92,30 @@ bool Page::render(char* path, int width, int height) {
69
92
  ((stride * height) > (INT_MAX / 3))
70
93
  );
71
94
  if (bitmapIsntValid){
72
- printf("BITMAP ISN'T VALID");
73
95
  FPDFBitmap_Destroy(bitmap);
74
96
  return false;
75
97
  }
76
98
 
77
- // Read the FPDF bitmap into a FreeImage bitmap.
99
+ // Hand off the PDFium bitmap data to FreeImage for additional processing.
78
100
  unsigned bpp = 32;
79
101
  unsigned red_mask = 0xFF0000;
80
102
  unsigned green_mask = 0x00FF00;
81
103
  unsigned blue_mask = 0x0000FF;
82
104
  bool topdown = true;
83
105
  FIBITMAP *raw = FreeImage_ConvertFromRawBits(
84
- (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp, red_mask, green_mask, blue_mask, topdown);
106
+ (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp,
107
+ red_mask, green_mask, blue_mask, topdown);
85
108
 
86
- printf("ALLOCATED MAP");
87
- // at this point we're done with the FPDF bitmap and can destroy it.
109
+ // With bitmap handoff complete the FPDF bitmap can be destroyed.
88
110
  FPDFBitmap_Destroy(bitmap);
89
- printf("FREE BITMAP");
111
+
90
112
  // Conversion to jpg or gif require that the bpp be set to 24
91
113
  // since we're not exporting using alpha transparency above in FPDFBitmap_Create
92
114
  FIBITMAP *image = FreeImage_ConvertTo24Bits(raw);
93
- printf("CONVERT TO 24BITS2");
94
115
  FreeImage_Unload(raw);
95
- printf("DEALLOCATE RAW");
96
116
 
97
117
  // figure out the desired format from the file extension
98
118
  FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(path);
99
- printf("DEDUCE FORMAT");
100
119
 
101
120
  bool success = false;
102
121
  if ( FIF_GIF == format ){
@@ -108,19 +127,11 @@ bool Page::render(char* path, int width, int height) {
108
127
  // All other formats should be just a save call
109
128
  success = FreeImage_Save(format, image, path, 0);
110
129
  }
111
- printf("SAVED IMAGE");
112
130
 
113
131
  // unload the image
114
132
  FreeImage_Unload(image);
115
- printf("UNLOADED IMAGE");
116
- return success;
117
- }
118
133
 
119
- Page::~Page() {
120
- if (this->opened) {
121
- FPDF_ClosePage(this->fpdf_page);
122
- this->document->notifyPageClosed(this);
123
- }
134
+ return success;
124
135
  }
125
136
 
126
137
  /********************************************
@@ -132,17 +143,64 @@ void Define_Page() {
132
143
  VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
133
144
  VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
134
145
 
146
+ // Define the C allocator function so that when a new PDFShaver::Page instance
147
+ // is created, our C/C++ data structures are initialized into the Ruby lifecycle.
135
148
  rb_define_alloc_func(rb_PDFShaver_Page, *page_allocate);
136
149
 
137
- rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
150
+ // Wire the C functions we need/want into Ruby land.
151
+ // We're using the CPP_RUBY_METHOD_FUNC to wrap functions for C++'s comfort.
138
152
  rb_define_private_method(rb_PDFShaver_Page, "initialize_page_internals",
139
153
  CPP_RUBY_METHOD_FUNC(initialize_page_internals),-1);
154
+ rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
155
+ rb_define_private_method(rb_PDFShaver_Page, "load_data", CPP_RUBY_METHOD_FUNC(page_load_data), 0);
156
+ rb_define_private_method(rb_PDFShaver_Page, "unload_data", CPP_RUBY_METHOD_FUNC(page_unload_data), 0);
140
157
  }
141
158
 
159
+ // Create a new C++ Page object and store it in any newly created
160
+ // Ruby page instances.
142
161
  VALUE page_allocate(VALUE rb_PDFShaver_Page) {
143
162
  Page* page = new Page();
144
163
  return Data_Wrap_Struct(rb_PDFShaver_Page, NULL, destroy_page, page);
145
164
  }
165
+ // And delete the C++ page when we're done with the Ruby page.
166
+ static void destroy_page(Page* page) { delete page; }
167
+
168
+ // This function does the actual initialization of the C++ page's internals
169
+ // defining which page of the document will be opened when `load_data` is called.
170
+ VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
171
+ // use Ruby's argument scanner to pull out the required document and page index arguments and an optional options hash.
172
+ VALUE rb_document, page_index, options;
173
+ int number_of_args = rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
174
+
175
+ // fetch the C++ document from the Ruby document the page has been initialized with
176
+ Document* document;
177
+ Data_Get_Struct(rb_document, Document, document);
178
+ // And fetch the C++ page
179
+ Page* page;
180
+ Data_Get_Struct(self, Page, page);
181
+ // and associate them by initializing the C++ page.
182
+ page->initialize(document, FIX2INT(page_index));
183
+ return self;
184
+ }
185
+
186
+ VALUE page_load_data(VALUE self) {
187
+ Page* page;
188
+ Data_Get_Struct(self, Page, page);
189
+ if (! page->load() ) { rb_raise(rb_eRuntimeError, "Unable to load page data"); }
190
+ rb_ivar_set(self, rb_intern("@extension_data_is_loaded"), Qtrue);
191
+ rb_ivar_set(self, rb_intern("@width"), INT2FIX(page->width()));
192
+ rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
193
+ rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
194
+ return Qtrue;
195
+ }
196
+
197
+ VALUE page_unload_data(VALUE self) {
198
+ Page* page;
199
+ Data_Get_Struct(self, Page, page);
200
+ page->unload();
201
+ rb_ivar_set(self, rb_intern("@extension_data_is_loaded"), Qfalse);
202
+ return Qtrue;
203
+ }
146
204
 
147
205
  //bool page_render(int arg_count, VALUE* args, VALUE self) {
148
206
  VALUE page_render(int arg_count, VALUE* args, VALUE self) {
@@ -169,31 +227,8 @@ VALUE page_render(int arg_count, VALUE* args, VALUE self) {
169
227
 
170
228
  Page* page;
171
229
  Data_Get_Struct(self, Page, page);
172
- return (page->render(StringValuePtr(path), width, height) ? Qtrue : Qfalse);
173
- }
174
-
175
- VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
176
- // use Ruby's argument scanner to pull out a required
177
- VALUE rb_document, page_index, options;
178
- int number_of_args = rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
179
-
180
- // Get the PDFShaver namespace and get the `Page` class inside it.
181
- VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
182
- VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
183
-
184
- Document* document;
185
- Data_Get_Struct(rb_document, Document, document);
186
-
187
- Page* page;
188
- Data_Get_Struct(self, Page, page);
189
-
190
- page->load(document, FIX2INT(page_index));
191
-
192
- rb_ivar_set(self, rb_intern("@width"), INT2FIX(page->width()));
193
- rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
194
- rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
195
-
196
- return self;
197
- }
198
-
199
- static void destroy_page(Page* page) { delete page; }
230
+ page_load_data(self);
231
+ VALUE output = (page->render(StringValuePtr(path), width, height) ? Qtrue : Qfalse);
232
+ page_unload_data(self);
233
+ return output;
234
+ }
@@ -10,7 +10,9 @@ class Page {
10
10
  public:
11
11
  Page();
12
12
 
13
- bool load(Document* document, int page_number);
13
+ void initialize(Document* document, int page_number);
14
+ bool load();
15
+ void unload();
14
16
 
15
17
  double width();
16
18
  double height();
@@ -31,6 +33,8 @@ void Define_Page();
31
33
  VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
32
34
  VALUE page_render(int arg_count, VALUE* args, VALUE self);
33
35
  VALUE page_allocate(VALUE rb_PDFShaver_Page);
36
+ VALUE page_load_data(VALUE rb_PDFShaver_Page);
37
+ VALUE page_unload_data(VALUE rb_PDFShaver_Page);
34
38
  static void destroy_page(Page* page);
35
39
 
36
40
  #endif
@@ -1,7 +1,7 @@
1
1
  module PDFShaver
2
2
  class Page
3
3
  GM_MATCHER = /^\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*$/
4
- attr_reader :document, :width, :height, :aspect, :number, :index
4
+ attr_reader :document, :number, :index
5
5
 
6
6
  def initialize document, number, options={}
7
7
  raise ArgumentError unless document.kind_of? PDFShaver::Document
@@ -11,6 +11,7 @@ module PDFShaver
11
11
  @number = number
12
12
  @index = number - 1
13
13
  @document = document
14
+ @extension_data_is_loaded = false
14
15
  initialize_page_internals document, @index
15
16
  end
16
17
 
@@ -24,6 +25,40 @@ module PDFShaver
24
25
  self.index <=> other.index
25
26
  end
26
27
 
28
+ def height
29
+ load_dimensions unless @height
30
+ @height
31
+ end
32
+
33
+ def width
34
+ load_dimensions unless @width
35
+ @width
36
+ end
37
+
38
+ def aspect
39
+ load_dimensions unless @aspect
40
+ @aspect
41
+ end
42
+
43
+ def with_data_loaded &block
44
+ load_data
45
+ output = yield self
46
+ unload_data
47
+ output
48
+ end
49
+
50
+ private
51
+ def load_dimensions
52
+ with_data_loaded do
53
+ # don't have to do anything, because loading/unloading page data
54
+ # will populate our dimensions.
55
+ end
56
+ end
57
+
58
+ public
59
+ # This code was written with the GraphicsMagick geometry argument parser
60
+ # as a direct reference. Its intent is to provide a compatibility layer
61
+ # for specifying page geometry that functions identically to graphicsmagick's.
27
62
  def extract_dimensions_from_gm_geometry_string(arg)
28
63
  dimensions = {}
29
64
  return dimensions if arg.nil? or arg.empty?
@@ -44,6 +79,8 @@ module PDFShaver
44
79
  current_area = self.width * self.height
45
80
  target_area = (requested_width || 1) * (requested_height || 1)
46
81
 
82
+ # if upper or lower bounds are supplied
83
+ # check whether the target_area size adheres to that constraint.
47
84
  resize = if modifier.include? '>'
48
85
  current_area > target_area
49
86
  elsif modifier.include? '<'
@@ -52,6 +89,7 @@ module PDFShaver
52
89
  true
53
90
  end
54
91
 
92
+ # Calculate page dimensions based on area
55
93
  if resize
56
94
  scale = 1.0 / Math.sqrt(current_area/target_area)
57
95
  dimensions[:width] = (self.width*scale+0.25).floor
@@ -69,8 +107,8 @@ module PDFShaver
69
107
  width = (self.width.to_f/self.height*height+0.5).floor
70
108
  end
71
109
 
72
- # If proportional mode is requested
73
- #
110
+ # For proportional mode, scales are specified by percent.
111
+ # Sizes are recalculated and stored as the target width in place for further processing
74
112
  if modifier.include? '%'
75
113
  x_scale = width
76
114
  y_scale = height
@@ -4,26 +4,24 @@ require 'pdfshaver/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'pdfshaver'
7
- s.version = PDFShaver::VERSION + ".alpha1"
7
+ s.version = PDFShaver::VERSION
8
8
  s.licenses = ['MIT']
9
9
  s.summary = "Shave pages off of PDFs as images"
10
+ s.description = <<-DESCRIPTION
11
+ Shave pages off of PDFs as images. PDFShaver makes iterating PDF pages easy
12
+ by wrapping Google Chrome's PDFium library in an enumerable interface.
13
+ DESCRIPTION
14
+ s.homepage = 'https://www.documentcloud.org/opensource'
10
15
  s.authors = ["Ted Han", "Nathan Stitt"]
11
16
  s.email = 'opensource@documentcloud.org'
12
17
  s.extensions = 'ext/pdfium_ruby/extconf.rb'
13
- s.files = Dir.glob %w[
14
- lib/pdfshaver.rb
15
- lib/*/**/*
16
- ext/**/*
17
- test/**/*
18
- Gemfile
19
- pdfshaver.gemspec
20
- Rakefile
21
- Readme.md
22
- ]
18
+ s.files = `git ls-files -z`.split("\x0")
19
+ s.test_files = s.files.grep(%r{^(test|spec|features)/})
20
+ s.require_paths = ["lib"]
23
21
 
24
- s.add_development_dependency "bundler", "~> 1.5"
25
- s.add_development_dependency 'rake'
26
- s.add_development_dependency 'rake-compiler'
27
- s.add_development_dependency 'minitest'
28
- s.add_development_dependency 'fastimage'
22
+ s.add_development_dependency "bundler", "~> 1.5"
23
+ s.add_development_dependency 'rake', "~>10.4"
24
+ s.add_development_dependency 'rake-compiler', "~>0.9"
25
+ s.add_development_dependency 'minitest', "~>5.5"
26
+ s.add_development_dependency 'fastimage', "~>1.6"
29
27
  end
@@ -0,0 +1,23 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
2
+
3
+ describe GC do
4
+ it "won't segfault if when a document is GCed" do
5
+ doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
6
+ doc = nil
7
+ GC.start
8
+ end
9
+
10
+ it "won't segfault if when an invalid document is GCed" do
11
+ Proc.new{ PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
12
+ GC.start
13
+ end
14
+
15
+ it "won't segfault if document falls out of scope before pages" do
16
+ doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
17
+ p1 = PDFShaver::Page.new(doc, 1)
18
+ doc = nil
19
+ GC.start
20
+ p1 = nil
21
+ GC.start
22
+ end
23
+ end
@@ -81,8 +81,8 @@ describe "Resize arguments" do
81
81
  "200x200@" => Size.new(176, 227),
82
82
  "1000>" => base,
83
83
  #"1000<" => Size.new(773, 1000),
84
- "500>" => Size.new(386, 500),
85
- "500x>" => Size.new(500, 647)
84
+ "500>" => Size.new(390, 500),
85
+ "500x>" => Size.new(500, 640)
86
86
  }.each do |input, expected|
87
87
  #puts "#{input} : #{expected.inspect}"
88
88
  output = @page.extract_dimensions_from_gm_geometry_string(input)
@@ -109,25 +109,49 @@ describe PDFShaver::Page do
109
109
  end
110
110
  end
111
111
 
112
- describe "GC" do
113
- it "won't segfault if when a document is GCed" do
114
- doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
115
- doc = nil
116
- GC.start
112
+ describe "lazy loading" do
113
+ before do
114
+ @page = PDFShaver::Page.new(@document, 1)
115
+ @output_path = File.join OUTPUT, 'image_render_test.gif'
116
+ end
117
+
118
+ it "should be safe to reuse pages" do
119
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
120
+ @page.render(@output_path)
121
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
122
+ @page.render(@output_path)
123
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
117
124
  end
118
125
 
119
- it "won't segfault if when an invalid document is GCed" do
120
- Proc.new{ PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
121
- GC.start
126
+ it "should not load data until requested" do
127
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
128
+ @page.instance_variable_get("@height").must_equal nil
129
+ @page.instance_variable_get("@width").must_equal nil
130
+ @page.instance_variable_get("@aspect").must_equal nil
131
+
132
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
133
+ @page.send(:load_dimensions)
134
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
135
+ @page.height.wont_equal nil
136
+ @page.width.wont_equal nil
137
+ @page.aspect.wont_equal nil
138
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
122
139
  end
123
140
 
124
- it "won't segfault if document falls out of scope before pages" do
125
- doc = PDFShaver::Document.new(File.join(FIXTURES,'uncharter.pdf'))
126
- p1 = PDFShaver::Page.new(doc, 1)
127
- doc = nil
128
- GC.start
129
- p1 = nil
130
- GC.start
141
+ it "should provide a scope where data is kept loaded" do
142
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
143
+ @page.with_data_loaded do
144
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal true
145
+ end
146
+ @page.instance_variable_get("@extension_data_is_loaded").must_equal false
147
+ end
148
+
149
+ it "shouldn't blow up if nested twice" do
150
+ @page.with_data_loaded do |p|
151
+ p.with_data_loaded do |lol|
152
+ lol
153
+ end
154
+ end
131
155
  end
132
156
  end
133
157
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfshaver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1.alpha1
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ted Han
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-02-18 00:00:00.000000000 Z
12
+ date: 2015-02-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -29,68 +29,74 @@ dependencies:
29
29
  name: rake
30
30
  requirement: !ruby/object:Gem::Requirement
31
31
  requirements:
32
- - - ">="
32
+ - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '0'
34
+ version: '10.4'
35
35
  type: :development
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
- - - ">="
39
+ - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '0'
41
+ version: '10.4'
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: rake-compiler
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
- - - ">="
46
+ - - "~>"
47
47
  - !ruby/object:Gem::Version
48
- version: '0'
48
+ version: '0.9'
49
49
  type: :development
50
50
  prerelease: false
51
51
  version_requirements: !ruby/object:Gem::Requirement
52
52
  requirements:
53
- - - ">="
53
+ - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '0'
55
+ version: '0.9'
56
56
  - !ruby/object:Gem::Dependency
57
57
  name: minitest
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
- - - ">="
60
+ - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '0'
62
+ version: '5.5'
63
63
  type: :development
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - ">="
67
+ - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '0'
69
+ version: '5.5'
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: fastimage
72
72
  requirement: !ruby/object:Gem::Requirement
73
73
  requirements:
74
- - - ">="
74
+ - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0'
76
+ version: '1.6'
77
77
  type: :development
78
78
  prerelease: false
79
79
  version_requirements: !ruby/object:Gem::Requirement
80
80
  requirements:
81
- - - ">="
81
+ - - "~>"
82
82
  - !ruby/object:Gem::Version
83
- version: '0'
84
- description:
83
+ version: '1.6'
84
+ description: " Shave pages off of PDFs as images. PDFShaver makes iterating PDF
85
+ pages easy \n by wrapping Google Chrome's PDFium library in an enumerable interface.\n"
85
86
  email: opensource@documentcloud.org
86
87
  executables: []
87
88
  extensions:
88
89
  - ext/pdfium_ruby/extconf.rb
89
90
  extra_rdoc_files: []
90
91
  files:
92
+ - ".gitignore"
91
93
  - Gemfile
94
+ - Gemfile.lock
92
95
  - Rakefile
93
96
  - Readme.md
97
+ - bench/data_loading_speed.rb
98
+ - bench/memory_stress.rb
99
+ - bench/setup.rb
94
100
  - ext/pdfium_ruby/document.cpp
95
101
  - ext/pdfium_ruby/document.h
96
102
  - ext/pdfium_ruby/extconf.rb
@@ -105,15 +111,14 @@ files:
105
111
  - lib/pdfshaver/version.rb
106
112
  - pdfshaver.gemspec
107
113
  - test/document_spec.rb
108
- - test/fixtures/completely_encrypted.pdf
109
- - test/fixtures/encrypted.pdf
110
114
  - test/fixtures/letter-to-canadians-from-jack-layton.pdf
111
115
  - test/fixtures/uncharter.pdf
116
+ - test/gc_spec.rb
112
117
  - test/gm_compatability_spec.rb
113
118
  - test/page_set_spec.rb
114
119
  - test/page_spec.rb
115
120
  - test/spec_helper.rb
116
- homepage:
121
+ homepage: https://www.documentcloud.org/opensource
117
122
  licenses:
118
123
  - MIT
119
124
  metadata: {}
@@ -128,13 +133,21 @@ required_ruby_version: !ruby/object:Gem::Requirement
128
133
  version: '0'
129
134
  required_rubygems_version: !ruby/object:Gem::Requirement
130
135
  requirements:
131
- - - ">"
136
+ - - ">="
132
137
  - !ruby/object:Gem::Version
133
- version: 1.3.1
138
+ version: '0'
134
139
  requirements: []
135
140
  rubyforge_project:
136
141
  rubygems_version: 2.4.5
137
142
  signing_key:
138
143
  specification_version: 4
139
144
  summary: Shave pages off of PDFs as images
140
- test_files: []
145
+ test_files:
146
+ - test/document_spec.rb
147
+ - test/fixtures/letter-to-canadians-from-jack-layton.pdf
148
+ - test/fixtures/uncharter.pdf
149
+ - test/gc_spec.rb
150
+ - test/gm_compatability_spec.rb
151
+ - test/page_set_spec.rb
152
+ - test/page_spec.rb
153
+ - test/spec_helper.rb
Binary file