pdfshaver 0.0.1.alpha

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,36 @@
1
+ #ifndef __PAGE_H__
2
+ #define __PAGE_H__
3
+
4
+ // forward declaration since Page/Document classes are interdependent
5
+ class Document;
6
+ #include "pdfium_ruby.h"
7
+ #include "document.h"
8
+
9
+ class Page {
10
+ public:
11
+ Page();
12
+
13
+ bool load(Document* document, int page_number);
14
+
15
+ double width();
16
+ double height();
17
+ double aspect();
18
+
19
+ bool render(char* path, int width, int height);
20
+
21
+ ~Page();
22
+
23
+ private:
24
+ int page_index;
25
+ bool opened;
26
+ Document *document;
27
+ FPDF_PAGE fpdf_page;
28
+ };
29
+
30
+ void Define_Page();
31
+ VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
32
+ VALUE page_render(int arg_count, VALUE* args, VALUE self);
33
+ VALUE page_allocate(VALUE rb_PDFShaver_Page);
34
+ static void destroy_page(Page* page);
35
+
36
+ #endif
@@ -0,0 +1,17 @@
1
+ #include "pdfium_ruby.h"
2
+
3
+ #include "fpdfview.h"
4
+
5
+ extern "C"
6
+ void Init_pdfium_ruby (void) {
7
+ // Initialize PDFium
8
+ FPDF_InitLibrary();
9
+
10
+ // Define `PDFShaver` module as a namespace for all of our other objects
11
+ VALUE rb_PDFShaver = rb_define_module("PDFShaver");
12
+
13
+ // Define `Document` and `Page` classes
14
+ Define_Document();
15
+ Define_Page();
16
+ //Define_PageSet();
17
+ }
@@ -0,0 +1,20 @@
1
+ #ifndef __PDFIUM_RUBY_H__
2
+ #define __PDFIUM_RUBY_H__
3
+
4
+ extern "C" {
5
+ #include "ruby.h"
6
+ }
7
+
8
+ // Inspired by https://github.com/jasonroelofs/rice/blob/1740a6d12c99fce8c21eda3c5385738318ab9172/rice/detail/ruby.hpp#L33-L37
9
+ // Casts C functions into a type that C++ is happy calling
10
+ extern "C" typedef VALUE (*CPP_RUBY_METHOD_FUNC)(ANYARGS);
11
+
12
+ void Define_Document();
13
+ void Define_Page();
14
+ void Define_PageSet();
15
+
16
+ // helper function for printing string literals for debugging purposes
17
+ void inline ruby_puts_cstring(const char* str) { rb_funcall(rb_cObject, rb_intern("puts"), 1, rb_str_new_cstr(str)); }
18
+ void inline ruby_puts_values(VALUE str){ rb_funcall(rb_cObject, rb_intern("puts"), 1, str); }
19
+
20
+ #endif
data/lib/pdfshaver.rb ADDED
@@ -0,0 +1,13 @@
1
# Top-level namespace for the gem; declares the error classes the library
# can raise.
module PDFShaver
  class EncryptionError < StandardError
  end

  class InvalidFormatError < StandardError
  end

  class MissingHandlerError < StandardError
  end
end

# Load the pure-Ruby parts first, then the native extension.
require_relative 'pdfshaver/document'
require_relative 'pdfshaver/page'
require_relative 'pdfshaver/page_set'
require_relative 'pdfshaver/version'
require_relative 'pdfium_ruby'
@@ -0,0 +1,21 @@
1
module PDFShaver
  # Ruby-side half of a PDF document. The file is opened through
  # `open_document_with_pdfium`, which is not defined in this file
  # (presumably supplied by the native extension, which would also set
  # `@length` -- confirm).
  class Document
    attr_reader :length, :path

    # @param path [String] location of the PDF on disk
    # @param options [Hash] currently unused; kept for interface compatibility
    # @raise [ArgumentError] when no file exists at +path+
    def initialize(path, options = {})
      # File.exists? was deprecated and then removed in Ruby 3.2; File.exist?
      # is available on every Ruby version.
      raise ArgumentError, "Can't find a file at '#{path}' to open" unless File.exist?(path)

      # otherwise attempt to acquire it.
      @path = path
      open_document_with_pdfium(path)
    end

    # Two documents are equal when their paths resolve to the same file.
    # Objects that cannot supply a path are simply not equal (previously this
    # raised NoMethodError, e.g. for `document == nil`).
    #
    # @return [Boolean]
    def ==(other)
      return false unless other.respond_to?(:path)

      File.realpath(path) == File.realpath(other.path)
    end

    # @param page_list [Symbol, Integer, Range, Array] pages to select
    #   (defaults to all pages)
    # @return [PageSet]
    def pages(page_list = :all)
      PageSet.new(self, page_list)
    end
  end
end
@@ -0,0 +1,117 @@
1
module PDFShaver
  # A single page of a Document. The readers `width`, `height` and `aspect`
  # are not assigned in this file; presumably `initialize_page_internals`
  # (not defined here) fills them in -- confirm against the native extension.
  class Page
    # Matches GraphicsMagick-style geometry strings: "100", "101x102", "103x",
    # "x104", each optionally followed by modifier characters (@ % ! < > ^).
    # A lone number is captured as the height. Anchored with \A / \z so that
    # only the whole string can match (^ / $ would accept one valid line
    # inside a multi-line string).
    GM_MATCHER = /\A\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*\z/
    attr_reader :document, :width, :height, :aspect, :number, :index

    # @param document [PDFShaver::Document] document that owns the page
    # @param number [Integer] 1-based page number
    # @param options [Hash] currently unused
    # @raise [ArgumentError] on a wrong argument type or out-of-range number
    def initialize(document, number, options = {})
      raise ArgumentError unless document.kind_of? PDFShaver::Document
      raise ArgumentError unless number.kind_of? Integer
      raise ArgumentError unless number > 0 && number <= document.length

      @number = number
      @index = number - 1
      @document = document
      initialize_page_internals document, @index
    end

    # Pages are equal when they share a document and an index.
    # @raise [ArgumentError] when +other+ is not a Page
    def ==(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      (self.document == other.document) && (self.index == other.index)
    end

    # Orders pages by index.
    # @raise [ArgumentError] when +other+ is not a Page
    def <=>(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      self.index <=> other.index
    end

    # Translates a GraphicsMagick geometry string into pixel dimensions for
    # this page.
    #
    # @param arg [String] geometry such as "500x", "x300", "200x200%", "640x480!"
    # @return [Hash] +{ width: Integer, height: Integer }+
    # @raise [ArgumentError] when +arg+ is not a recognised geometry string
    def extract_dimensions_from_gm_geometry_string(arg)
      match = GM_MATCHER.match(arg.to_s)
      raise ArgumentError, "unable to extract width & height from '#{arg}'" unless match

      # grab parsed tokens (left nil when the token is absent)
      requested_width  = match[:width].to_f if match[:width]
      requested_height = match[:height].to_f if match[:height]
      modifier         = match[:modifier] || ""
      dimensions       = {}

      ## Algorithm ported from GraphicsMagick's GetMagickGeometry function.
      #
      # the '@' option precludes all other options.
      if modifier.include?('@')
        # compare the current page area with the requested target area
        current_area = self.width * self.height
        target_area  = (requested_width || 1) * (requested_height || 1)

        resize = if modifier.include?('>')
          current_area > target_area
        elsif modifier.include?('<')
          current_area < target_area
        else
          true
        end

        if resize
          scale = 1.0 / Math.sqrt(current_area / target_area)
          dimensions[:width]  = (self.width * scale + 0.25).floor
          dimensions[:height] = (self.height * scale + 0.25).floor
        end
      else # Handle all of the non area modes.
        target_width  = requested_width
        target_height = requested_height

        # when supplied with only a width or a height
        # infer the other using the page's aspect ratio.
        if target_width && !target_height
          target_height = (target_width / self.aspect + 0.5).floor
        elsif target_height && !target_width
          target_width = (self.width.to_f / self.height * target_height + 0.5).floor
        end

        # Proportional mode: the numbers are percentages of the page size.
        # A single percentage applies to both axes. It must come from the
        # value that was actually given, not from the dimension inferred
        # above (otherwise "112x%" would scale by the inferred pixel height).
        if modifier.include?('%')
          x_scale = requested_width || requested_height
          y_scale = requested_height || requested_width
          target_width  = ((self.width * x_scale / 100.0) + 0.5).floor
          target_height = ((self.height * y_scale / 100.0) + 0.5).floor
          # this is to match how GraphicsMagick works.
          requested_width  = target_width
          requested_height = target_height
        end

        # NOTE(review): in GraphicsMagick '!' means "ignore the aspect ratio",
        # and the aspect-preserving rescale runs when '!' is absent, using the
        # smaller of the two ratios. This branch runs when '!' is present and
        # only uses the width ratio. Behaviour is kept as-is because changing
        # it alters results for plain "WxH" requests -- confirm intent.
        if modifier.include?('!') && ((target_width != requested_width) || (target_height != requested_height))
          scale = if (requested_width == 0) || (requested_height == 0)
            1.0
          else
            target_width.to_f / self.width
          end

          target_width  = (scale * self.width + 0.5).floor
          target_height = (scale * self.height + 0.5).floor
        end

        # '>' caps each dimension at the page's own size.
        if modifier.include?('>')
          target_width  = self.width if self.width < target_width
          target_height = self.height if self.height < target_height
        end

        # '<' raises each dimension to at least the page's own size.
        if modifier.include?('<')
          target_width  = self.width if self.width > target_width
          target_height = self.height if self.height > target_height
        end

        dimensions[:width]  = target_width.floor
        dimensions[:height] = target_height.floor
      end

      # Area mode without a resize falls back to the page's own dimensions.
      dimensions[:width]  ||= self.width.floor
      dimensions[:height] ||= self.height.floor
      dimensions
    end
  end
end
@@ -0,0 +1,83 @@
1
module PDFShaver
  # An enumerable selection of pages from one document. Page objects are
  # created on demand from the stored page numbers.
  class PageSet
    include Enumerable

    attr_reader :document

    # @param document [#length] document the pages belong to
    # @param page_list [Symbol, Integer, Range, Array] +:all+, one page number,
    #   a range of page numbers, or an array mixing numbers and ranges
    # @param options [Hash] currently unused
    # @raise [ArgumentError] when +page_list+ names a page outside the document
    def initialize(document, page_list = :all, options = {})
      @document = document
      @page_list = extract_page_numbers(page_list)
    end

    # Yields a Page for every selected page number.
    def each(&block)
      enumerator.each(&block)
    end

    # @param page_index [Integer] position within the selection (not a page number)
    def [](page_index)
      Page.new(@document, @page_list.to_a[page_index])
    end

    def first
      Page.new(@document, @page_list.first)
    end

    def last
      Page.new(@document, @page_list.last)
    end

    def size
      @page_list.size
    end

    private

    def enumerator
      Enumerator.new do |yielder|
        @page_list.each do |page_number|
          yielder.yield Page.new(self.document, page_number)
        end
      end
    end

    # Normalises the caller's page specification into a Range or a sorted
    # Array of page numbers.
    def extract_page_numbers(inputs)
      case inputs
      when :all
        Range.new(1, self.document.length)
      when Numeric
        raise ArgumentError, "#{inputs} is not a valid page number" unless valid_page_number?(inputs)
        [inputs]
      when Range
        unless valid_page_range?(inputs)
          raise ArgumentError, "#{inputs} did not fall in a valid range of pages (#{1..self.document.length})"
        end
        inputs
      when Array
        numbers = []
        inputs.flatten.each do |input|
          if valid_page_number?(input)
            numbers.push input
          elsif valid_page_range?(input)
            numbers.concat(input.to_a)
          elsif input.kind_of?(String)
            # Only strings go through the (unimplemented) string parser. Before,
            # every invalid entry ended up here and reported the misleading
            # "todo: support strings" error instead of the message below.
            valid_page_string?(input)
          else
            raise ArgumentError, "#{input} is not a valid page or list of pages (as part of #{inputs})"
          end
        end
        numbers.sort
      when String
        valid_page_string?(inputs)
      else
        raise ArgumentError, "#{inputs.inspect} is not a valid list of pages"
      end
    end

    # Page numbers are 1-based integers; Page#initialize rejects any other
    # numeric type, so they are rejected here too, with a descriptive error.
    def valid_page_number?(number)
      number.kind_of?(Integer) && number > 0 && number <= self.document.length
    end

    def valid_page_range?(range)
      range.kind_of?(Range) && range.first <= range.last &&
        valid_page_number?(range.first) && valid_page_number?(range.last)
    end

    # String page specifiers are not implemented yet; always raises.
    def valid_page_string?(input)
      raise ArgumentError, "todo: support strings as page specifiers"
    end
  end
end
@@ -0,0 +1,3 @@
1
module PDFShaver
  # Base version of the gem. Frozen so the shared constant cannot be mutated
  # in place by accident; the gemspec builds its own string from it with `+`.
  VERSION = '0.0.1'.freeze
end
data/pdfshaver.gemspec ADDED
@@ -0,0 +1,29 @@
1
# Make lib/ loadable so the version file can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'pdfshaver/version'

Gem::Specification.new do |spec|
  spec.name       = 'pdfshaver'
  spec.version    = PDFShaver::VERSION + ".alpha"
  spec.licenses   = ['MIT']
  spec.summary    = "Shave pages off of PDFs as images"
  spec.authors    = ["Ted Han", "Nathan Stitt"]
  spec.email      = 'opensource@documentcloud.org'
  spec.extensions = 'ext/pdfium_ruby/extconf.rb'

  # Everything that ships with the gem.
  packaged_patterns = %w[
    lib/pdfshaver.rb
    lib/*/**/*
    ext/**/*
    test/**/*
    Gemfile
    pdfshaver.gemspec
    Rakefile
    Readme.md
  ]
  spec.files = Dir.glob(packaged_patterns)

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.5"
  %w[rake rake-compiler minitest fastimage].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,36 @@
1
require File.expand_path('spec_helper', File.dirname(__FILE__))

# Specs for opening documents and for the Ruby-side Document API.
describe PDFShaver::Document do

  it "should be instantiated" do
    document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    document.must_be_instance_of PDFShaver::Document
  end

  it "should throw an error if path can't be found" do
    lambda { PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
  end

  it "should throw an error if a document can't be opened" do
    open_encrypted = lambda do
      encrypted_path = File.join(FIXTURES, 'completely_encrypted.pdf')
      PDFShaver::Document.new(encrypted_path)
    end
    open_encrypted.must_raise ArgumentError
  end

  describe "instance methods" do
    before do
      @path = File.join(FIXTURES, 'uncharter.pdf')
      @document = PDFShaver::Document.new(@path)
    end

    it "should have a length" do
      @document.length.must_equal 55
    end

    # Equality is decided by the file a document points at.
    it { @document.must_equal @document }
    it { @document.must_equal PDFShaver::Document.new(@path) }
    it do
      other_path = File.join(FIXTURES, 'letter-to-canadians-from-jack-layton.pdf')
      @document.wont_equal PDFShaver::Document.new(other_path)
    end
  end

end
Binary file
Binary file
@@ -0,0 +1,92 @@
1
require File.expand_path('spec_helper', File.dirname(__FILE__))

# Specs for parsing GraphicsMagick-style geometry strings and turning them
# into page dimensions.
describe "Resize arguments" do
  before do
    @document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    @page = PDFShaver::Page.new(@document, 1)
  end

  describe "Syntax" do
    # Hash-like holder for the tokens the matcher is expected to capture.
    class TokenSet
      # Attribute names, in constructor-argument order.
      KEYS = [:width, :height, :modifier]
      attr_reader(*KEYS)

      # Positional values are assigned to KEYS in order; missing ones become nil.
      def initialize(*attributes)
        KEYS.zip(attributes).each { |key, value| self[key] = value }
      end

      # helper methods so an instance can be indexed like a hash
      def [](key)
        send(key)
      end

      def []=(key, val)
        instance_variable_set("@#{key}", val)
      end
    end

    it "should match valid graphicsmagick strings" do
      matcher = PDFShaver::Page::GM_MATCHER
      expectations = {
        "100"      => TokenSet.new(nil, "100", nil),
        "101x102"  => TokenSet.new("101", "102", nil),
        "103x"     => TokenSet.new("103", nil, nil),
        "x104"     => TokenSet.new(nil, "104", nil),
        "105%"     => TokenSet.new(nil, "105", "%"),
        "106@"     => TokenSet.new(nil, "106", "@"),
        "107<"     => TokenSet.new(nil, "107", "<"),
        "108>"     => TokenSet.new(nil, "108", ">"),
        "109x110%" => TokenSet.new("109", "110", "%"),
        "x111%"    => TokenSet.new(nil, "111", "%"),
        "112x%"    => TokenSet.new("112", nil, "%"),
      }

      expectations.each do |geometry, tokens|
        geometry.must_match(matcher)
        captured = geometry.match(matcher)
        TokenSet::KEYS.each do |key|
          captured[key].must_equal tokens[key]
        end
      end
    end
  end

  describe "Semantic" do
    # Plain width/height pair with a derived aspect ratio.
    class Size
      attr_reader :width, :height, :aspect

      def initialize(width, height)
        @width  = width
        @height = height
        @aspect = @width.to_f / height
      end

      # Returns a new Size with both dimensions multiplied by +factor+
      # and truncated to integers.
      def scale(factor)
        scaled_width  = (@width * factor).to_i
        scaled_height = (@height * factor).to_i
        self.class.new(scaled_width, scaled_height)
      end
    end

    it "should specify width and height" do
      w = @page.width.to_i
      h = @page.height.to_i
      base = Size.new(w, h)

      cases = {
        "#{w}x#{h}"         => base,
        "#{(w*0.5).to_i}x"  => base.scale(0.5),
        "x#{h*2}"           => base.scale(2),
        "100x100!"          => Size.new(100, 100),
        "100x100%"          => base,
        "200x200%"          => base.scale(2),
        "200x200@"          => Size.new(176, 227),
        "1000>"             => base,
        # disabled case: "1000<" => Size.new(773, 1000)
        "500>"              => Size.new(386, 500),
        "500x>"             => Size.new(500, 647)
      }

      cases.each do |geometry, expected|
        result = @page.extract_dimensions_from_gm_geometry_string(geometry)
        actual = Size.new(result[:width], result[:height])

        actual.aspect.must_be_within_delta expected.aspect, 0.005
        actual.width.must_be_within_delta expected.width, 1
        actual.height.must_be_within_delta expected.height, 1
      end
    end
  end
end