pdfshaver 0.0.1.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ #ifndef __PAGE_H__
2
+ #define __PAGE_H__
3
+
4
+ // forward declaration since Page/Document classes are interdependent
5
+ class Document;
6
+ #include "pdfium_ruby.h"
7
+ #include "document.h"
8
+
9
+ class Page {
10
+ public:
11
+ Page();
12
+
13
+ bool load(Document* document, int page_number);
14
+
15
+ double width();
16
+ double height();
17
+ double aspect();
18
+
19
+ bool render(char* path, int width, int height);
20
+
21
+ ~Page();
22
+
23
+ private:
24
+ int page_index;
25
+ bool opened;
26
+ Document *document;
27
+ FPDF_PAGE fpdf_page;
28
+ };
29
+
30
+ void Define_Page();
31
+ VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
32
+ VALUE page_render(int arg_count, VALUE* args, VALUE self);
33
+ VALUE page_allocate(VALUE rb_PDFShaver_Page);
34
+ static void destroy_page(Page* page);
35
+
36
+ #endif
@@ -0,0 +1,17 @@
1
+ #include "pdfium_ruby.h"
2
+
3
+ #include "fpdfview.h"
4
+
5
+ extern "C"
6
+ void Init_pdfium_ruby (void) {
7
+ // Initialize PDFium
8
+ FPDF_InitLibrary();
9
+
10
+ // Define `PDFShaver` module as a namespace for all of our other objects
11
+ VALUE rb_PDFShaver = rb_define_module("PDFShaver");
12
+
13
+ // Define `Document` and `Page` classes
14
+ Define_Document();
15
+ Define_Page();
16
+ //Define_PageSet();
17
+ }
@@ -0,0 +1,20 @@
1
+ #ifndef __PDFIUM_RUBY_H__
2
+ #define __PDFIUM_RUBY_H__
3
+
4
+ extern "C" {
5
+ #include "ruby.h"
6
+ }
7
+
8
+ // Inspired by https://github.com/jasonroelofs/rice/blob/1740a6d12c99fce8c21eda3c5385738318ab9172/rice/detail/ruby.hpp#L33-L37
9
+ // Casts C functions into a type that C++ is happy calling
10
+ extern "C" typedef VALUE (*CPP_RUBY_METHOD_FUNC)(ANYARGS);
11
+
12
+ void Define_Document();
13
+ void Define_Page();
14
+ void Define_PageSet();
15
+
16
+ // helper function for printing string literals for debugging purposes
17
+ void inline ruby_puts_cstring(const char* str) { rb_funcall(rb_cObject, rb_intern("puts"), 1, rb_str_new_cstr(str)); }
18
+ void inline ruby_puts_values(VALUE str){ rb_funcall(rb_cObject, rb_intern("puts"), 1, str); }
19
+
20
+ #endif
data/lib/pdfshaver.rb ADDED
@@ -0,0 +1,13 @@
1
# Namespace for the gem, together with the error classes it exposes.
# NOTE(review): none of these errors is raised by the Ruby code in this
# file; presumably the native extension raises them — confirm.
module PDFShaver
  # Error class named for encryption-related failures.
  class EncryptionError < StandardError
  end

  # Error class named for input that is not in a valid format.
  class InvalidFormatError < StandardError
  end

  # Error class named for a missing handler.
  class MissingHandlerError < StandardError
  end
end
6
+
7
+ %w[
8
+ document
9
+ page
10
+ page_set
11
+ version
12
+ ].each { |file| require_relative File.join('pdfshaver', file) }
13
+ require_relative 'pdfium_ruby'
@@ -0,0 +1,21 @@
1
module PDFShaver
  # Ruby-side handle for a PDF file on disk.
  #
  # `length` is only read here; it is never assigned in this class.
  # NOTE(review): presumably `open_document_with_pdfium` (not defined in
  # this file) sets it to the page count — confirm in the extension.
  class Document
    attr_reader :length, :path

    # @param path [String] location of the PDF file to open
    # @param options [Hash] accepted for interface compatibility; unused here
    # @raise [ArgumentError] when nothing exists at +path+
    def initialize(path, options = {})
      # File.exists? was deprecated for years and removed in Ruby 3.2;
      # File.exist? is available on every Ruby version.
      raise ArgumentError, "Can't find a file at '#{path}' to open" unless File.exist?(path)

      @path = path
      open_document_with_pdfium(path)
    end

    # Two documents are equal when their paths resolve to the same real
    # file. Objects that do not expose a path are simply not equal
    # (previously they raised NoMethodError).
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      return false unless other.respond_to?(:path)

      File.realpath(path) == File.realpath(other.path)
    end

    # @param page_list [Symbol, Object] page selection handed to PageSet
    # @return [PageSet] a page set built for this document
    def pages(page_list = :all)
      PageSet.new(self, page_list)
    end
  end
end
@@ -0,0 +1,117 @@
1
module PDFShaver
  # A single page of a Document.
  #
  # `width`, `height` and `aspect` are only read in this file.
  # NOTE(review): presumably `initialize_page_internals` (not defined here)
  # populates them — confirm in the extension.
  class Page
    # Parses GraphicsMagick-style geometry strings such as "100x200",
    # "x300", "50%" or "200x200@". Anchored with \A/\z so that only the
    # whole string can match (^/$ would accept any single matching line
    # of a multi-line string).
    GM_MATCHER = /\A\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*\z/
    attr_reader :document, :width, :height, :aspect, :number, :index

    # @param document [PDFShaver::Document] the owning document
    # @param number [Integer] 1-based page number within +document+
    # @param options [Hash] accepted for interface compatibility; unused here
    # @raise [ArgumentError] when an argument has the wrong type or the
    #   page number is out of range
    def initialize(document, number, options = {})
      raise ArgumentError, "document must be a PDFShaver::Document" unless document.kind_of?(PDFShaver::Document)
      raise ArgumentError, "page number must be an Integer" unless number.kind_of?(Integer)
      unless number > 0 && number <= document.length
        raise ArgumentError, "page number #{number} is out of range (1..#{document.length})"
      end

      @number   = number
      @index    = number - 1
      @document = document
      initialize_page_internals document, @index
    end

    # Pages are equal when they share a document and an index.
    #
    # @raise [ArgumentError] when +other+ is not a Page
    def ==(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of?(self.class)

      (document == other.document) && (index == other.index)
    end

    # Orders pages by their index.
    #
    # @raise [ArgumentError] when +other+ is not a Page
    def <=>(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of?(self.class)

      index <=> other.index
    end

    # Turns a GraphicsMagick-style geometry string into pixel dimensions
    # for this page.
    #
    # @param arg [String] geometry string matched against GM_MATCHER
    # @return [Hash] +:width+ and +:height+ as Integers
    # @raise [ArgumentError] when +arg+ is not a valid geometry string
    def extract_dimensions_from_gm_geometry_string(arg)
      match = arg.match(GM_MATCHER)
      raise ArgumentError, "unable to extract width & height from '#{arg}'" unless match

      # grab parsed tokens
      requested_width  = match[:width].to_f unless match[:width].nil?
      requested_height = match[:height].to_f unless match[:height].nil?
      modifier         = match[:modifier] || ""
      dimensions       = {}

      ## Algorithm ported from GraphicsMagick's GetMagickGeometry function.
      #
      # the '@' option precludes all other options.
      if modifier.include?('@')
        # compare the current page area with the requested target area
        current_area = self.width * self.height
        target_area  = (requested_width || 1) * (requested_height || 1)

        resize = if modifier.include?('>')
          current_area > target_area
        elsif modifier.include?('<')
          current_area < target_area
        else
          true
        end

        if resize
          scale = 1.0 / Math.sqrt(current_area / target_area)
          dimensions[:width]  = (self.width * scale + 0.25).floor
          dimensions[:height] = (self.height * scale + 0.25).floor
        end
      else # Handle all of the non area modes.
        new_width  = requested_width
        new_height = requested_height

        # when supplied with only a width or a height
        # infer the other using the page's aspect ratio.
        if new_width && !new_height
          new_height = (new_width / self.aspect + 0.5).floor
        elsif new_height && !new_width
          new_width = (self.width.to_f / self.height * new_height + 0.5).floor
        end

        # Proportional mode: the requested numbers are percentages.
        if modifier.include?('%')
          # When only one percentage was given it applies to both axes.
          # (The percentage must come from what was *requested*, never
          # from the pixel value inferred above.)
          x_scale = requested_width || requested_height
          y_scale = requested_height || requested_width
          new_width  = ((self.width * x_scale / 100.0) + 0.5).floor
          new_height = ((self.height * y_scale / 100.0) + 0.5).floor
          # this is to match how GraphicsMagick works.
          requested_width  = new_width
          requested_height = new_height
        end

        # NOTE(review): GraphicsMagick rescales to keep the aspect ratio
        # when '!' is ABSENT; this port does so when it is present.
        # Behavior is kept as-is — confirm which is intended.
        if modifier.include?('!') && ((new_width != requested_width) || (new_height != requested_height))
          scale = if (requested_width == 0) || (requested_height == 0)
            1.0
          else
            new_width / self.width
          end

          new_width  = (scale * self.width + 0.5).floor
          new_height = (scale * self.height + 0.5).floor
        end

        # '>' never produces something larger than the page itself
        if modifier.include?('>')
          new_width  = self.width  if self.width  < new_width
          new_height = self.height if self.height < new_height
        end

        # '<' never produces something smaller than the page itself
        if modifier.include?('<')
          new_width  = self.width  if self.width  > new_width
          new_height = self.height if self.height > new_height
        end

        dimensions[:width]  = new_width.floor
        dimensions[:height] = new_height.floor
      end

      # fall back to the page's own size when no resize was computed
      dimensions[:width]  ||= self.width.floor
      dimensions[:height] ||= self.height.floor
      dimensions
    end
  end
end
@@ -0,0 +1,83 @@
1
module PDFShaver
  # An enumerable selection of pages from a document. Page objects are
  # created on demand from a stored list of page numbers.
  class PageSet
    include Enumerable

    attr_reader :document

    # @param document [#length] the document the pages belong to
    # @param page_list [Symbol, Integer, Range, Array] +:all+, a single
    #   page number, a range of page numbers, or an array mixing numbers
    #   and ranges
    # @param options [Hash] accepted for interface compatibility; unused here
    # @raise [ArgumentError] when +page_list+ names a page outside the document
    def initialize(document, page_list = :all, options = {})
      @document  = document
      @page_list = extract_page_numbers(page_list)
    end

    # Yields a Page for every selected page number.
    def each(&block)
      enumerator.each(&block)
    end

    # @param page_index [Integer] position within the selection (not a page number)
    def [](page_index)
      Page.new(@document, @page_list[page_index])
    end

    def first
      Page.new(@document, @page_list.first)
    end

    def last
      Page.new(@document, @page_list.last)
    end

    def size
      @page_list.size
    end

    private

    def enumerator
      Enumerator.new do |yielder|
        @page_list.each do |page_number|
          yielder.yield Page.new(self.document, page_number)
        end
      end
    end

    # Normalizes every accepted input into an Array of page numbers, so
    # that `first`, `last`, `[]`, `size` and `each` always agree with each
    # other (a stored exclusive Range would report its excluded end as
    # `last`).
    def extract_page_numbers(inputs)
      case inputs
      when :all
        (1..self.document.length).to_a
      when Numeric
        raise ArgumentError, "#{inputs} is not a valid page number" unless valid_page_number?(inputs)
        [inputs]
      when Range
        unless valid_page_range?(inputs)
          raise ArgumentError, "#{inputs} did not fall in a valid range of pages (#{1..self.document.length})"
        end
        inputs.to_a
      when Array
        numbers = inputs.flatten.flat_map do |input|
          if valid_page_number?(input)
            [input]
          elsif valid_page_range?(input)
            input.to_a
          elsif input.kind_of?(String)
            valid_page_string?(input)
          else
            raise ArgumentError, "#{input} is not a valid page or list of pages (as part of #{inputs})"
          end
        end
        numbers.sort
      when String
        valid_page_string?(inputs)
      else
        raise ArgumentError, "#{inputs.inspect} is not a valid list of pages"
      end
    end

    # Page numbers are whole numbers between 1 and the document length.
    def valid_page_number?(number)
      number.kind_of?(Integer) && number > 0 && number <= self.document.length
    end

    # A range is valid when it has Integer bounds, contains at least one
    # page, and both its first and last *included* pages exist.
    def valid_page_range?(range)
      return false unless range.kind_of?(Range)
      return false unless range.begin.kind_of?(Integer) && range.end.kind_of?(Integer)

      last_page = range.exclude_end? ? range.end - 1 : range.end
      range.begin <= last_page && valid_page_number?(range.begin) && valid_page_number?(last_page)
    end

    # String page specifiers are not implemented yet; always raises.
    def valid_page_string?(input)
      raise ArgumentError, "todo: support strings as page specifiers"
    end
  end
end
@@ -0,0 +1,3 @@
1
module PDFShaver
  # Version string of the library; the gemspec appends a pre-release
  # suffix to it when building the gem version.
  VERSION = '0.0.1'
end
data/pdfshaver.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'pdfshaver/version'
4
+
5
# Gem metadata: identity, native extension build script, packaged files
# and the development-only dependencies.
Gem::Specification.new do |spec|
  spec.name       = 'pdfshaver'
  spec.version    = "#{PDFShaver::VERSION}.alpha"
  spec.licenses   = ['MIT']
  spec.summary    = "Shave pages off of PDFs as images"
  spec.authors    = ["Ted Han", "Nathan Stitt"]
  spec.email      = 'opensource@documentcloud.org'
  spec.extensions = 'ext/pdfium_ruby/extconf.rb'

  # Glob patterns (relative to the gem root) for everything that ships.
  packaged_patterns = %w[
    lib/pdfshaver.rb
    lib/*/**/*
    ext/**/*
    test/**/*
    Gemfile
    pdfshaver.gemspec
    Rakefile
    Readme.md
  ]
  spec.files = Dir.glob(packaged_patterns)

  spec.add_development_dependency "bundler", "~> 1.5"
  %w[rake rake-compiler minitest fastimage].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,36 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
2
+
3
# Specs for PDFShaver::Document: construction, failure modes, length and
# equality.
describe PDFShaver::Document do

  it "should be instantiated" do
    document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    document.must_be_instance_of PDFShaver::Document
  end

  it "should throw an error if path can't be found" do
    opening_missing_file = proc { PDFShaver::Document.new("suede shoes") }
    opening_missing_file.must_raise ArgumentError
  end

  it "should throw an error if a document can't be opened" do
    opening_encrypted_file = proc do
      PDFShaver::Document.new(File.join(FIXTURES, 'completely_encrypted.pdf'))
    end
    opening_encrypted_file.must_raise ArgumentError
  end

  describe "instance methods" do
    before do
      @path     = File.join(FIXTURES, 'uncharter.pdf')
      @document = PDFShaver::Document.new(@path)
    end

    it "should have a length" do
      @document.length.must_equal 55
    end

    # equality is based on the file a document points at
    it { @document.must_equal @document }
    it { @document.must_equal PDFShaver::Document.new(@path) }
    it do
      other_path = File.join(FIXTURES, 'letter-to-canadians-from-jack-layton.pdf')
      @document.wont_equal PDFShaver::Document.new(other_path)
    end
  end

end
Binary file
Binary file
@@ -0,0 +1,92 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
2
+
3
# Specs for the GraphicsMagick-style geometry strings understood by
# PDFShaver::Page: first the tokens the regexp extracts, then the
# dimensions computed from them.
describe "Resize arguments" do
  before do
    @document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    @page     = PDFShaver::Page.new(@document, 1)
  end

  describe "Syntax" do
    # Small value object holding the tokens expected from a match,
    # readable with hash-style access.
    class TokenSet
      # The attribute names, in constructor-argument order.
      KEYS = [:width, :height, :modifier]
      attr_reader *KEYS

      # Assigns the positional arguments to KEYS, in order.
      def initialize(*attributes)
        KEYS.zip(attributes).each { |key, value| self[key] = value }
      end

      # helper methods so we can pretend an instance is a hash
      def [](key)
        self.send(key)
      end

      def []=(key, val)
        self.instance_variable_set("@#{key}", val)
      end
    end

    it "should match valid graphicsmagick strings" do
      expectations = {
        "100"      => TokenSet.new(nil,   "100", nil),
        "101x102"  => TokenSet.new("101", "102", nil),
        "103x"     => TokenSet.new("103", nil,   nil),
        "x104"     => TokenSet.new(nil,   "104", nil),
        "105%"     => TokenSet.new(nil,   "105", "%"),
        "106@"     => TokenSet.new(nil,   "106", "@"),
        "107<"     => TokenSet.new(nil,   "107", "<"),
        "108>"     => TokenSet.new(nil,   "108", ">"),
        "109x110%" => TokenSet.new("109", "110", "%"),
        "x111%"    => TokenSet.new(nil,   "111", "%"),
        "112x%"    => TokenSet.new("112", nil,   "%"),
      }

      expectations.each do |geometry, tokens|
        geometry.must_match(PDFShaver::Page::GM_MATCHER)
        match = geometry.match(PDFShaver::Page::GM_MATCHER)
        TokenSet::KEYS.each { |key| match[key].must_equal tokens[key] }
      end
    end
  end

  describe "Semantic" do
    # Expected output size, with its aspect ratio precomputed.
    class Size
      attr_reader :width, :height, :aspect

      def initialize(width, height)
        @width  = width
        @height = height
        @aspect = @width.to_f / height
      end

      # Returns a new Size with both sides multiplied by +factor+
      # and truncated to integers.
      def scale(factor)
        scaled_width  = (@width * factor).to_i
        scaled_height = (@height * factor).to_i
        self.class.new(scaled_width, scaled_height)
      end
    end

    it "should specify width and height" do
      w    = @page.width.to_i
      h    = @page.height.to_i
      base = Size.new(w, h)

      cases = {
        "#{w}x#{h}"        => base,
        "#{(w*0.5).to_i}x" => base.scale(0.5),
        "x#{h*2}"          => base.scale(2),
        "100x100!"         => Size.new(100, 100),
        "100x100%"         => base,
        "200x200%"         => base.scale(2),
        "200x200@"         => Size.new(176, 227),
        "1000>"            => base,
        #"1000<"            => Size.new(773, 1000),
        "500>"             => Size.new(386, 500),
        "500x>"            => Size.new(500, 647)
      }

      cases.each do |geometry, expected|
        output     = @page.extract_dimensions_from_gm_geometry_string(geometry)
        dimensions = Size.new(output[:width], output[:height])

        # allow for rounding differences of a pixel
        dimensions.aspect.must_be_within_delta expected.aspect, 0.005
        dimensions.width.must_be_within_delta  expected.width, 1
        dimensions.height.must_be_within_delta expected.height, 1
      end
    end
  end
end