pdfshaver 0.0.1.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/Rakefile +10 -0
- data/Readme.md +14 -0
- data/ext/pdfium_ruby/document.cpp +131 -0
- data/ext/pdfium_ruby/document.h +53 -0
- data/ext/pdfium_ruby/extconf.rb +68 -0
- data/ext/pdfium_ruby/page.cpp +190 -0
- data/ext/pdfium_ruby/page.h +36 -0
- data/ext/pdfium_ruby/pdfium_ruby.cpp +17 -0
- data/ext/pdfium_ruby/pdfium_ruby.h +20 -0
- data/lib/pdfshaver.rb +13 -0
- data/lib/pdfshaver/document.rb +21 -0
- data/lib/pdfshaver/page.rb +117 -0
- data/lib/pdfshaver/page_set.rb +83 -0
- data/lib/pdfshaver/version.rb +3 -0
- data/pdfshaver.gemspec +29 -0
- data/test/document_spec.rb +36 -0
- data/test/fixtures/completely_encrypted.pdf +0 -0
- data/test/fixtures/encrypted.pdf +0 -0
- data/test/fixtures/letter-to-canadians-from-jack-layton.pdf +0 -0
- data/test/fixtures/uncharter.pdf +0 -0
- data/test/gm_compatability_spec.rb +92 -0
- data/test/page_set_spec.rb +62 -0
- data/test/page_spec.rb +133 -0
- data/test/spec_helper.rb +13 -0
- metadata +140 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
// Include guard. Identifiers containing a double underscore are reserved for
// the C++ implementation, so the guard uses a project-prefixed name instead.
#ifndef PDFSHAVER_PAGE_H
#define PDFSHAVER_PAGE_H

// forward declaration since Page/Document classes are interdependent
class Document;
#include "pdfium_ruby.h"
#include "document.h"

// Native counterpart of a single page of a Document.
// NOTE(review): the member semantics below are inferred from the declarations
// only; the definitions live in page.cpp and should be checked against it.
class Page {
  public:
    Page();

    // Associates this page with `document` and the given page number.
    // Returns a bool; presumably true on success -- TODO confirm in page.cpp.
    bool load(Document* document, int page_number);

    // Page geometry accessors.
    double width();
    double height();
    double aspect();

    // Renders the page to `path` at the requested pixel size.
    // Returns a bool; presumably true on success -- TODO confirm in page.cpp.
    bool render(char* path, int width, int height);

    ~Page();

  private:
    int page_index;
    bool opened;
    Document *document;
    FPDF_PAGE fpdf_page;
};

// Ruby binding entry points for the Page class.
void Define_Page();
VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
VALUE page_render(int arg_count, VALUE* args, VALUE self);
VALUE page_allocate(VALUE rb_PDFShaver_Page);
// NOTE(review): a `static` prototype in a header gives every including
// translation unit its own internal-linkage declaration; kept as-is because
// page.cpp may rely on it being declared before use.
static void destroy_page(Page* page);

#endif
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#include "pdfium_ruby.h"
|
2
|
+
|
3
|
+
#include "fpdfview.h"
|
4
|
+
|
5
|
+
extern "C"
|
6
|
+
void Init_pdfium_ruby (void) {
|
7
|
+
// Initialize PDFium
|
8
|
+
FPDF_InitLibrary();
|
9
|
+
|
10
|
+
// Define `PDFShaver` module as a namespace for all of our other objects
|
11
|
+
VALUE rb_PDFShaver = rb_define_module("PDFShaver");
|
12
|
+
|
13
|
+
// Define `Document` and `Page` classes
|
14
|
+
Define_Document();
|
15
|
+
Define_Page();
|
16
|
+
//Define_PageSet();
|
17
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
// Include guard. Identifiers containing a double underscore are reserved for
// the C++ implementation, so the guard uses a project-prefixed name instead.
#ifndef PDFSHAVER_PDFIUM_RUBY_H
#define PDFSHAVER_PDFIUM_RUBY_H

// ruby.h is a C header; include it with C linkage.
extern "C" {
  #include "ruby.h"
}

// Inspired by https://github.com/jasonroelofs/rice/blob/1740a6d12c99fce8c21eda3c5385738318ab9172/rice/detail/ruby.hpp#L33-L37
// Casts C functions into a type that C++ is happy calling
extern "C" typedef VALUE (*CPP_RUBY_METHOD_FUNC)(ANYARGS);

// Registration functions called from the extension's Init function.
void Define_Document();
void Define_Page();
void Define_PageSet();

// Helper functions for printing values for debugging purposes.
// Both invoke Ruby's `puts` through rb_funcall on rb_cObject.

// Prints a NUL-terminated C string.
inline void ruby_puts_cstring(const char* str) {
  rb_funcall(rb_cObject, rb_intern("puts"), 1, rb_str_new_cstr(str));
}

// Prints an existing Ruby VALUE.
inline void ruby_puts_values(VALUE str) {
  rb_funcall(rb_cObject, rb_intern("puts"), 1, str);
}

#endif
|
data/lib/pdfshaver.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Top-level namespace for the library, together with its error classes.
# All three errors descend directly from StandardError.
# NOTE(review): nothing in this file raises them; presumably the native
# extension does -- confirm against ext/pdfium_ruby.
module PDFShaver
  # Error class for encryption-related failures.
  class EncryptionError < StandardError
  end

  # Error class for input that is not in a valid format.
  class InvalidFormatError < StandardError
  end

  # Error class for a missing handler.
  class MissingHandlerError < StandardError
  end
end
|
6
|
+
|
7
|
+
# Load the pure-Ruby pieces of the library first, in this fixed order...
require_relative 'pdfshaver/document'
require_relative 'pdfshaver/page'
require_relative 'pdfshaver/page_set'
require_relative 'pdfshaver/version'
# ...and the compiled PDFium extension last.
require_relative 'pdfium_ruby'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module PDFShaver
  # A PDF document identified by its path on disk.
  #
  # `length` is exposed through attr_reader but is not assigned in this file.
  # NOTE(review): presumably `open_document_with_pdfium` (defined outside this
  # file) sets it -- confirm against the native extension.
  class Document
    attr_reader :length, :path

    # Opens the PDF found at `path`.
    #
    # path    - location of the file to open.
    # options - accepted for API compatibility; currently unused.
    #
    # Raises ArgumentError when nothing exists at `path`.
    def initialize path, options={}
      # File.exists? was deprecated for years and removed in Ruby 3.2;
      # File.exist? is the supported spelling on every Ruby version.
      raise ArgumentError, "Can't find a file at '#{path}' to open" unless File.exist?(path)
      # otherwise attempt to acquire it.

      @path = path
      open_document_with_pdfium(path)
    end

    # Two documents are equal when their paths resolve to the same real file.
    # Anything that is not a Document compares as not equal instead of raising.
    def == other
      other.kind_of?(Document) && File.realpath(self.path) == File.realpath(other.path)
    end

    # Returns a PageSet over this document for `page_list` (default: all pages).
    def pages(page_list=:all)
      PageSet.new(self, page_list)
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module PDFShaver
  # A single page of a Document.
  #
  # `width`, `height` and `aspect` are exposed through attr_reader but are not
  # assigned in this file.
  # NOTE(review): presumably `initialize_page_internals` (defined outside this
  # file) sets them -- confirm against the native extension.
  class Page
    # GraphicsMagick-style geometry string: "WxH", "Wx", "xH" or "H", followed
    # by optional modifier characters. Anchored with \A/\z so the whole string
    # must be a geometry (with ^/$ a valid first line followed by arbitrary
    # extra lines was accepted).
    GM_MATCHER = /\A\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*\z/
    attr_reader :document, :width, :height, :aspect, :number, :index

    # document - the PDFShaver::Document this page belongs to.
    # number   - 1-based page number; `index` is the matching 0-based value.
    # options  - accepted for API compatibility; currently unused.
    #
    # Raises ArgumentError for a non-Document, a non-Integer number, or a
    # number outside 1..document.length.
    def initialize document, number, options={}
      unless document.kind_of? PDFShaver::Document
        raise ArgumentError, "expected a PDFShaver::Document, got #{document.class}"
      end
      unless number.kind_of? Integer
        raise ArgumentError, "page number must be an Integer, got #{number.inspect}"
      end
      unless number > 0 && number <= document.length
        raise ArgumentError, "page number #{number} is outside 1..#{document.length}"
      end

      @number = number
      @index = number - 1
      @document = document
      initialize_page_internals document, @index
    end

    # Pages are equal when they share a document and an index.
    # Raises ArgumentError when `other` is not a Page.
    def == other
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      (self.document == other.document) && (self.index == other.index)
    end

    # Orders pages by index. Raises ArgumentError when `other` is not a Page.
    def <=> other
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      self.index <=> other.index
    end

    # Turns a GraphicsMagick-style geometry string into pixel dimensions for
    # this page.
    #
    # arg - geometry such as "100x200", "100x", "x200", "200", optionally with
    #       modifiers (@ area, % percentage, !, > and <). Converted with to_s,
    #       so an Integer like 200 is treated as "200".
    #
    # Returns a Hash with Integer :width and :height.
    # Raises ArgumentError when `arg` is not a recognisable geometry.
    def extract_dimensions_from_gm_geometry_string(arg)
      match = GM_MATCHER.match(arg.to_s)
      raise ArgumentError, "unable to extract width & height from '#{arg}'" if match.nil?

      # grab parsed tokens
      requested_width  = match[:width]  && match[:width].to_f
      requested_height = match[:height] && match[:height].to_f
      modifier         = match[:modifier] || ""

      ## Algorithm ported from GraphicsMagick's GetMagickGeometry function.
      #
      # the '@' option precludes all other options.
      dimensions = if modifier.include? '@'
        area_mode_dimensions(requested_width, requested_height, modifier)
      else
        edge_mode_dimensions(requested_width, requested_height, modifier)
      end

      # Anything the chosen mode left unset falls back to the page's own size.
      dimensions[:width]  ||= self.width.floor
      dimensions[:height] ||= self.height.floor
      dimensions
    end

    private

    # '@' mode: the requested numbers describe a target area. Returns an empty
    # Hash when a '>' / '<' qualifier says no resize is needed.
    def area_mode_dimensions(requested_width, requested_height, modifier)
      # calculate the current page area
      # and the specified target area for comparison
      current_area = self.width * self.height
      target_area  = (requested_width || 1) * (requested_height || 1)

      resize = if modifier.include? '>'
        current_area > target_area
      elsif modifier.include? '<'
        current_area < target_area
      else
        true
      end
      return {} unless resize

      scale = 1.0 / Math.sqrt(current_area/target_area)
      { :width  => (self.width*scale+0.25).floor,
        :height => (self.height*scale+0.25).floor }
    end

    # Every non-area mode: explicit sizes, percentages and the !, >, <
    # qualifiers.
    def edge_mode_dimensions(requested_width, requested_height, modifier)
      width  = requested_width
      height = requested_height

      # when supplied with only a width or a height
      # infer the other using the page's aspect ratio.
      if width && !height
        height = (width/self.aspect+0.5).floor
      elsif height && !width
        width = (self.width.to_f/self.height*height+0.5).floor
      end

      # If proportional mode is requested the numbers are percentages. A lone
      # value applies to both axes; taking it from the *requested* numbers
      # keeps "50x%" at 50% (the inferred pixel height is not a percentage).
      if modifier.include? '%'
        x_scale = requested_width  || requested_height
        y_scale = requested_height || requested_width
        width  = ((self.width  * x_scale / 100.0) +0.5).floor
        height = ((self.height * y_scale / 100.0) +0.5).floor
        # this is to match how GraphicsMagick works.
        requested_width  = width
        requested_height = height
      end

      # NOTE(review): GraphicsMagick appears to apply this aspect-preserving
      # adjustment when '!' is ABSENT; the condition is kept exactly as
      # written here -- confirm the intended semantics before changing it.
      if modifier.include?('!') && ((width != requested_width) || (height != requested_height))
        scale = if (requested_width == 0) || (requested_height == 0)
          1.0
        else
          width / self.width.to_f
        end

        width  = (scale*self.width+0.5).floor
        height = (scale*self.height+0.5).floor
      end

      # '>' caps each dimension at the page's own size...
      if modifier.include? '>'
        width  = self.width  if self.width  < width
        height = self.height if self.height < height
      end

      # ...and '<' raises each dimension to at least the page's own size.
      if modifier.include? '<'
        width  = self.width  if self.width  > width
        height = self.height if self.height > height
      end

      { :width => width.floor, :height => height.floor }
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module PDFShaver
  # An enumerable selection of pages from a Document. Page objects are built
  # on demand from the stored 1-based page numbers.
  class PageSet
    include Enumerable

    attr_reader :document

    # document  - object responding to `length` (the number of pages).
    # page_list - :all, a page number, a Range of page numbers, or an Array
    #             mixing numbers and Ranges. Strings are not supported yet.
    # options   - accepted for API compatibility; currently unused.
    #
    # Raises ArgumentError when `page_list` names a page outside the document.
    def initialize document, page_list=:all, options={}
      @document = document
      # Stored as an Array so first/last/[]/size agree with what `each`
      # yields; Range#first and Range#last return the bounds themselves, so
      # (1...3).last would otherwise report page 3.
      @page_list = extract_page_numbers(page_list).to_a
    end

    # Yields a Page for every selected page number.
    def each(&block)
      enumerator.each(&block)
    end

    # Returns the Page at position `page_index` within this set (0-based).
    def [](page_index)
      Page.new(@document, @page_list[page_index])
    end

    # Returns the first Page of the set.
    def first
      Page.new(@document, @page_list.first)
    end

    # Returns the last Page of the set.
    def last
      Page.new(@document, @page_list.last)
    end

    # Number of pages in the set.
    def size
      @page_list.size
    end

    private

    def enumerator
      Enumerator.new do |yielder|
        @page_list.each do |page_number|
          yielder.yield Page.new(self.document, page_number)
        end
      end
    end

    # Normalises the caller's page specification into a list of page numbers.
    # Raises ArgumentError for anything that is not a valid specification.
    def extract_page_numbers(inputs)
      case inputs
      when :all
        Range.new(1,self.document.length)
      when Numeric
        raise ArgumentError, "#{inputs} is not a valid page number" unless valid_page_number?(inputs)
        [inputs]
      when Range
        unless valid_page_range?(inputs)
          raise ArgumentError, "#{inputs} did not fall in a valid range of pages (#{1..self.document.length})"
        end
        inputs
      when Array
        numbers = inputs.flatten.flat_map do |input|
          if valid_page_number?(input)
            [input]
          elsif valid_page_range?(input)
            input.to_a
          elsif input.kind_of?(String)
            # Only strings go to the (unimplemented) string parser; routing
            # every leftover value there hid the error message below.
            valid_page_string?(input)
          else
            raise ArgumentError, "#{input} is not a valid page or list of pages (as part of #{inputs})"
          end
        end
        numbers.sort
      when String
        valid_page_string?(inputs)
      else
        raise ArgumentError, "#{inputs.inspect} is not a valid list of pages"
      end
    end

    # True for an Integer within 1..document.length. Non-integer numerics are
    # rejected here because Page.new only accepts Integers.
    def valid_page_number?(number)
      number.kind_of?(Integer) && number > 0 && number <= self.document.length
    end

    # True for an ascending Range whose bounds are both valid page numbers.
    def valid_page_range?(range)
      range.kind_of?(Range) && range.first <= range.last &&
        valid_page_number?(range.first) && valid_page_number?(range.last)
    end

    # Placeholder: string page specifiers are not implemented and always raise.
    def valid_page_string?(input)
      raise ArgumentError, "todo: support strings as page specifiers"
    end
  end
end
|
data/pdfshaver.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Put this checkout's lib/ directory on the load path so the version file
# below resolves to the copy sitting next to this gemspec.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'pdfshaver/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'pdfshaver'
  spec.version  = PDFShaver::VERSION + ".alpha"
  spec.licenses = ['MIT']
  spec.summary  = "Shave pages off of PDFs as images"
  spec.authors  = ["Ted Han", "Nathan Stitt"]
  spec.email    = 'opensource@documentcloud.org'

  # Native extension build script
  spec.extensions = 'ext/pdfium_ruby/extconf.rb'

  # Packaged files, expanded from glob patterns
  file_patterns = [
    'lib/pdfshaver.rb',
    'lib/*/**/*',
    'ext/**/*',
    'test/**/*',
    'Gemfile',
    'pdfshaver.gemspec',
    'Rakefile',
    'Readme.md'
  ]
  spec.files = Dir.glob(file_patterns)

  # Development-only dependencies
  spec.add_development_dependency "bundler", "~> 1.5"
  %w[rake rake-compiler minitest fastimage].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
|
2
|
+
|
3
|
+
# Specs for PDFShaver::Document: construction, failure modes, length and
# equality. FIXTURES is supplied by spec_helper.
describe PDFShaver::Document do

  it "should be instantiated" do
    document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    document.must_be_instance_of PDFShaver::Document
  end

  it "should throw an error if path can't be found" do
    lambda { PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
  end

  it "should throw an error if a document can't be opened" do
    open_encrypted = lambda do
      PDFShaver::Document.new(File.join(FIXTURES, 'completely_encrypted.pdf'))
    end
    open_encrypted.must_raise ArgumentError
  end

  describe "instance methods" do
    before do
      @path     = File.join(FIXTURES, 'uncharter.pdf')
      @document = PDFShaver::Document.new(@path)
    end

    it "should have a length" do
      @document.length.must_equal 55
    end

    # Equality: a document equals itself and any other Document opened from
    # the same path, and differs from a Document opened from another file.
    it { @document.must_equal @document }
    it { @document.must_equal PDFShaver::Document.new(@path) }
    it do
      other_path = File.join(FIXTURES, 'letter-to-canadians-from-jack-layton.pdf')
      @document.wont_equal PDFShaver::Document.new(other_path)
    end
  end

end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
|
2
|
+
|
3
|
+
# Specs for the GraphicsMagick-style geometry strings accepted by
# PDFShaver::Page: first the regexp tokenisation, then the computed sizes.
describe "Resize arguments" do
  before do
    @document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    @page     = PDFShaver::Page.new(@document, 1)
  end

  describe "Syntax" do
    # Expected regexp captures for one geometry string; readable like a Hash.
    class TokenSet
      # Attribute names, in the order the constructor takes their values.
      KEYS = [:width, :height, :modifier]
      attr_reader(*KEYS)

      # helper methods so we can pretend an instance is a hash
      def [](key)
        send(key)
      end

      def []=(key, val)
        instance_variable_set("@#{key}", val)
      end

      # Pair each key with the positional argument at the same index.
      def initialize(*attributes)
        KEYS.zip(attributes).each { |key, value| self[key] = value }
      end
    end

    it "should match valid graphicsmagick strings" do
      matcher = PDFShaver::Page::GM_MATCHER
      cases = {
        "100"      => TokenSet.new(nil,   "100", nil),
        "101x102"  => TokenSet.new("101", "102", nil),
        "103x"     => TokenSet.new("103", nil,   nil),
        "x104"     => TokenSet.new(nil,   "104", nil),
        "105%"     => TokenSet.new(nil,   "105", "%"),
        "106@"     => TokenSet.new(nil,   "106", "@"),
        "107<"     => TokenSet.new(nil,   "107", "<"),
        "108>"     => TokenSet.new(nil,   "108", ">"),
        "109x110%" => TokenSet.new("109", "110", "%"),
        "x111%"    => TokenSet.new(nil,   "111", "%"),
        "112x%"    => TokenSet.new("112", nil,   "%"),
      }

      cases.each do |geometry, tokens|
        geometry.must_match(matcher)
        captures = geometry.match(matcher)
        TokenSet::KEYS.each { |key| captures[key].must_equal tokens[key] }
      end
    end
  end

  describe "Semantic" do
    # Plain width/height pair that also carries its aspect ratio.
    class Size
      attr_reader :width, :height, :aspect

      def initialize(width, height)
        @width, @height = width, height
        @aspect = @width.to_f / height
      end

      # New Size with both sides multiplied by `factor` and truncated.
      def scale(factor)
        scaled_width  = (@width * factor).to_i
        scaled_height = (@height * factor).to_i
        self.class.new(scaled_width, scaled_height)
      end
    end

    it "should specify width and height" do
      w = @page.width.to_i
      h = @page.height.to_i
      base = Size.new(w, h)

      expectations = {
        "#{w}x#{h}"        => base,
        "#{(w*0.5).to_i}x" => base.scale(0.5),
        "x#{h*2}"          => base.scale(2),
        "100x100!"         => Size.new(100, 100),
        "100x100%"         => base,
        "200x200%"         => base.scale(2),
        "200x200@"         => Size.new(176, 227),
        "1000>"            => base,
        # disabled case: "1000<" => Size.new(773, 1000),
        "500>"             => Size.new(386, 500),
        "500x>"            => Size.new(500, 647)
      }

      expectations.each do |geometry, expected|
        result = @page.extract_dimensions_from_gm_geometry_string(geometry)
        actual = Size.new(result[:width], result[:height])
        actual.aspect.must_be_within_delta expected.aspect, 0.005
        actual.width.must_be_within_delta  expected.width, 1
        actual.height.must_be_within_delta expected.height, 1
      end
    end
  end
end
|