pdfshaver 0.0.1.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/Rakefile +10 -0
- data/Readme.md +14 -0
- data/ext/pdfium_ruby/document.cpp +131 -0
- data/ext/pdfium_ruby/document.h +53 -0
- data/ext/pdfium_ruby/extconf.rb +68 -0
- data/ext/pdfium_ruby/page.cpp +190 -0
- data/ext/pdfium_ruby/page.h +36 -0
- data/ext/pdfium_ruby/pdfium_ruby.cpp +17 -0
- data/ext/pdfium_ruby/pdfium_ruby.h +20 -0
- data/lib/pdfshaver.rb +13 -0
- data/lib/pdfshaver/document.rb +21 -0
- data/lib/pdfshaver/page.rb +117 -0
- data/lib/pdfshaver/page_set.rb +83 -0
- data/lib/pdfshaver/version.rb +3 -0
- data/pdfshaver.gemspec +29 -0
- data/test/document_spec.rb +36 -0
- data/test/fixtures/completely_encrypted.pdf +0 -0
- data/test/fixtures/encrypted.pdf +0 -0
- data/test/fixtures/letter-to-canadians-from-jack-layton.pdf +0 -0
- data/test/fixtures/uncharter.pdf +0 -0
- data/test/gm_compatability_spec.rb +92 -0
- data/test/page_set_spec.rb +62 -0
- data/test/page_spec.rb +133 -0
- data/test/spec_helper.rb +13 -0
- metadata +140 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
// Include guard. Identifiers containing a double underscore are reserved for
// the C++ implementation, so the guard uses a plain project-prefixed name.
#ifndef PDFIUM_RUBY_PAGE_H
#define PDFIUM_RUBY_PAGE_H

// forward declaration since Page/Document classes are interdependent
class Document;
#include "pdfium_ruby.h"
#include "document.h"

// C++ side of a single PDF page: pairs an FPDF_PAGE handle with the Document
// it was loaded from. The method bodies live outside this header.
class Page {
  public:
    Page();

    // Binds this object to `page_number` of `document`.
    // NOTE(review): presumably `page_number` is the zero-based index (it is
    // stored in `page_index`) and the return value signals success -- confirm
    // against the implementation.
    bool load(Document* document, int page_number);

    // Page geometry accessors.
    // NOTE(review): units are not visible from this header -- confirm.
    double width();
    double height();
    double aspect();

    // Renders the page at `width` x `height` to the file named by `path`.
    // NOTE(review): output format and the meaning of the return value are
    // defined by the implementation -- confirm there.
    bool render(char* path, int width, int height);

    ~Page();

  private:
    int page_index;       // index of this page within `document`
    bool opened;          // presumably true once `fpdf_page` is valid -- confirm
    Document *document;   // owning document (raw, interdependent pointer)
    FPDF_PAGE fpdf_page;  // underlying PDFium page handle
};

// Ruby binding entry points for the Page class.
void Define_Page();
VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self);
VALUE page_render(int arg_count, VALUE* args, VALUE self);
VALUE page_allocate(VALUE rb_PDFShaver_Page);
// Kept `static` so the declaration keeps matching its existing definition.
static void destroy_page(Page* page);

#endif
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#include "pdfium_ruby.h"
|
2
|
+
|
3
|
+
#include "fpdfview.h"
|
4
|
+
|
5
|
+
extern "C"
|
6
|
+
void Init_pdfium_ruby (void) {
|
7
|
+
// Initialize PDFium
|
8
|
+
FPDF_InitLibrary();
|
9
|
+
|
10
|
+
// Define `PDFShaver` module as a namespace for all of our other objects
|
11
|
+
VALUE rb_PDFShaver = rb_define_module("PDFShaver");
|
12
|
+
|
13
|
+
// Define `Document` and `Page` classes
|
14
|
+
Define_Document();
|
15
|
+
Define_Page();
|
16
|
+
//Define_PageSet();
|
17
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
// Include guard. Identifiers containing a double underscore are reserved for
// the C++ implementation, so the guard uses a plain project-prefixed name.
#ifndef PDFIUM_RUBY_H
#define PDFIUM_RUBY_H

// ruby.h is included with C linkage.
extern "C" {
  #include "ruby.h"
}

// Inspired by https://github.com/jasonroelofs/rice/blob/1740a6d12c99fce8c21eda3c5385738318ab9172/rice/detail/ruby.hpp#L33-L37
// Casts C functions into a type that C++ is happy calling
extern "C" typedef VALUE (*CPP_RUBY_METHOD_FUNC)(ANYARGS);

// Registration hooks, one per Ruby class exposed by the extension.
void Define_Document();
void Define_Page();
void Define_PageSet();

// Debugging helper: prints a C string by calling Ruby's `puts`.
inline void ruby_puts_cstring(const char* str) {
  rb_funcall(rb_cObject, rb_intern("puts"), 1, rb_str_new_cstr(str));
}

// Debugging helper: prints an existing Ruby value by calling Ruby's `puts`.
inline void ruby_puts_values(VALUE str) {
  rb_funcall(rb_cObject, rb_intern("puts"), 1, str);
}

#endif
|
data/lib/pdfshaver.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Top-level namespace of the gem, together with the error classes it exposes.
module PDFShaver
  # Error type for encryption problems.
  # NOTE(review): where this is raised is not visible here -- confirm.
  class EncryptionError < StandardError
  end

  # Error type for files that are not in a valid format.
  # NOTE(review): where this is raised is not visible here -- confirm.
  class InvalidFormatError < StandardError
  end

  # Error type for a missing handler.
  # NOTE(review): where this is raised is not visible here -- confirm.
  class MissingHandlerError < StandardError
  end
end
|
6
|
+
|
7
|
+
%w[
|
8
|
+
document
|
9
|
+
page
|
10
|
+
page_set
|
11
|
+
version
|
12
|
+
].each { |file| require_relative File.join('pdfshaver', file) }
|
13
|
+
require_relative 'pdfium_ruby'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module PDFShaver
  # A PDF file on disk, opened through the native extension
  # (`open_document_with_pdfium` is not defined in this file).
  class Document
    # length - NOTE(review): presumably the page count, set by the native
    #          extension while opening -- confirm.
    # path   - the path the document was created with.
    attr_reader :length, :path

    # Opens the PDF at `path`.
    #
    # path    - location of the PDF file.
    # options - accepted for interface compatibility; currently unused.
    #
    # Raises ArgumentError when nothing exists at `path`.
    def initialize(path, options = {})
      # File.exists? was deprecated and then removed in Ruby 3.2; File.exist?
      # is the supported spelling on every Ruby version.
      raise ArgumentError, "Can't find a file at '#{path}' to open" unless File.exist?(path)

      @path = path
      open_document_with_pdfium(path)
    end

    # Two documents are equal when their paths resolve to the same file.
    # Anything that is not a Document is simply unequal (instead of blowing
    # up with NoMethodError on the missing `path`).
    def ==(other)
      return false unless other.kind_of?(Document)

      File.realpath(path) == File.realpath(other.path)
    end

    # Returns a PageSet over this document, limited to `page_list`
    # (every page by default).
    def pages(page_list = :all)
      PageSet.new(self, page_list)
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module PDFShaver
  # A single page of a Document. The page geometry (`width`, `height`,
  # `aspect`) is filled in by `initialize_page_internals`, which is not
  # defined in this file.
  class Page
    # Parses GraphicsMagick-style geometry strings such as "100", "101x102",
    # "103x", "x104" or "50%". Anchored with \A / \z so the WHOLE string must
    # be a geometry (with ^ / $ a valid first line followed by arbitrary
    # text was accepted).
    GM_MATCHER = /\A\s*((?<width>\d+)x((?<height>\d+))?|x?(?<height>\d+))(?<modifier>[@%!<>^]+)?\s*\z/
    attr_reader :document, :width, :height, :aspect, :number, :index

    # document - the PDFShaver::Document this page belongs to.
    # number   - one-based page number (1..document.length).
    # options  - accepted for interface compatibility; currently unused.
    #
    # Raises ArgumentError for a non-Document, a non-Integer number or a
    # number outside the document.
    def initialize(document, number, options = {})
      raise ArgumentError unless document.kind_of? PDFShaver::Document
      raise ArgumentError unless number.kind_of? Integer
      raise ArgumentError unless number > 0 && number <= document.length

      @number   = number
      @index    = number - 1 # zero-based index handed to the internals
      @document = document
      initialize_page_internals document, @index
    end

    # Pages are equal when they share a document and an index.
    # Raises ArgumentError when `other` is not a Page.
    def ==(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      (self.document == other.document) && (self.index == other.index)
    end

    # Orders pages by index. Raises ArgumentError when `other` is not a Page.
    def <=>(other)
      raise ArgumentError, "unable to compare #{self.class} with #{other.class}" unless other.kind_of? self.class
      self.index <=> other.index
    end

    # Converts a GraphicsMagick geometry string into pixel dimensions for
    # this page.
    #
    # arg - geometry such as "200x", "x300", "50%", "200x200@" or "1000>".
    #       Converted with #to_s, so nil or other non-String input fails with
    #       ArgumentError instead of NoMethodError.
    #
    # Returns a Hash with Integer :width and :height.
    # Raises ArgumentError when `arg` is not a valid geometry.
    def extract_dimensions_from_gm_geometry_string(arg)
      match = GM_MATCHER.match(arg.to_s)
      raise ArgumentError, "unable to extract width & height from '#{arg}'" if match.nil?

      # grab parsed tokens
      requested_width  = match[:width]  && match[:width].to_f
      requested_height = match[:height] && match[:height].to_f
      modifier         = match[:modifier] || ""

      ## Algorithm ported from GraphicsMagick's GetMagickGeometry function.
      # the '@' option precludes all other options.
      dimensions = if modifier.include?('@')
        gm_area_dimensions(requested_width, requested_height, modifier)
      else
        gm_box_dimensions(requested_width, requested_height, modifier)
      end

      # Area mode may decide not to resize; fall back to the page's own size.
      dimensions[:width]  ||= self.width.floor
      dimensions[:height] ||= self.height.floor
      dimensions
    end

    private

    # '@' mode: scale the page so that its area matches the requested area.
    # Returns an empty Hash when a '>' / '<' qualifier says not to resize.
    def gm_area_dimensions(requested_width, requested_height, modifier)
      current_area = self.width * self.height
      target_area  = (requested_width || 1) * (requested_height || 1)

      resize = if modifier.include?('>')
        current_area > target_area
      elsif modifier.include?('<')
        current_area < target_area
      else
        true
      end
      return {} unless resize

      scale = 1.0 / Math.sqrt(current_area / target_area)
      { width: (self.width * scale + 0.25).floor, height: (self.height * scale + 0.25).floor }
    end

    # Every non-area mode: explicit size, percentage, '!', '>' and '<'.
    def gm_box_dimensions(requested_width, requested_height, modifier)
      target_width  = requested_width
      target_height = requested_height

      # when supplied with only a width or a height
      # infer the other using the page's aspect ratio.
      if target_width && !target_height
        target_height = (target_width / self.aspect + 0.5).floor
      elsif target_height && !target_width
        target_width = (self.width.to_f / self.height * target_height + 0.5).floor
      end

      # Proportional mode: the numbers are percentages of the page size.
      if modifier.include?('%')
        # A single percentage applies to both axes. Use the value the caller
        # actually supplied: for a width-only geometry ("50x%") the inferred
        # height is a pixel count, not a percentage, and must not be used
        # as the scale.
        x_scale = requested_width || requested_height
        y_scale = requested_height || requested_width
        target_width  = ((self.width * x_scale / 100.0) + 0.5).floor
        target_height = ((self.height * y_scale / 100.0) + 0.5).floor
        # this is to match how GraphicsMagick works.
        requested_width  = target_width
        requested_height = target_height
      end

      if modifier.include?('!') && ((target_width != requested_width) || (target_height != requested_height))
        scale = if (requested_width == 0) || (requested_height == 0)
          1.0
        else
          target_width / self.width
        end

        target_width  = (scale * self.width + 0.5).floor
        target_height = (scale * self.height + 0.5).floor
      end

      # '>' caps each axis at the page's own size.
      if modifier.include?('>')
        target_width  = self.width  if self.width < target_width
        target_height = self.height if self.height < target_height
      end

      # '<' raises each axis to at least the page's own size.
      if modifier.include?('<')
        target_width  = self.width  if self.width > target_width
        target_height = self.height if self.height > target_height
      end

      { width: target_width.floor, height: target_height.floor }
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module PDFShaver
  # An enumerable selection of pages from a Document. Page objects are built
  # on demand from the stored page numbers.
  class PageSet
    include Enumerable

    attr_reader :document

    # document  - object whose `length` bounds the valid page numbers.
    # page_list - :all, a single page number, a Range, or an Array mixing
    #             page numbers and Ranges (nested Arrays are flattened).
    # options   - accepted for interface compatibility; currently unused.
    #
    # Raises ArgumentError when `page_list` names a page outside the document
    # or is of an unsupported type (Strings are not supported yet).
    def initialize(document, page_list = :all, options = {})
      @document  = document
      @page_list = extract_page_numbers(page_list)
    end

    # Yields a Page for every selected page number; returns an Enumerator
    # when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @page_list.each { |page_number| yield Page.new(document, page_number) }
    end

    # Returns the Page at position `page_index` within the selection.
    def [](page_index)
      Page.new(@document, @page_list.to_a[page_index])
    end

    # Returns the first selected Page.
    def first
      Page.new(@document, @page_list.first)
    end

    # Returns the last selected Page.
    def last
      Page.new(@document, @page_list.last)
    end

    # Number of selected pages.
    def size
      @page_list.size
    end

    private

    # Normalises the caller's page specification into a Range or Array of
    # page numbers, validating every entry against the document length.
    def extract_page_numbers(inputs)
      case inputs
      when :all
        Range.new(1, document.length)
      when Numeric
        raise ArgumentError, "#{inputs} is not a valid page number" unless valid_page_number?(inputs)
        [inputs]
      when Range
        unless valid_page_range?(inputs)
          raise ArgumentError, "#{inputs} did not fall in a valid range of pages (#{1..document.length})"
        end
        inputs
      when Array
        numbers = []
        inputs.flatten.each do |input|
          if valid_page_number?(input)
            numbers.push(input)
          elsif valid_page_range?(input)
            numbers.concat(input.to_a)
          elsif input.kind_of?(String)
            # Only Strings reach the (not yet implemented) string parser.
            # Previously every invalid entry ended up there, so a bad page
            # number was reported as "todo: support strings ...".
            valid_page_string?(input)
          else
            raise ArgumentError, "#{input} is not a valid page or list of pages (as part of #{inputs})"
          end
        end
        numbers.sort
      when String
        valid_page_string?(inputs)
      else
        raise ArgumentError, "#{inputs.inspect} is not a valid list of pages"
      end
    end

    # A page number is an Integer within 1..document.length. Page.new only
    # accepts Integers, so other numerics (e.g. 1.5) are rejected here with
    # a descriptive error rather than later with a bare ArgumentError.
    def valid_page_number?(number)
      number.kind_of?(Integer) && number > 0 && number <= document.length
    end

    # A Range is valid when it is ascending and both ends are valid pages.
    def valid_page_range?(range)
      range.kind_of?(Range) &&
        valid_page_number?(range.first) && valid_page_number?(range.last) &&
        range.first <= range.last
    end

    # String page specifiers are not implemented yet; always raises.
    def valid_page_string?(input)
      raise ArgumentError, "todo: support strings as page specifiers"
    end
  end
end
|
data/pdfshaver.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Put lib/ on the load path so the version constant can be read below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'pdfshaver/version'

# Glob patterns selecting every file shipped in the gem.
packaged_patterns = %w[
  lib/pdfshaver.rb
  lib/*/**/*
  ext/**/*
  test/**/*
  Gemfile
  pdfshaver.gemspec
  Rakefile
  Readme.md
]

# Development-only gems that carry no version constraint.
unpinned_development_gems = %w[rake rake-compiler minitest fastimage]

Gem::Specification.new do |spec|
  spec.name       = 'pdfshaver'
  spec.version    = PDFShaver::VERSION + ".alpha"
  spec.licenses   = ['MIT']
  spec.summary    = "Shave pages off of PDFs as images"
  spec.authors    = ["Ted Han", "Nathan Stitt"]
  spec.email      = 'opensource@documentcloud.org'
  spec.extensions = 'ext/pdfium_ruby/extconf.rb'
  spec.files      = Dir.glob(packaged_patterns)

  spec.add_development_dependency "bundler", "~> 1.5"
  unpinned_development_gems.each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
|
2
|
+
|
3
|
+
# Specs for PDFShaver::Document: construction, failure modes and equality.
describe PDFShaver::Document do

  it "should be instantiated" do
    document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    document.must_be_instance_of PDFShaver::Document
  end

  it "should throw an error if path can't be found" do
    lambda { PDFShaver::Document.new("suede shoes") }.must_raise ArgumentError
  end

  it "should throw an error if a document can't be opened" do
    encrypted_path = File.join(FIXTURES, 'completely_encrypted.pdf')
    lambda { PDFShaver::Document.new(encrypted_path) }.must_raise ArgumentError
  end

  describe "instance methods" do
    before do
      @path     = File.join(FIXTURES, 'uncharter.pdf')
      @document = PDFShaver::Document.new(@path)
    end

    it "should have a length" do
      @document.length.must_equal 55
    end

    # Equality is reflexive, holds for a second handle on the same file,
    # and fails for a different file.
    it { @document.must_equal @document }
    it { @document.must_equal PDFShaver::Document.new(@path) }
    it do
      other_path = File.join(FIXTURES, 'letter-to-canadians-from-jack-layton.pdf')
      @document.wont_equal PDFShaver::Document.new(other_path)
    end
  end

end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__),'spec_helper'))
|
2
|
+
|
3
|
+
# Specs for the GraphicsMagick-compatible geometry strings understood by
# PDFShaver::Page: first the tokenising regexp, then the computed sizes.
describe "Resize arguments" do
  before do
    @document = PDFShaver::Document.new(File.join(FIXTURES, 'uncharter.pdf'))
    @page     = PDFShaver::Page.new(@document, 1)
  end

  describe "Syntax" do
    # Expected tokens for one geometry string, readable like a Hash.
    class TokenSet
      # Attribute names, in the order the constructor receives their values.
      KEYS = [:width, :height, :modifier]
      attr_reader(*KEYS)

      # Hash-style read access to an attribute.
      def [](key)
        self.send(key)
      end

      # Hash-style write access to an attribute.
      def []=(key, val)
        self.instance_variable_set("@#{key}", val)
      end

      # Assigns the positional values to KEYS in order.
      def initialize(*attributes)
        KEYS.zip(attributes).each { |key, value| self[key] = value }
      end
    end

    it "should match valid graphicsmagick strings" do
      expectations = {
        "100"      => TokenSet.new(nil,   "100", nil),
        "101x102"  => TokenSet.new("101", "102", nil),
        "103x"     => TokenSet.new("103", nil,   nil),
        "x104"     => TokenSet.new(nil,   "104", nil),
        "105%"     => TokenSet.new(nil,   "105", "%"),
        "106@"     => TokenSet.new(nil,   "106", "@"),
        "107<"     => TokenSet.new(nil,   "107", "<"),
        "108>"     => TokenSet.new(nil,   "108", ">"),
        "109x110%" => TokenSet.new("109", "110", "%"),
        "x111%"    => TokenSet.new(nil,   "111", "%"),
        "112x%"    => TokenSet.new("112", nil,   "%"),
      }

      expectations.each do |geometry, tokens|
        geometry.must_match(PDFShaver::Page::GM_MATCHER)
        parsed = geometry.match(PDFShaver::Page::GM_MATCHER)
        TokenSet::KEYS.each do |key|
          parsed[key].must_equal tokens[key]
        end
      end
    end
  end

  describe "Semantic" do
    # A width/height pair together with its aspect ratio.
    class Size
      attr_reader :width, :height, :aspect

      def initialize(width, height)
        @width  = width
        @height = height
        @aspect = @width.to_f / height
      end

      # Returns a new Size with both sides multiplied by `factor`
      # and truncated to integers.
      def scale(factor)
        scaled_width  = (@width * factor).to_i
        scaled_height = (@height * factor).to_i
        self.class.new(scaled_width, scaled_height)
      end
    end

    it "should specify width and height" do
      w    = @page.width.to_i
      h    = @page.height.to_i
      base = Size.new(w, h)

      cases = {
        "#{w}x#{h}"        => base,
        "#{(w*0.5).to_i}x" => base.scale(0.5),
        "x#{h*2}"          => base.scale(2),
        "100x100!"         => Size.new(100, 100),
        "100x100%"         => base,
        "200x200%"         => base.scale(2),
        "200x200@"         => Size.new(176, 227),
        "1000>"            => base,
        #"1000<"           => Size.new(773, 1000),
        "500>"             => Size.new(386, 500),
        "500x>"            => Size.new(500, 647)
      }

      cases.each do |geometry, expected|
        result = @page.extract_dimensions_from_gm_geometry_string(geometry)
        actual = Size.new(result[:width], result[:height])

        # Allow one pixel of rounding slack on each side.
        actual.aspect.must_be_within_delta expected.aspect, 0.005
        actual.width.must_be_within_delta expected.width, 1
        actual.height.must_be_within_delta expected.height, 1
      end
    end
  end
end
|