pdfium 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.ruby-version +1 -0
- data/Gemfile +9 -0
- data/Guardfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +68 -0
- data/Rakefile +62 -0
- data/ext/pdfium_ext/bookmark.cc +221 -0
- data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
- data/ext/pdfium_ext/document.cc +268 -0
- data/ext/pdfium_ext/document.h +66 -0
- data/ext/pdfium_ext/document_wrapper.cc +63 -0
- data/ext/pdfium_ext/document_wrapper.h +56 -0
- data/ext/pdfium_ext/extconf.h +3 -0
- data/ext/pdfium_ext/extconf.rb +76 -0
- data/ext/pdfium_ext/image.cc +332 -0
- data/ext/pdfium_ext/page.cc +392 -0
- data/ext/pdfium_ext/page.h +5 -0
- data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
- data/ext/pdfium_ext/page_object_wrapper.h +27 -0
- data/ext/pdfium_ext/page_wrapper.cc +86 -0
- data/ext/pdfium_ext/page_wrapper.h +37 -0
- data/ext/pdfium_ext/pdfium.cc +115 -0
- data/ext/pdfium_ext/pdfium.h +69 -0
- data/lib/pdfium.rb +15 -0
- data/lib/pdfium/bookmark_list.rb +28 -0
- data/lib/pdfium/bounding_box.rb +16 -0
- data/lib/pdfium/image_list.rb +21 -0
- data/lib/pdfium/page_list.rb +36 -0
- data/lib/pdfium/page_sizes.rb +7 -0
- data/lib/pdfium/version.rb +4 -0
- data/pdfium.gemspec +29 -0
- data/test/benchmark-docsplit.rb +41 -0
- data/test/bookmarks_list_spec.rb +26 -0
- data/test/bookmarks_spec.rb +34 -0
- data/test/debug.rb +24 -0
- data/test/document_spec.rb +49 -0
- data/test/image_list_spec.rb +18 -0
- data/test/image_spec.rb +53 -0
- data/test/page_list_spec.rb +24 -0
- data/test/page_spec.rb +91 -0
- data/test/pdfium_spec.rb +15 -0
- data/test/profile.rb +29 -0
- data/test/spec_helper.rb +31 -0
- metadata +202 -0
data/pdfium.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'pdfium/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "pdfium"
|
8
|
+
spec.version = PDFium::VERSION
|
9
|
+
spec.authors = ["Nathan Stitt", "Ted Han"]
|
10
|
+
spec.email = ["nathan@stitt.org"]
|
11
|
+
spec.summary = %q{Ruby bindings for Google's PDFium project}
|
12
|
+
spec.description = %q{Ruby bindings for Google's PDFium project. It supports extracting text and images from PDF's as well as rendering pages to bitmaps}
|
13
|
+
spec.homepage = "https://github.com/nathanstitt/pdfium-ruby"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0")
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
22
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
23
|
+
|
24
|
+
spec.add_development_dependency "guard-minitest"
|
25
|
+
spec.add_development_dependency "guard-rake"
|
26
|
+
spec.add_development_dependency "fastimage"
|
27
|
+
spec.add_development_dependency "image_science"
|
28
|
+
spec.add_development_dependency "rake-compiler"
|
29
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rusage'
|
4
|
+
require 'docsplit'
|
5
|
+
|
6
|
+
|
7
|
+
IMAGE_SIZES = {
|
8
|
+
large: '1000x',
|
9
|
+
normal: '700x',
|
10
|
+
small: '180x',
|
11
|
+
thumbnail: '60x75!'
|
12
|
+
}
|
13
|
+
|
14
|
+
output_directory = Dir.mktmpdir
|
15
|
+
|
16
|
+
start_time = Time.now
|
17
|
+
Dir.glob("test-pdfs/*.pdf").each do |pdf|
|
18
|
+
Docsplit.extract_images(pdf, :format => :gif, :size => IMAGE_SIZES.values, :rolling => true, :output => output_directory)
|
19
|
+
end
|
20
|
+
elapsed_time = Time.now-start_time
|
21
|
+
|
22
|
+
# Process.crusage measures children
|
23
|
+
# Process.rusage measures self
|
24
|
+
usage = Process.crusage
|
25
|
+
|
26
|
+
rss = usage.maxrss.to_f
|
27
|
+
# macOS reports maxrss in bytes, while BSD and Linux report kilobytes (see "man getrusage"); normalize to kilobytes.
|
28
|
+
rss = rss/1024 if RUBY_PLATFORM =~ /darwin/
|
29
|
+
|
30
|
+
du = Dir.glob("#{output_directory}/*/**").inject(0.0){|x,img| x+File.stat(img).size }
|
31
|
+
FileUtils.rm_r output_directory
|
32
|
+
|
33
|
+
# Prints one aligned line of the benchmark summary to standard output.
#
# label     - description of the measurement; right-justified in 12 columns
# value     - numeric measurement; printed with three decimal places
# specifier - optional unit suffix such as "MB" or "Seconds"; when it is
#             empty the line ends right after the number, with no trailing
#             whitespace
def report(label, value, specifier = "")
  line = format("%12s: %8.3f", label, value)
  line << " #{specifier}" unless specifier.to_s.empty?
  puts line
end
|
36
|
+
|
37
|
+
report "Elapsed", elapsed_time, "Seconds"
|
38
|
+
report "System CPU", usage.stime
|
39
|
+
report "User CPU", usage.utime
|
40
|
+
report "Max Memory", (rss/1024),"MB"
|
41
|
+
report "Disk Space", (du/1024/1024), "MB"
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe PDFium::BookmarkList do
|
4
|
+
|
5
|
+
let(:api){ load_document("with_bookmarks") }
|
6
|
+
let(:utf){ load_document("example_utf8") }
|
7
|
+
|
8
|
+
it "can be created" do
|
9
|
+
assert_kind_of PDFium::BookmarkList, api.bookmarks
|
10
|
+
end
|
11
|
+
|
12
|
+
it "can be empty" do
|
13
|
+
bm = PDFium::Bookmark.new(document: api)
|
14
|
+
assert_kind_of PDFium::BookmarkList, bm.children
|
15
|
+
assert bm.children.empty?, "First bookmark shouldn't have any children"
|
16
|
+
end
|
17
|
+
|
18
|
+
it "can iterate" do
|
19
|
+
count = 0
|
20
|
+
utf.bookmarks.each do | bm |
|
21
|
+
count +=1
|
22
|
+
end
|
23
|
+
assert_equal 16, count
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe PDFium::Bookmark do
|
4
|
+
let(:api){ load_document("with_bookmarks") }
|
5
|
+
|
6
|
+
it "can be read" do
|
7
|
+
bm = PDFium::Bookmark.new(document: api)
|
8
|
+
assert_equal Encoding::UTF_16LE, bm.title.encoding
|
9
|
+
assert_equal "INDEX", bm.title.encode!("ASCII-8BIT")
|
10
|
+
end
|
11
|
+
|
12
|
+
it "can create siblings" do
|
13
|
+
bm = PDFium::Bookmark.new(document: api)
|
14
|
+
second = bm.next_sibling
|
15
|
+
assert_kind_of PDFium::Bookmark, second
|
16
|
+
assert_equal "Chapter 1", second.title.encode!("ASCII-8BIT")
|
17
|
+
refute second.next_sibling, "PDF shouldn't have 3 top level bookmarks"
|
18
|
+
end
|
19
|
+
|
20
|
+
it "can create children" do
|
21
|
+
bm = PDFium::Bookmark.new(document: api)
|
22
|
+
children = bm.next_sibling.children
|
23
|
+
assert_kind_of PDFium::BookmarkList, children
|
24
|
+
assert_equal 1, children.count
|
25
|
+
end
|
26
|
+
|
27
|
+
it "has destinations" do
|
28
|
+
bm = PDFium::Bookmark.new(document: api).next_sibling
|
29
|
+
assert_kind_of Hash, bm.destination
|
30
|
+
assert_equal :destination, bm.destination[:type]
|
31
|
+
assert_equal 1, bm.destination[:page_number]
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
data/test/debug.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/pdfium'
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
|
7
|
+
path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf')
|
8
|
+
|
9
|
+
# guide = PDFium::Document.new( path )
|
10
|
+
# `rm /tmp/images/*`
|
11
|
+
# page = guide.page_at(0)
|
12
|
+
|
13
|
+
|
14
|
+
# path.write page.as_image(height: 120).data('jpg')
|
15
|
+
|
16
|
+
data = path.read
|
17
|
+
puts data.length
|
18
|
+
pdf = PDFium::Document.from_memory(data)
|
19
|
+
|
20
|
+
puts pdf.page_count
|
21
|
+
|
22
|
+
# page.each_image do |img|
|
23
|
+
# img.save("/tmp/images/#{img.index}.png")
|
24
|
+
# end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe PDFium::Document do
|
4
|
+
let(:guide){ load_document("example_images") }
|
5
|
+
|
6
|
+
it "can create a new empty pdf" do
|
7
|
+
pdf = PDFium::Document.new
|
8
|
+
assert pdf
|
9
|
+
end
|
10
|
+
|
11
|
+
it "can be initialized from string" do
|
12
|
+
data = pdf_path("with_bookmarks").read
|
13
|
+
pdf = PDFium::Document.from_memory(data)
|
14
|
+
assert pdf
|
15
|
+
assert_equal 3, pdf.page_count
|
16
|
+
end
|
17
|
+
|
18
|
+
it "counts pdf pages" do
|
19
|
+
assert_equal 3, guide.page_count
|
20
|
+
end
|
21
|
+
|
22
|
+
it "can save to a file" do
|
23
|
+
pdf = PDFium::Document.new
|
24
|
+
PDFium::Page.create(pdf,0)
|
25
|
+
Tempfile.open(['test','.pdf']) do |f|
|
26
|
+
pdf.save(f.path)
|
27
|
+
reloaded = PDFium::Document.new(f.path)
|
28
|
+
assert_equal 1, reloaded.page_count
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "returns pages" do
|
33
|
+
assert guide
|
34
|
+
assert_kind_of PDFium::PageList, guide.pages
|
35
|
+
end
|
36
|
+
|
37
|
+
it "can read metadata" do
|
38
|
+
assert_equal "mPDF 5.1", guide.metadata[:producer].encode!("ASCII-8BIT")
|
39
|
+
end
|
40
|
+
|
41
|
+
it "can write metadata" do
|
42
|
+
guide.metadata do | md |
|
43
|
+
md[:author] = "My Little Writer"
|
44
|
+
end
|
45
|
+
after_saving(guide) do | saved |
|
46
|
+
assert_equal "My Little Writer", saved.metadata[:author].encode!("ASCII-8BIT")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe PDFium::ImageList do
|
4
|
+
|
5
|
+
let(:image_doc){ load_document("example_images").page_at(0) }
|
6
|
+
let(:blank_page){ load_document("example_utf8").page_at(0) }
|
7
|
+
|
8
|
+
it "can be empty" do
|
9
|
+
assert blank_page.images.none?, "images found where there should not be"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "can iterate" do
|
13
|
+
count = 0
|
14
|
+
image_doc.images.each{|i| count+=1 }
|
15
|
+
assert_equal 26, count
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
data/test/image_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
require 'image_science'
|
3
|
+
|
4
|
+
describe PDFium::Image do
|
5
|
+
let(:guide){ load_document("example_images") }
|
6
|
+
let(:page) { guide.page_at(0) }
|
7
|
+
|
8
|
+
it "saves as various formats" do
|
9
|
+
%w{png jpeg tiff bmp gif}.each do | ext |
|
10
|
+
file = Tempfile.new(['test', ".#{ext}"])
|
11
|
+
width = rand(200) + 100
|
12
|
+
height = rand(300) + 100
|
13
|
+
page.as_image(width: width, height: height).save(file.path)
|
14
|
+
assert_size "#{width}x#{height}", file.path
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
it "dumps to string" do
|
20
|
+
file = Tempfile.new(['test',".jpeg"])
|
21
|
+
file.write page.as_image(height: 120).data('jpg')
|
22
|
+
file.flush
|
23
|
+
assert_size "84x120", file.path
|
24
|
+
end
|
25
|
+
|
26
|
+
it "iterates over page images" do
|
27
|
+
valid_sizes = [567, 284, 386, 227, 500, 939, 950, 959]
|
28
|
+
count = 0
|
29
|
+
page.each_image do |img|
|
30
|
+
count += 1
|
31
|
+
next if count % 4 == 0 # to speed up spec runs, skip every 4th image (the other three in four are still checked)
|
32
|
+
assert_kind_of PDFium::Image, img
|
33
|
+
file = Tempfile.new(['test',".png"])
|
34
|
+
img.save(file.path)
|
35
|
+
assert_includes valid_sizes, FastImage.size(file).first
|
36
|
+
assert_includes valid_sizes, FastImage.size(file).last
|
37
|
+
end
|
38
|
+
assert_equal 26, count, "Incorrect # of images counted"
|
39
|
+
end
|
40
|
+
|
41
|
+
it "can return an ImageScience instance" do
|
42
|
+
image = page.images.first
|
43
|
+
assert image
|
44
|
+
ims = image.as_science
|
45
|
+
file = Tempfile.new(['test', ".jpg"]) # suffix needs the leading dot so the temp file actually ends in ".jpg"
|
46
|
+
ims.cropped_thumbnail(100) do |thumb|
|
47
|
+
thumb.save file.path
|
48
|
+
assert_size "100x100", file.path
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe PDFium::PageList do
|
4
|
+
|
5
|
+
let(:guide){ load_document("with_bookmarks") }
|
6
|
+
|
7
|
+
it "can be empty" do
|
8
|
+
pdf = PDFium::Document.new
|
9
|
+
pages = pdf.pages
|
10
|
+
assert_kind_of PDFium::PageList, pages
|
11
|
+
assert pages.none?, "A freshly created Document shouldn't have any pages"
|
12
|
+
end
|
13
|
+
|
14
|
+
it "can iterate" do
|
15
|
+
count = 0
|
16
|
+
guide.pages.each{ count += 1 }
|
17
|
+
assert_equal 3, count
|
18
|
+
end
|
19
|
+
|
20
|
+
it "supports access by index" do
|
21
|
+
assert_equal 1, guide.pages[0].number
|
22
|
+
assert_equal 2, guide.pages[1].number
|
23
|
+
end
|
24
|
+
end
|
data/test/page_spec.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require_relative 'spec_helper'
|
3
|
+
|
4
|
+
describe PDFium::Page do
|
5
|
+
let(:utfdoc){ PDFium::Document.new( pdf_path("example_utf8") ) }
|
6
|
+
let(:textdoc){ PDFium::Document.new( pdf_path("example_divs") ) }
|
7
|
+
|
8
|
+
let(:page) { PDFium::Page.open(utfdoc,0) }
|
9
|
+
|
10
|
+
|
11
|
+
it "opens existing page" do
|
12
|
+
assert_kind_of PDFium::Page, page
|
13
|
+
end
|
14
|
+
|
15
|
+
it "creates a page" do
|
16
|
+
page = PDFium::Page.create(utfdoc,0)
|
17
|
+
assert_kind_of PDFium::Page, page
|
18
|
+
end
|
19
|
+
|
20
|
+
it "adds pages to an existing document" do
|
21
|
+
pdf = PDFium::Document.new
|
22
|
+
PDFium::Page.create(pdf)
|
23
|
+
PDFium::Page.create(pdf)
|
24
|
+
assert_equal 2, pdf.page_count
|
25
|
+
end
|
26
|
+
|
27
|
+
it "has dimensions" do
|
28
|
+
assert_in_delta 595.28, page.width
|
29
|
+
assert_in_delta 841.89, page.height
|
30
|
+
end
|
31
|
+
|
32
|
+
it "refuses to open invalid page ranges" do
|
33
|
+
assert_raises(RangeError) do
|
34
|
+
PDFium::Page.open(utfdoc,-1)
|
35
|
+
end
|
36
|
+
assert_raises(RangeError) do
|
37
|
+
PDFium::Page.open(utfdoc,90)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
it "creates new pages" do
|
42
|
+
pdf = PDFium::Document.new
|
43
|
+
page = PDFium::Page.create(pdf,0, width:100, height:180)
|
44
|
+
assert_equal 100, page.width
|
45
|
+
after_saving(pdf) do | saved |
|
46
|
+
assert_equal 1, saved.page_count
|
47
|
+
assert_equal 100.0, saved.page_at(0).width
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
it "can't be created using new" do
|
52
|
+
assert_raises(RuntimeError){ PDFium::Page.new }
|
53
|
+
end
|
54
|
+
|
55
|
+
it "can load/unload page" do
|
56
|
+
pdf = PDFium::Document.new
|
57
|
+
10.times do
|
58
|
+
page = PDFium::Page.create(pdf)
|
59
|
+
page.unload
|
60
|
+
end
|
61
|
+
assert_equal 10, pdf.page_count
|
62
|
+
page = pdf.page_at(1)
|
63
|
+
assert_equal 612.0, page.width
|
64
|
+
page.unload
|
65
|
+
assert_equal 612.0, page.width
|
66
|
+
end
|
67
|
+
|
68
|
+
it "can read text" do
|
69
|
+
ascii_text = PDFium::Page.open(textdoc,1).text.encode!("ASCII-8BIT")
|
70
|
+
assert_match /Cras tellus. Fusce aliquet/, ascii_text
|
71
|
+
end
|
72
|
+
|
73
|
+
it "can read utf text" do
|
74
|
+
strings = [
|
75
|
+
"Жълтата дюля беше щастлива",
|
76
|
+
"Jove xef, porti whisky amb quinze glaçons d'hidrogen",
|
77
|
+
"Příliš žluťoučký kůň úpěl ďábelské ódy",
|
78
|
+
"Høj bly gom vandt fræk sexquiz på wc",
|
79
|
+
"Doch Bep, flink sexy qua vorm, zwijgt",
|
80
|
+
"Törkylempijä vongahdus",
|
81
|
+
"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
|
82
|
+
]
|
83
|
+
text = PDFium::Page.open(utfdoc,0).text
|
84
|
+
utf8 = text.encode("UTF-8")
|
85
|
+
strings.each do | sentence |
|
86
|
+
assert_match sentence, utf8
|
87
|
+
assert_match sentence.encode("UTF-16LE"), text
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
data/test/pdfium_spec.rb
ADDED
data/test/profile.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/pdfium'
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
puts "Waiting for profiler attachment (PID: #{Process.pid})\nPress enter to continue"
|
7
|
+
gets
|
8
|
+
|
9
|
+
path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf').to_s
|
10
|
+
|
11
|
+
guide = PDFium::Document.new( path )
|
12
|
+
`rm /tmp/images/*`
|
13
|
+
page = guide.page_at(0)
|
14
|
+
page.each_image do |img|
|
15
|
+
img.save("/tmp/images/#{img.index}.png")
|
16
|
+
end
|
17
|
+
|
18
|
+
# count = 0
|
19
|
+
# pdf.bookmarks.each do | bm |
|
20
|
+
# count +=1
|
21
|
+
# print count.to_s + " "
|
22
|
+
# puts bm.title
|
23
|
+
# end
|
24
|
+
|
25
|
+
# pdf.each_page do | page |
|
26
|
+
# page.width
|
27
|
+
# end
|
28
|
+
|
29
|
+
# GC.start(full_mark: true, immediate_sweep: true)
|