pdfium 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.ruby-version +1 -0
- data/Gemfile +9 -0
- data/Guardfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +68 -0
- data/Rakefile +62 -0
- data/ext/pdfium_ext/bookmark.cc +221 -0
- data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
- data/ext/pdfium_ext/document.cc +268 -0
- data/ext/pdfium_ext/document.h +66 -0
- data/ext/pdfium_ext/document_wrapper.cc +63 -0
- data/ext/pdfium_ext/document_wrapper.h +56 -0
- data/ext/pdfium_ext/extconf.h +3 -0
- data/ext/pdfium_ext/extconf.rb +76 -0
- data/ext/pdfium_ext/image.cc +332 -0
- data/ext/pdfium_ext/page.cc +392 -0
- data/ext/pdfium_ext/page.h +5 -0
- data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
- data/ext/pdfium_ext/page_object_wrapper.h +27 -0
- data/ext/pdfium_ext/page_wrapper.cc +86 -0
- data/ext/pdfium_ext/page_wrapper.h +37 -0
- data/ext/pdfium_ext/pdfium.cc +115 -0
- data/ext/pdfium_ext/pdfium.h +69 -0
- data/lib/pdfium.rb +15 -0
- data/lib/pdfium/bookmark_list.rb +28 -0
- data/lib/pdfium/bounding_box.rb +16 -0
- data/lib/pdfium/image_list.rb +21 -0
- data/lib/pdfium/page_list.rb +36 -0
- data/lib/pdfium/page_sizes.rb +7 -0
- data/lib/pdfium/version.rb +4 -0
- data/pdfium.gemspec +29 -0
- data/test/benchmark-docsplit.rb +41 -0
- data/test/bookmarks_list_spec.rb +26 -0
- data/test/bookmarks_spec.rb +34 -0
- data/test/debug.rb +24 -0
- data/test/document_spec.rb +49 -0
- data/test/image_list_spec.rb +18 -0
- data/test/image_spec.rb +53 -0
- data/test/page_list_spec.rb +24 -0
- data/test/page_spec.rb +91 -0
- data/test/pdfium_spec.rb +15 -0
- data/test/profile.rb +29 -0
- data/test/spec_helper.rb +31 -0
- metadata +202 -0
data/pdfium.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for the pdfium Ruby bindings.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'pdfium/version'

Gem::Specification.new do |spec|
  spec.name          = "pdfium"
  spec.version       = PDFium::VERSION
  spec.authors       = ["Nathan Stitt", "Ted Han"]
  spec.email         = ["nathan@stitt.org"]
  spec.summary       = %q{Ruby bindings for Google's PDFium project}
  spec.description   = %q{Ruby bindings for Google's PDFium project. It supports extracting text and images from PDF's as well as rendering pages to bitmaps}
  spec.homepage      = "https://github.com/nathanstitt/pdfium-ruby"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # The package ships C++ sources and an extconf.rb under ext/pdfium_ext.
  # Listing the extconf here asks RubyGems to build the extension when the
  # gem is installed.
  # NOTE(review): assumes lib/pdfium.rb loads the compiled pdfium_ext - confirm.
  spec.extensions    = ["ext/pdfium_ext/extconf.rb"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_development_dependency "guard-minitest"
  spec.add_development_dependency "guard-rake"
  spec.add_development_dependency "fastimage"
  spec.add_development_dependency "image_science"
  spec.add_development_dependency "rake-compiler"
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Benchmark script: renders every PDF under test-pdfs/ to GIF images with
# Docsplit and records elapsed time, child-process resource usage and the
# size of the generated images.

# Dir.mktmpdir and FileUtils are used below, so require them explicitly
# instead of relying on another library having loaded them.
require 'fileutils'
require 'tmpdir'
require 'rusage'
require 'docsplit'


IMAGE_SIZES = {
  large: '1000x',
  normal: '700x',
  small: '180x',
  thumbnail: '60x75!'
}

output_directory = Dir.mktmpdir

begin
  # Use the monotonic clock so the measured duration is not affected by
  # wall-clock adjustments while the benchmark runs.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  Dir.glob("test-pdfs/*.pdf").each do |pdf|
    Docsplit.extract_images(pdf, :format => :gif, :size => IMAGE_SIZES.values, :rolling => true, :output => output_directory)
  end
  elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  # Process.crusage measures children
  # Process.rusage measures self
  usage = Process.crusage

  rss = usage.maxrss.to_f
  # OSX reports in terms of bytes, bsd & linux use kb. "man getrusage"
  rss = rss/1024 if RUBY_PLATFORM =~ /darwin/

  # Total size in bytes of the generated output entries.
  du = Dir.glob("#{output_directory}/*/**").inject(0.0){|x,img| x+File.stat(img).size }
ensure
  # Always remove the scratch directory, even when extraction fails.
  FileUtils.rm_r output_directory
end
|
32
|
+
|
33
|
+
# Prints one benchmark result row to standard output.
#
# label     - text shown right-aligned in a 12 character column
# value     - number shown with three decimals in an 8 character column
# specifier - optional unit text appended after the value
#
# Returns nil.
def report(label, value, specifier = "")
  row = format("%12s: %8.3f %s\n", label, value, specifier)
  $stdout.write(row)
  nil
end
|
36
|
+
|
37
|
+
# Print the collected measurements, one row per metric. Rows without a
# third element fall back to report's default (empty) unit.
[
  ["Elapsed", elapsed_time, "Seconds"],
  ["System CPU", usage.stime],
  ["User CPU", usage.utime],
  ["Max Memory", (rss/1024), "MB"],
  ["Disk Space", (du/1024/1024), "MB"]
].each do |row|
  report(*row)
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative 'spec_helper'

# Exercises PDFium::BookmarkList: construction from a document, the empty
# case, and iteration.
describe PDFium::BookmarkList do
  let(:api) { load_document("with_bookmarks") }
  let(:utf) { load_document("example_utf8") }

  it "can be created" do
    assert_kind_of PDFium::BookmarkList, api.bookmarks
  end

  it "can be empty" do
    bookmark = PDFium::Bookmark.new(document: api)
    assert_kind_of PDFium::BookmarkList, bookmark.children
    assert bookmark.children.empty?, "First bookmark shouldn't have any children"
  end

  it "can iterate" do
    visited = 0
    utf.bookmarks.each { |_bookmark| visited += 1 }
    assert_equal 16, visited
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'spec_helper'

# Exercises PDFium::Bookmark: title, sibling and child navigation, and
# destination lookup, using the "with_bookmarks" fixture document.
describe PDFium::Bookmark do
  let(:api) { load_document("with_bookmarks") }

  it "can be read" do
    first = PDFium::Bookmark.new(document: api)
    assert_equal Encoding::UTF_16LE, first.title.encoding
    assert_equal "INDEX", first.title.encode!("ASCII-8BIT")
  end

  it "can create siblings" do
    first = PDFium::Bookmark.new(document: api)
    second = first.next_sibling
    assert_kind_of PDFium::Bookmark, second
    assert_equal "Chapter 1", second.title.encode!("ASCII-8BIT")
    refute second.next_sibling, "PDF shouldn't have 3 top level bookmarks"
  end

  it "can create children" do
    first = PDFium::Bookmark.new(document: api)
    kids = first.next_sibling.children
    assert_kind_of PDFium::BookmarkList, kids
    assert_equal 1, kids.count
  end

  it "has destinations" do
    chapter = PDFium::Bookmark.new(document: api).next_sibling
    assert_kind_of Hash, chapter.destination
    assert_equal :destination, chapter.destination[:type]
    assert_equal 1, chapter.destination[:page_number]
  end
end
|
data/test/debug.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Scratch script: loads the example_images fixture into memory, prints its
# size in bytes, opens it with PDFium::Document.from_memory and prints the
# page count.

require_relative '../lib/pdfium'
require 'pathname'


path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf')

# guide = PDFium::Document.new( path )
# `rm /tmp/images/*`
# page = guide.page_at(0)


# path.write page.as_image(height: 120).data('jpg')

# A PDF is binary data: read it in binary mode so no newline or encoding
# translation is applied, and report its size in bytes rather than in
# characters of whatever the default external encoding happens to be.
data = path.binread
puts data.bytesize
pdf = PDFium::Document.from_memory(data)

puts pdf.page_count

# page.each_image do |img|
#     img.save("/tmp/images/#{img.index}.png")
# end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require_relative 'spec_helper'

# Exercises PDFium::Document: creation, loading from a string, page
# counting, saving, page listing and metadata access.
describe PDFium::Document do
  let(:guide) { load_document("example_images") }

  it "can create a new empty pdf" do
    assert PDFium::Document.new
  end

  it "can be initialized from string" do
    raw = pdf_path("with_bookmarks").read
    loaded = PDFium::Document.from_memory(raw)
    assert loaded
    assert_equal 3, loaded.page_count
  end

  it "counts pdf pages" do
    assert_equal 3, guide.page_count
  end

  it "can save to a file" do
    document = PDFium::Document.new
    PDFium::Page.create(document,0)
    Tempfile.open(['test','.pdf']) do |tmp|
      document.save(tmp.path)
      # Re-open the written file to confirm the single page round-trips.
      reopened = PDFium::Document.new(tmp.path)
      assert_equal 1, reopened.page_count
    end
  end

  it "returns pages" do
    assert guide
    assert_kind_of PDFium::PageList, guide.pages
  end

  it "can read metadata" do
    assert_equal "mPDF 5.1", guide.metadata[:producer].encode!("ASCII-8BIT")
  end

  it "can write metadata" do
    guide.metadata { |fields| fields[:author] = "My Little Writer" }
    after_saving(guide) do |reloaded|
      assert_equal "My Little Writer", reloaded.metadata[:author].encode!("ASCII-8BIT")
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative 'spec_helper'

# Exercises PDFium::ImageList on a page with images and on a page without.
describe PDFium::ImageList do
  let(:image_doc)  { load_document("example_images").page_at(0) }
  let(:blank_page) { load_document("example_utf8").page_at(0) }

  it "can be empty" do
    assert blank_page.images.none?, "images found where there should not be"
  end

  it "can iterate" do
    seen = 0
    image_doc.images.each do |_image|
      seen += 1
    end
    assert_equal 26, seen
  end
end
|
data/test/image_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative 'spec_helper'
require 'image_science'

# Exercises PDFium::Image: saving rendered pages in several formats,
# dumping image data to a string, iterating over the images embedded in a
# page, and conversion to an ImageScience instance.
describe PDFium::Image do
  let(:guide){ load_document("example_images") }
  let(:page) { guide.page_at(0) }

  it "saves as various formats" do
    %w{png jpeg tiff bmp gif}.each do | ext |
      file = Tempfile.new(['test', ".#{ext}"])
      width  = rand(200) + 100
      height = rand(300) + 100
      page.as_image(width: width, height: height).save(file.path)
      assert_size "#{width}x#{height}", file.path
    end
  end


  it "dumps to string" do
    file = Tempfile.new(['test',".jpeg"])
    file.write page.as_image(height: 120).data('jpg')
    file.flush
    assert_size "84x120", file.path
  end

  it "iterates over page images" do
    valid_sizes = [567, 284, 386, 227, 500, 939, 950, 959]
    count = 0
    page.each_image do |img|
      count += 1
      # To speed up spec runs only every 4th image is written out and
      # measured; all images are still counted.
      next unless count % 4 == 0
      assert_kind_of PDFium::Image, img
      file = Tempfile.new(['test',".png"])
      img.save(file.path)
      # Measure the saved file once, by path, and check both dimensions.
      width, height = FastImage.size(file.path)
      assert_includes valid_sizes, width
      assert_includes valid_sizes, height
    end
    assert_equal 26, count, "Incorrect # of images counted"
  end

  it "can return an ImageScience instance" do
    image = page.images.first
    assert image
    ims = image.as_science
    # Suffix carries a leading dot, like every other Tempfile in this
    # spec, so the temp file gets a real ".jpg" extension.
    file = Tempfile.new(['test',".jpg"])
    ims.cropped_thumbnail(100) do |thumb|
      thumb.save file.path
      assert_size "100x100", file.path
    end

  end

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require_relative 'spec_helper'

# Exercises PDFium::PageList: the empty case, iteration and index access.
describe PDFium::PageList do
  let(:guide) { load_document("with_bookmarks") }

  it "can be empty" do
    listing = PDFium::Document.new.pages
    assert_kind_of PDFium::PageList, listing
    assert listing.none?, "A freshly created Document shouldn't have any pages"
  end

  it "can iterate" do
    visited = 0
    guide.pages.each do
      visited += 1
    end
    assert_equal 3, visited
  end

  it "supports access by index" do
    # Page numbers are 1-based while list indexes are 0-based.
    assert_equal 1, guide.pages[0].number
    assert_equal 2, guide.pages[1].number
  end
end
|
data/test/page_spec.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
require_relative 'spec_helper'

# Exercises PDFium::Page: opening and creating pages, dimensions, range
# checking, unloading, and text extraction (ASCII and multi-language).
describe PDFium::Page do
  let(:utfdoc)  { PDFium::Document.new( pdf_path("example_utf8") ) }
  let(:textdoc) { PDFium::Document.new( pdf_path("example_divs") ) }

  let(:page) { PDFium::Page.open(utfdoc,0) }

  it "opens existing page" do
    assert_kind_of PDFium::Page, page
  end

  it "creates a page" do
    created = PDFium::Page.create(utfdoc,0)
    assert_kind_of PDFium::Page, created
  end

  it "adds pages to an existing document" do
    document = PDFium::Document.new
    2.times { PDFium::Page.create(document) }
    assert_equal 2, document.page_count
  end

  it "has dimensions" do
    assert_in_delta 595.28, page.width
    assert_in_delta 841.89, page.height
  end

  it "refuses to open invalid page ranges" do
    assert_raises(RangeError) { PDFium::Page.open(utfdoc,-1) }
    assert_raises(RangeError) { PDFium::Page.open(utfdoc,90) }
  end

  it "creates new pages" do
    document = PDFium::Document.new
    created = PDFium::Page.create(document,0, width:100, height:180)
    assert_equal 100, created.width
    after_saving(document) do |reloaded|
      assert_equal 1, reloaded.page_count
      assert_equal 100.0, reloaded.page_at(0).width
    end
  end

  it "can't be created using new" do
    assert_raises(RuntimeError) do
      PDFium::Page.new
    end
  end

  it "can load/unload page" do
    document = PDFium::Document.new
    10.times { PDFium::Page.create(document).unload }
    assert_equal 10, document.page_count
    second = document.page_at(1)
    assert_equal 612.0, second.width
    # The width is read again after unloading and must be unchanged.
    second.unload
    assert_equal 612.0, second.width
  end

  it "can read text" do
    ascii_text = PDFium::Page.open(textdoc,1).text.encode!("ASCII-8BIT")
    assert_match(/Cras tellus. Fusce aliquet/, ascii_text)
  end

  it "can read utf text" do
    sentences = [
      "Жълтата дюля беше щастлива",
      "Jove xef, porti whisky amb quinze glaçons d'hidrogen",
      "Příliš žluťoučký kůň úpěl ďábelské ódy",
      "Høj bly gom vandt fræk sexquiz på wc",
      "Doch Bep, flink sexy qua vorm, zwijgt",
      "Törkylempijä vongahdus",
      "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
    ]
    raw_text  = PDFium::Page.open(utfdoc,0).text
    utf8_text = raw_text.encode("UTF-8")
    # Each sentence is matched against both the UTF-8 transcoding and the
    # text as returned by the page.
    sentences.each do |sentence|
      assert_match sentence, utf8_text
      assert_match sentence.encode("UTF-16LE"), raw_text
    end
  end
end
|
data/test/pdfium_spec.rb
ADDED
data/test/profile.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Profiling harness: pauses so a profiler can be attached to this process,
# then saves every image found on the first page of the example_images
# fixture as a PNG under /tmp/images.

require_relative '../lib/pdfium'
require 'pathname'
require 'fileutils'

puts "Waiting for profiler attachment (PID: #{Process.pid})\nPress enter to continue"
# Read from standard input explicitly; bare `gets` would read from files
# named on the command line if any were given.
$stdin.gets

path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf').to_s

guide = PDFium::Document.new( path )

# Clear out images from a previous run without shelling out, and make sure
# the output directory exists before anything is saved into it.
image_dir = '/tmp/images'
FileUtils.mkdir_p(image_dir)
FileUtils.rm_f(Dir.glob("#{image_dir}/*"))

page = guide.page_at(0)
page.each_image do |img|
  img.save("#{image_dir}/#{img.index}.png")
end

# count = 0
# pdf.bookmarks.each do | bm |
#     count +=1
#     print count.to_s + " "
#     puts bm.title
# end

# pdf.each_page do | page |
#     page.width
# end

# GC.start(full_mark: true, immediate_sweep: true)
|