pdfium 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +9 -0
  5. data/Guardfile +7 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +68 -0
  8. data/Rakefile +62 -0
  9. data/ext/pdfium_ext/bookmark.cc +221 -0
  10. data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
  11. data/ext/pdfium_ext/document.cc +268 -0
  12. data/ext/pdfium_ext/document.h +66 -0
  13. data/ext/pdfium_ext/document_wrapper.cc +63 -0
  14. data/ext/pdfium_ext/document_wrapper.h +56 -0
  15. data/ext/pdfium_ext/extconf.h +3 -0
  16. data/ext/pdfium_ext/extconf.rb +76 -0
  17. data/ext/pdfium_ext/image.cc +332 -0
  18. data/ext/pdfium_ext/page.cc +392 -0
  19. data/ext/pdfium_ext/page.h +5 -0
  20. data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
  21. data/ext/pdfium_ext/page_object_wrapper.h +27 -0
  22. data/ext/pdfium_ext/page_wrapper.cc +86 -0
  23. data/ext/pdfium_ext/page_wrapper.h +37 -0
  24. data/ext/pdfium_ext/pdfium.cc +115 -0
  25. data/ext/pdfium_ext/pdfium.h +69 -0
  26. data/lib/pdfium.rb +15 -0
  27. data/lib/pdfium/bookmark_list.rb +28 -0
  28. data/lib/pdfium/bounding_box.rb +16 -0
  29. data/lib/pdfium/image_list.rb +21 -0
  30. data/lib/pdfium/page_list.rb +36 -0
  31. data/lib/pdfium/page_sizes.rb +7 -0
  32. data/lib/pdfium/version.rb +4 -0
  33. data/pdfium.gemspec +29 -0
  34. data/test/benchmark-docsplit.rb +41 -0
  35. data/test/bookmarks_list_spec.rb +26 -0
  36. data/test/bookmarks_spec.rb +34 -0
  37. data/test/debug.rb +24 -0
  38. data/test/document_spec.rb +49 -0
  39. data/test/image_list_spec.rb +18 -0
  40. data/test/image_spec.rb +53 -0
  41. data/test/page_list_spec.rb +24 -0
  42. data/test/page_spec.rb +91 -0
  43. data/test/pdfium_spec.rb +15 -0
  44. data/test/profile.rb +29 -0
  45. data/test/spec_helper.rb +31 -0
  46. metadata +202 -0
@@ -0,0 +1,29 @@
# coding: utf-8
# Gem specification for pdfium, Ruby bindings for Google's PDFium project.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'pdfium/version'

Gem::Specification.new do |spec|
  spec.name          = "pdfium"
  spec.version       = PDFium::VERSION
  spec.authors       = ["Nathan Stitt", "Ted Han"]
  spec.email         = ["nathan@stitt.org"]
  spec.summary       = %q{Ruby bindings for Google's PDFium project}
  spec.description   = %q{Ruby bindings for Google's PDFium project. It supports extracting text and images from PDF's as well as rendering pages to bitmaps}
  spec.homepage      = "https://github.com/nathanstitt/pdfium-ruby"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # picked out of that list by their directory prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # The bindings live in a native extension. Declaring its extconf.rb lets
  # RubyGems compile it when the gem is installed.
  spec.extensions    = ["ext/pdfium_ext/extconf.rb"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_development_dependency "guard-minitest"
  spec.add_development_dependency "guard-rake"
  spec.add_development_dependency "fastimage"
  spec.add_development_dependency "image_science"
  spec.add_development_dependency "rake-compiler"
end
@@ -0,0 +1,41 @@
#!/usr/bin/env ruby

# Benchmark: converts every PDF under test-pdfs/ to GIFs with Docsplit and
# gathers elapsed time, child-process resource usage and the size of the
# generated files. The results are left in the locals elapsed_time, usage,
# rss and du for the reporting code that follows.

require 'fileutils'
require 'tmpdir' # provides Dir.mktmpdir
require 'rusage'
require 'docsplit'


IMAGE_SIZES = {
  large:     '1000x',
  normal:    '700x',
  small:     '180x',
  thumbnail: '60x75!'
}.freeze

output_directory = Dir.mktmpdir

begin
  # A monotonic clock is unaffected by wall-clock adjustments during the run.
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  Dir.glob("test-pdfs/*.pdf").each do |pdf|
    Docsplit.extract_images(pdf, :format => :gif, :size => IMAGE_SIZES.values, :rolling => true, :output => output_directory)
  end
  elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time

  # Process.crusage measures children
  # Process.rusage measures self
  usage = Process.crusage

  rss = usage.maxrss.to_f
  # OSX reports in terms of bytes, bsd & linux use kb. "man getrusage"
  rss = rss / 1024 if RUBY_PLATFORM =~ /darwin/

  # Total size in bytes of everything Docsplit wrote, measured before cleanup.
  du = Dir.glob("#{output_directory}/*/**").inject(0.0) { |total, img| total + File.stat(img).size }
ensure
  # Remove the scratch directory even when extraction raises.
  FileUtils.rm_r output_directory
end
32
+
# Prints one line of the benchmark summary to standard output.
#
# label     - name of the measurement, right-aligned in a 12 character column
# value     - numeric measurement, printed with three decimal places
# specifier - optional unit appended after the value
#
# When no unit is given the line ends directly after the number instead of
# carrying a dangling trailing space.
def report(label, value, specifier = "")
  line = format("%12s: %8.3f %s", label, value, specifier)
  puts line.rstrip
end
36
+
# Benchmark summary: each row is the argument list for one report call
# (label, value and, where given, a unit).
[
  ["Elapsed", elapsed_time, "Seconds"],
  ["System CPU", usage.stime],
  ["User CPU", usage.utime],
  ["Max Memory", (rss / 1024), "MB"],
  ["Disk Space", (du / 1024 / 1024), "MB"]
].each { |row| report(*row) }
@@ -0,0 +1,26 @@
require_relative 'spec_helper'

# Specs for PDFium::BookmarkList, the collection type returned here by
# Document#bookmarks and Bookmark#children.
describe PDFium::BookmarkList do
  let(:api) { load_document("with_bookmarks") }
  let(:utf) { load_document("example_utf8") }

  it "can be created" do
    assert_kind_of PDFium::BookmarkList, api.bookmarks
  end

  it "can be empty" do
    first_bookmark = PDFium::Bookmark.new(document: api)
    assert_kind_of PDFium::BookmarkList, first_bookmark.children
    assert first_bookmark.children.empty?, "First bookmark shouldn't have any children"
  end

  it "can iterate" do
    visited = 0
    utf.bookmarks.each { |_bookmark| visited += 1 }
    assert_equal 16, visited
  end
end
@@ -0,0 +1,34 @@
require_relative 'spec_helper'

# Specs for PDFium::Bookmark: titles, sibling/child navigation and
# destinations, all exercised against the "with_bookmarks" fixture.
describe PDFium::Bookmark do
  let(:api) { load_document("with_bookmarks") }
  # Bookmark built straight from the document; every example starts here.
  let(:first_bookmark) { PDFium::Bookmark.new(document: api) }

  it "can be read" do
    assert_equal Encoding::UTF_16LE, first_bookmark.title.encoding
    assert_equal "INDEX", first_bookmark.title.encode!("ASCII-8BIT")
  end

  it "can create siblings" do
    second = first_bookmark.next_sibling
    assert_kind_of PDFium::Bookmark, second
    assert_equal "Chapter 1", second.title.encode!("ASCII-8BIT")
    refute second.next_sibling, "PDF shouldn't have 3 top level bookmarks"
  end

  it "can create children" do
    children = first_bookmark.next_sibling.children
    assert_kind_of PDFium::BookmarkList, children
    assert_equal 1, children.count
  end

  it "has destinations" do
    second = first_bookmark.next_sibling
    assert_kind_of Hash, second.destination
    assert_equal :destination, second.destination[:type]
    assert_equal 1, second.destination[:page_number]
  end
end
@@ -0,0 +1,24 @@
#!/usr/bin/env ruby

# Scratch script for exercising the bindings by hand: loads a sample PDF
# into memory, then prints its length and page count.

require_relative '../lib/pdfium'
require 'pathname'


path = Pathname.new(__FILE__).dirname.join('pdfs', 'example_images.pdf')

# guide = PDFium::Document.new( path )
# `rm /tmp/images/*`
# page = guide.page_at(0)


# path.write page.as_image(height: 120).data('jpg')

# A PDF is binary data: binread skips newline and encoding translation, so
# the string handed to from_memory holds the bytes exactly as stored on disk
# and its length is the byte count.
data = path.binread
puts data.length
pdf = PDFium::Document.from_memory(data)

puts pdf.page_count

# page.each_image do |img|
#   img.save("/tmp/images/#{img.index}.png")
# end
@@ -0,0 +1,49 @@
require_relative 'spec_helper'

# Specs for PDFium::Document: construction, page counting, saving and
# metadata access.
describe PDFium::Document do
  let(:guide) { load_document("example_images") }

  it "can create a new empty pdf" do
    assert PDFium::Document.new
  end

  it "can be initialized from string" do
    raw = pdf_path("with_bookmarks").read
    from_string = PDFium::Document.from_memory(raw)
    assert from_string
    assert_equal 3, from_string.page_count
  end

  it "counts pdf pages" do
    assert_equal 3, guide.page_count
  end

  it "can save to a file" do
    blank = PDFium::Document.new
    PDFium::Page.create(blank, 0)
    Tempfile.open(['test', '.pdf']) do |tmp|
      blank.save(tmp.path)
      # Re-open the saved file to prove the page was written out.
      assert_equal 1, PDFium::Document.new(tmp.path).page_count
    end
  end

  it "returns pages" do
    assert guide
    assert_kind_of PDFium::PageList, guide.pages
  end

  it "can read metadata" do
    producer = guide.metadata[:producer]
    assert_equal "mPDF 5.1", producer.encode!("ASCII-8BIT")
  end

  it "can write metadata" do
    guide.metadata { |md| md[:author] = "My Little Writer" }
    after_saving(guide) do |saved|
      author = saved.metadata[:author]
      assert_equal "My Little Writer", author.encode!("ASCII-8BIT")
    end
  end
end
@@ -0,0 +1,18 @@
require_relative 'spec_helper'

# Specs for PDFium::ImageList, the collection returned by a page's #images.
describe PDFium::ImageList do
  let(:image_doc)  { load_document("example_images").page_at(0) }
  let(:blank_page) { load_document("example_utf8").page_at(0) }

  it "can be empty" do
    assert blank_page.images.none?, "images found where there should not be"
  end

  it "can iterate" do
    seen = 0
    image_doc.images.each do |_image|
      seen += 1
    end
    assert_equal 26, seen
  end
end
@@ -0,0 +1,53 @@
require_relative 'spec_helper'
require 'image_science'

# Specs for PDFium::Image: rendering pages to image files or strings and
# extracting the images embedded in a page.
describe PDFium::Image do
  let(:guide) { load_document("example_images") }
  let(:page)  { guide.page_at(0) }

  it "saves as various formats" do
    %w{png jpeg tiff bmp gif}.each do |ext|
      file   = Tempfile.new(['test', ".#{ext}"])
      # Random dimensions check that the requested size is honoured.
      width  = rand(200) + 100
      height = rand(300) + 100
      page.as_image(width: width, height: height).save(file.path)
      assert_size "#{width}x#{height}", file.path
    end
  end

  it "dumps to string" do
    file = Tempfile.new(['test', ".jpeg"])
    # Image data is binary; keep newline/encoding translation out of the write.
    file.binmode
    file.write page.as_image(height: 120).data('jpg')
    file.flush
    assert_size "84x120", file.path
  end

  it "iterates over page images" do
    valid_sizes = [567, 284, 386, 227, 500, 939, 950, 959]
    count = 0
    page.each_image do |img|
      count += 1
      # To speed up spec runs every fourth image is counted but not saved.
      next if count % 4 == 0
      assert_kind_of PDFium::Image, img
      file = Tempfile.new(['test', ".png"])
      img.save(file.path)
      width, height = FastImage.size(file)
      assert_includes valid_sizes, width
      assert_includes valid_sizes, height
    end
    assert_equal 26, count, "Incorrect # of images counted"
  end

  it "can return an ImageScience instance" do
    image = page.images.first
    assert image
    ims = image.as_science
    # Tempfile appends the suffix verbatim, so the dot has to be part of it
    # for the file to end in ".jpg" like the other temp files in this spec.
    file = Tempfile.new(['test', ".jpg"])
    ims.cropped_thumbnail(100) do |thumb|
      thumb.save file.path
      assert_size "100x100", file.path
    end
  end
end
@@ -0,0 +1,24 @@
require_relative 'spec_helper'

# Specs for PDFium::PageList, the collection returned by Document#pages.
describe PDFium::PageList do
  let(:guide) { load_document("with_bookmarks") }

  it "can be empty" do
    pages = PDFium::Document.new.pages
    assert_kind_of PDFium::PageList, pages
    assert pages.none?, "A freshly created Document shouldn't have any pages"
  end

  it "can iterate" do
    total = 0
    guide.pages.each do
      total += 1
    end
    assert_equal 3, total
  end

  it "supports access by index" do
    # Index is zero-based while the reported page number is one-based.
    [[0, 1], [1, 2]].each do |index, expected_number|
      assert_equal expected_number, guide.pages[index].number
    end
  end
end
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
require_relative 'spec_helper'

# Specs for PDFium::Page: opening and creating pages, dimensions,
# load/unload behaviour and text extraction.
describe PDFium::Page do
  let(:utfdoc)  { PDFium::Document.new( pdf_path("example_utf8") ) }
  let(:textdoc) { PDFium::Document.new( pdf_path("example_divs") ) }

  let(:page) { PDFium::Page.open(utfdoc, 0) }

  it "opens existing page" do
    assert_kind_of PDFium::Page, page
  end

  it "creates a page" do
    created = PDFium::Page.create(utfdoc, 0)
    assert_kind_of PDFium::Page, created
  end

  it "adds pages to an existing document" do
    pdf = PDFium::Document.new
    PDFium::Page.create(pdf)
    PDFium::Page.create(pdf)
    assert_equal 2, pdf.page_count
  end

  it "has dimensions" do
    assert_in_delta 595.28, page.width
    assert_in_delta 841.89, page.height
  end

  it "refuses to open invalid page ranges" do
    assert_raises(RangeError) do
      PDFium::Page.open(utfdoc, -1)
    end
    assert_raises(RangeError) do
      PDFium::Page.open(utfdoc, 90)
    end
  end

  it "creates new pages" do
    pdf = PDFium::Document.new
    created = PDFium::Page.create(pdf, 0, width: 100, height: 180)
    assert_equal 100, created.width
    after_saving(pdf) do |saved|
      assert_equal 1, saved.page_count
      assert_equal 100.0, saved.page_at(0).width
    end
  end

  it "can't be created using new" do
    assert_raises(RuntimeError) { PDFium::Page.new }
  end

  it "can load/unload page" do
    pdf = PDFium::Document.new
    10.times do
      PDFium::Page.create(pdf).unload
    end
    assert_equal 10, pdf.page_count
    reopened = pdf.page_at(1)
    assert_equal 612.0, reopened.width
    # Width must still be readable after the page has been unloaded.
    reopened.unload
    assert_equal 612.0, reopened.width
  end

  it "can read text" do
    ascii_text = PDFium::Page.open(textdoc, 1).text.encode!("ASCII-8BIT")
    # Parentheses avoid Ruby's "ambiguous first argument" warning for a
    # leading regexp literal; the period is escaped so it only matches a
    # literal full stop.
    assert_match(/Cras tellus\. Fusce aliquet/, ascii_text)
  end

  it "can read utf text" do
    strings = [
      "Жълтата дюля беше щастлива",
      "Jove xef, porti whisky amb quinze glaçons d'hidrogen",
      "Příliš žluťoučký kůň úpěl ďábelské ódy",
      "Høj bly gom vandt fræk sexquiz på wc",
      "Doch Bep, flink sexy qua vorm, zwijgt",
      "Törkylempijä vongahdus",
      "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
    ]
    text = PDFium::Page.open(utfdoc, 0).text
    utf8 = text.encode("UTF-8")
    strings.each do |sentence|
      # Check both the transcoded copy and the string as returned by #text.
      assert_match sentence, utf8
      assert_match sentence.encode("UTF-16LE"), text
    end
  end
end
@@ -0,0 +1,15 @@
require_relative 'spec_helper'
require 'tempfile'

# Smoke test: loading the library must define the core PDFium classes.
describe PDFium do
  it "creates classes" do
    [
      PDFium::Document,
      PDFium::Page,
      PDFium::Bookmark,
      PDFium::BookmarkList
    ].each do |klass|
      assert klass
    end
  end
end
@@ -0,0 +1,29 @@
#!/usr/bin/env ruby

# Profiling harness: waits so a profiler can be attached to this process,
# then saves every image found on the first page of a sample PDF as a PNG.

require_relative '../lib/pdfium'
require 'fileutils'
require 'pathname'

puts "Waiting for profiler attachment (PID: #{Process.pid})\nPress enter to continue"
# Read from stdin explicitly; a bare gets would read from files named in ARGV.
$stdin.gets

path = Pathname.new(__FILE__).dirname.join('pdfs', 'example_images.pdf').to_s

guide = PDFium::Document.new( path )

# Start from an empty output directory. Creating it first means the saves
# below do not depend on an earlier run having made it, and clearing it with
# FileUtils avoids shelling out to rm.
image_dir = '/tmp/images'
FileUtils.mkdir_p(image_dir)
FileUtils.rm_f(Dir.glob(File.join(image_dir, '*')))

page = guide.page_at(0)
page.each_image do |img|
  img.save(File.join(image_dir, "#{img.index}.png"))
end

# count = 0
# pdf.bookmarks.each do | bm |
#   count +=1
#   print count.to_s + " "
#   puts bm.title
# end

# pdf.each_page do | page |
#   page.width
# end

# GC.start(full_mark: true, immediate_sweep: true)