pdfium 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +9 -0
  5. data/Guardfile +7 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +68 -0
  8. data/Rakefile +62 -0
  9. data/ext/pdfium_ext/bookmark.cc +221 -0
  10. data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
  11. data/ext/pdfium_ext/document.cc +268 -0
  12. data/ext/pdfium_ext/document.h +66 -0
  13. data/ext/pdfium_ext/document_wrapper.cc +63 -0
  14. data/ext/pdfium_ext/document_wrapper.h +56 -0
  15. data/ext/pdfium_ext/extconf.h +3 -0
  16. data/ext/pdfium_ext/extconf.rb +76 -0
  17. data/ext/pdfium_ext/image.cc +332 -0
  18. data/ext/pdfium_ext/page.cc +392 -0
  19. data/ext/pdfium_ext/page.h +5 -0
  20. data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
  21. data/ext/pdfium_ext/page_object_wrapper.h +27 -0
  22. data/ext/pdfium_ext/page_wrapper.cc +86 -0
  23. data/ext/pdfium_ext/page_wrapper.h +37 -0
  24. data/ext/pdfium_ext/pdfium.cc +115 -0
  25. data/ext/pdfium_ext/pdfium.h +69 -0
  26. data/lib/pdfium.rb +15 -0
  27. data/lib/pdfium/bookmark_list.rb +28 -0
  28. data/lib/pdfium/bounding_box.rb +16 -0
  29. data/lib/pdfium/image_list.rb +21 -0
  30. data/lib/pdfium/page_list.rb +36 -0
  31. data/lib/pdfium/page_sizes.rb +7 -0
  32. data/lib/pdfium/version.rb +4 -0
  33. data/pdfium.gemspec +29 -0
  34. data/test/benchmark-docsplit.rb +41 -0
  35. data/test/bookmarks_list_spec.rb +26 -0
  36. data/test/bookmarks_spec.rb +34 -0
  37. data/test/debug.rb +24 -0
  38. data/test/document_spec.rb +49 -0
  39. data/test/image_list_spec.rb +18 -0
  40. data/test/image_spec.rb +53 -0
  41. data/test/page_list_spec.rb +24 -0
  42. data/test/page_spec.rb +91 -0
  43. data/test/pdfium_spec.rb +15 -0
  44. data/test/profile.rb +29 -0
  45. data/test/spec_helper.rb +31 -0
  46. metadata +202 -0
@@ -0,0 +1,29 @@
# coding: utf-8
#
# Gem specification for pdfium. The packaged file list comes from
# `git ls-files`, so the spec has to be evaluated inside a git checkout.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'pdfium/version'

Gem::Specification.new do |spec|
  spec.name          = "pdfium"
  spec.version       = PDFium::VERSION
  spec.authors       = ["Nathan Stitt", "Ted Han"]
  spec.email         = ["nathan@stitt.org"]
  spec.summary       = %q{Ruby bindings for Google's PDFium project}
  spec.description   = %q{Ruby bindings for Google's PDFium project. It supports extracting text and images from PDF's as well as rendering pages to bitmaps}
  spec.homepage      = "https://github.com/nathanstitt/pdfium-ruby"
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]
  # Register the native extension's build script so RubyGems compiles the
  # bindings under ext/pdfium_ext when the gem is installed.
  spec.extensions    = ["ext/pdfium_ext/extconf.rb"]

  # Build tooling
  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  # Test and extension-compilation helpers
  spec.add_development_dependency "guard-minitest"
  spec.add_development_dependency "guard-rake"
  spec.add_development_dependency "fastimage"
  spec.add_development_dependency "image_science"
  spec.add_development_dependency "rake-compiler"
end
@@ -0,0 +1,41 @@
#!/usr/bin/env ruby
#
# Benchmark: renders every PDF under test-pdfs/ to GIF images with Docsplit,
# then gathers elapsed time, child-process resource usage and output size
# for the summary printed at the end of the script.

require 'fileutils' # FileUtils.rm_r
require 'tmpdir'    # Dir.mktmpdir
require 'rusage'
require 'docsplit'

IMAGE_SIZES = {
  large:     '1000x',
  normal:    '700x',
  small:     '180x',
  thumbnail: '60x75!'
}.freeze

output_directory = Dir.mktmpdir

begin
  start_time = Time.now
  Dir.glob("test-pdfs/*.pdf").each do |pdf|
    Docsplit.extract_images(pdf, :format => :gif, :size => IMAGE_SIZES.values, :rolling => true, :output => output_directory)
  end
  elapsed_time = Time.now - start_time

  # Process.crusage measures children
  # Process.rusage measures self
  usage = Process.crusage

  rss = usage.maxrss.to_f
  # OSX reports in terms of bytes, bsd & linux use kb. "man getrusage"
  rss = rss / 1024 if RUBY_PLATFORM =~ /darwin/

  # Total size in bytes of everything written below the output directory.
  du = Dir.glob("#{output_directory}/*/**").inject(0.0) { |total, img| total + File.stat(img).size }
ensure
  # Remove the scratch directory even when extraction raises part-way through.
  FileUtils.rm_r output_directory
end
# Prints one line of the benchmark summary to standard output.
#
# @param label [String] name of the measurement, right-justified to 12 columns
# @param value [Numeric] the measurement, printed with three decimal places
# @param specifier [String] optional unit suffix such as "Seconds" or "MB"
# @return [nil]
def report(label, value, specifier = "")
  line = format("%12s: %8.3f %s", label, value, specifier)
  # With no unit the format string leaves a dangling space before the
  # newline; strip it so the output carries no trailing whitespace.
  puts line.rstrip
end
# Summary rows as [label, value, optional unit]. rss is in kilobytes and du in
# bytes at this point, so both are scaled to megabytes for display.
[
  ["Elapsed",    elapsed_time,       "Seconds"],
  ["System CPU", usage.stime],
  ["User CPU",   usage.utime],
  ["Max Memory", (rss / 1024),       "MB"],
  ["Disk Space", (du / 1024 / 1024), "MB"]
].each { |row| report(*row) }
@@ -0,0 +1,26 @@
require_relative 'spec_helper'

# Exercises PDFium::BookmarkList as returned by a document and by a bookmark.
describe PDFium::BookmarkList do
  let(:api) { load_document("with_bookmarks") }
  let(:utf) { load_document("example_utf8") }

  it "can be created" do
    assert_kind_of(PDFium::BookmarkList, api.bookmarks)
  end

  it "can be empty" do
    bookmark = PDFium::Bookmark.new(document: api)
    assert_kind_of(PDFium::BookmarkList, bookmark.children)
    assert(bookmark.children.empty?, "First bookmark shouldn't have any children")
  end

  it "can iterate" do
    # Count by hand so the example exercises #each itself.
    visited = 0
    utf.bookmarks.each { |_bookmark| visited += 1 }
    assert_equal(16, visited)
  end
end
@@ -0,0 +1,34 @@
require_relative 'spec_helper'

# Covers reading a bookmark's title, siblings, children and destination.
describe PDFium::Bookmark do
  let(:api) { load_document("with_bookmarks") }
  # The bookmark every example starts from, built from the document alone.
  let(:first_bookmark) { PDFium::Bookmark.new(document: api) }

  it "can be read" do
    assert_equal(Encoding::UTF_16LE, first_bookmark.title.encoding)
    assert_equal("INDEX", first_bookmark.title.encode!("ASCII-8BIT"))
  end

  it "can create siblings" do
    sibling = first_bookmark.next_sibling
    assert_kind_of(PDFium::Bookmark, sibling)
    assert_equal("Chapter 1", sibling.title.encode!("ASCII-8BIT"))
    refute(sibling.next_sibling, "PDF shouldn't have 3 top level bookmarks")
  end

  it "can create children" do
    kids = first_bookmark.next_sibling.children
    assert_kind_of(PDFium::BookmarkList, kids)
    assert_equal(1, kids.count)
  end

  it "has destinations" do
    second = first_bookmark.next_sibling
    assert_kind_of(Hash, second.destination)
    assert_equal(:destination, second.destination[:type])
    assert_equal(1, second.destination[:page_number])
  end
end
@@ -0,0 +1,24 @@
#!/usr/bin/env ruby
#
# Scratch script for manual debugging: loads a sample PDF into memory and
# prints its size in bytes followed by its page count.

require_relative '../lib/pdfium'
require 'pathname'

path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf')

# guide = PDFium::Document.new( path )
# `rm /tmp/images/*`
# page = guide.page_at(0)

# path.write page.as_image(height: 120).data('jpg')

# A PDF is binary data, so read it in binary mode: the resulting string is
# tagged ASCII-8BIT and its length is the byte count rather than a count of
# characters under the default text encoding.
data = path.binread
puts data.length
pdf = PDFium::Document.from_memory(data)

puts pdf.page_count

# page.each_image do |img|
#   img.save("/tmp/images/#{img.index}.png")
# end
@@ -0,0 +1,49 @@
require_relative 'spec_helper'

# Covers creating, loading, saving and inspecting a PDFium::Document.
describe PDFium::Document do
  let(:guide) { load_document("example_images") }

  it "can create a new empty pdf" do
    assert(PDFium::Document.new)
  end

  it "can be initialized from string" do
    bytes = pdf_path("with_bookmarks").read
    document = PDFium::Document.from_memory(bytes)
    assert(document)
    assert_equal(3, document.page_count)
  end

  it "counts pdf pages" do
    assert_equal(3, guide.page_count)
  end

  it "can save to a file" do
    document = PDFium::Document.new
    PDFium::Page.create(document, 0)
    Tempfile.open(['test', '.pdf']) do |tmp|
      document.save(tmp.path)
      # Load the saved file back to confirm the page was written.
      round_tripped = PDFium::Document.new(tmp.path)
      assert_equal(1, round_tripped.page_count)
    end
  end

  it "returns pages" do
    assert(guide)
    assert_kind_of(PDFium::PageList, guide.pages)
  end

  it "can read metadata" do
    producer = guide.metadata[:producer]
    assert_equal("mPDF 5.1", producer.encode!("ASCII-8BIT"))
  end

  it "can write metadata" do
    guide.metadata { |md| md[:author] = "My Little Writer" }
    after_saving(guide) do |saved|
      author = saved.metadata[:author]
      assert_equal("My Little Writer", author.encode!("ASCII-8BIT"))
    end
  end
end
@@ -0,0 +1,18 @@
require_relative 'spec_helper'

# Covers the list of images exposed by a page.
describe PDFium::ImageList do
  let(:image_doc)  { load_document("example_images").page_at(0) }
  let(:blank_page) { load_document("example_utf8").page_at(0) }

  it "can be empty" do
    assert(blank_page.images.none?, "images found where there should not be")
  end

  it "can iterate" do
    seen = 0
    image_doc.images.each do |_image|
      seen += 1
    end
    assert_equal(26, seen)
  end
end
@@ -0,0 +1,53 @@
require_relative 'spec_helper'
require 'image_science'

# Covers rendering a page to an image and extracting the images embedded in
# a page.
describe PDFium::Image do
  let(:guide) { load_document("example_images") }
  let(:page)  { guide.page_at(0) }

  it "saves as various formats" do
    %w{png jpeg tiff bmp gif}.each do |ext|
      file = Tempfile.new(['test', ".#{ext}"])
      # Random target size: width in 100...300, height in 100...400.
      width  = rand(200) + 100
      height = rand(300) + 100
      page.as_image(width: width, height: height).save(file.path)
      assert_size "#{width}x#{height}", file.path
    end
  end

  it "dumps to string" do
    file = Tempfile.new(['test', ".jpeg"])
    # Image data is binary; make sure no newline/encoding translation is
    # applied when it is written out.
    file.binmode
    file.write page.as_image(height: 120).data('jpg')
    file.flush
    assert_size "84x120", file.path
  end

  it "iterates over page images" do
    valid_sizes = [567, 284, 386, 227, 500, 939, 950, 959]
    count = 0
    page.each_image do |img|
      count += 1
      next if count % 4 == 0 # to speed up spec runs, skip checking every fourth image
      assert_kind_of PDFium::Image, img
      file = Tempfile.new(['test', ".png"])
      img.save(file.path)
      # Measure once and check both dimensions from the same result.
      width, height = FastImage.size(file)
      assert_includes valid_sizes, width
      assert_includes valid_sizes, height
    end
    assert_equal 26, count, "Incorrect # of images counted"
  end

  it "can return an ImageScience instance" do
    image = page.images.first
    assert image
    ims = image.as_science
    # The suffix must include the dot, as in the other examples; without it
    # the temp file name has no extension at all.
    file = Tempfile.new(['test', ".jpg"])
    ims.cropped_thumbnail(100) do |thumb|
      thumb.save file.path
      assert_size "100x100", file.path
    end
  end
end
@@ -0,0 +1,24 @@
require_relative 'spec_helper'

# Covers PDFium::PageList: emptiness, iteration and access by index.
describe PDFium::PageList do
  let(:guide) { load_document("with_bookmarks") }

  it "can be empty" do
    blank_document = PDFium::Document.new
    page_list = blank_document.pages
    assert_kind_of(PDFium::PageList, page_list)
    assert(page_list.none?, "A freshly created Document shouldn't have any pages")
  end

  it "can iterate" do
    tally = 0
    guide.pages.each do
      tally += 1
    end
    assert_equal(3, tally)
  end

  it "supports access by index" do
    # Pairs of [zero-based index, expected page number].
    [[0, 1], [1, 2]].each do |index, expected_number|
      assert_equal(expected_number, guide.pages[index].number)
    end
  end
end
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
require_relative 'spec_helper'

# Covers opening, creating, measuring, unloading and reading text from a
# PDFium::Page.
describe PDFium::Page do
  let(:utfdoc)  { PDFium::Document.new( pdf_path("example_utf8") ) }
  let(:textdoc) { PDFium::Document.new( pdf_path("example_divs") ) }

  let(:page) { PDFium::Page.open(utfdoc,0) }

  it "opens existing page" do
    assert_kind_of PDFium::Page, page
  end

  it "creates a page" do
    page = PDFium::Page.create(utfdoc,0)
    assert_kind_of PDFium::Page, page
  end

  it "adds pages to an existing document" do
    pdf = PDFium::Document.new
    PDFium::Page.create(pdf)
    PDFium::Page.create(pdf)
    assert_equal 2, pdf.page_count
  end

  it "has dimensions" do
    assert_in_delta 595.28, page.width
    assert_in_delta 841.89, page.height
  end

  it "refuses to open invalid page ranges" do
    assert_raises(RangeError) do
      PDFium::Page.open(utfdoc,-1)
    end
    assert_raises(RangeError) do
      PDFium::Page.open(utfdoc,90)
    end
  end

  it "creates new pages" do
    pdf = PDFium::Document.new
    page = PDFium::Page.create(pdf,0, width:100, height:180)
    assert_equal 100, page.width
    after_saving(pdf) do | saved |
      assert_equal 1, saved.page_count
      assert_equal 100.0, saved.page_at(0).width
    end
  end

  it "can't be created using new" do
    assert_raises(RuntimeError){ PDFium::Page.new }
  end

  it "can load/unload page" do
    pdf = PDFium::Document.new
    10.times do
      page = PDFium::Page.create(pdf)
      page.unload
    end
    assert_equal 10, pdf.page_count
    page = pdf.page_at(1)
    assert_equal 612.0, page.width
    # The width must still be readable after the page has been unloaded.
    page.unload
    assert_equal 612.0, page.width
  end

  it "can read text" do
    ascii_text = PDFium::Page.open(textdoc,1).text.encode!("ASCII-8BIT")
    # Parenthesised so Ruby does not warn about an ambiguous first argument
    # (a bare `/` after a method name can also be read as division).
    assert_match(/Cras tellus. Fusce aliquet/, ascii_text)
  end

  it "can read utf text" do
    strings = [
      "Жълтата дюля беше щастлива",
      "Jove xef, porti whisky amb quinze glaçons d'hidrogen",
      "Příliš žluťoučký kůň úpěl ďábelské ódy",
      "Høj bly gom vandt fræk sexquiz på wc",
      "Doch Bep, flink sexy qua vorm, zwijgt",
      "Törkylempijä vongahdus",
      "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
    ]
    text = PDFium::Page.open(utfdoc,0).text
    utf8 = text.encode("UTF-8")
    # Each sentence is checked against both the UTF-8 transcoding and the
    # text exactly as the page returned it.
    strings.each do | sentence |
      assert_match sentence, utf8
      assert_match sentence.encode("UTF-16LE"), text
    end
  end
end
@@ -0,0 +1,15 @@
require_relative 'spec_helper'
require 'tempfile'

# Smoke test: the PDFium namespace exposes the classes the other specs rely on.
describe PDFium do
  it "creates classes" do
    [
      PDFium::Document,
      PDFium::Page,
      PDFium::Bookmark,
      PDFium::BookmarkList
    ].each { |klass| assert(klass) }
  end
end
@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
#
# Profiling harness: pauses so a profiler can be attached to this process,
# then saves every image on the first page of a sample PDF to /tmp/images.

require_relative '../lib/pdfium'
require 'fileutils'
require 'pathname'

puts "Waiting for profiler attachment (PID: #{Process.pid})\nPress enter to continue"
# Read from stdin explicitly; a bare `gets` would read from any file named on
# the command line instead of waiting for the keyboard.
$stdin.gets

path = Pathname.new(__FILE__).dirname.join('pdfs','example_images.pdf').to_s

guide = PDFium::Document.new( path )

# Start from an empty output directory. Creating it first means the script
# also works when /tmp/images does not exist yet, and FileUtils avoids
# shelling out to `rm`.
output_dir = '/tmp/images'
FileUtils.mkdir_p(output_dir)
FileUtils.rm_f(Dir.glob(File.join(output_dir, '*')))

page = guide.page_at(0)
page.each_image do |img|
  img.save(File.join(output_dir, "#{img.index}.png"))
end

# count = 0
# pdf.bookmarks.each do | bm |
#   count +=1
#   print count.to_s + " "
#   puts bm.title
# end

# pdf.each_page do | page |
#   page.width
# end

# GC.start(full_mark: true, immediate_sweep: true)