kristin 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 65a0699a6c0e9749bac8e60eb9d2ff371349c976
4
- data.tar.gz: c84b34baf200cba7e91f79c27540cd2efd2cbc63
3
+ metadata.gz: 422f4e6c184d84f8cdcf0053cbf10758de9a7b29
4
+ data.tar.gz: 2d2ea0e029670d144066ec395710312bb3963527
5
5
  SHA512:
6
- metadata.gz: ea335a10193a365eba695b94d928fb98a24b4ae578d926cd0f26ffb410cda20b8d90ab4ae191b68e2e1e0dc901292822e19f873c2be17ee588155fd2271349d5
7
- data.tar.gz: b2db188faed3fecdc17be055e3e665a139ccf34881fe636f9a1935720a9db031aba9ba3b26a82c3cf536c7eab4cbfc90ea6311cea5f28ed5c6b8e50d8e9aa079
6
+ metadata.gz: 2b250bdf7abafd32f9a755a5eeec912024deeb1205c6a9876f0ca5a51dacb0d82cc4e078b803171c46d5c3611c9d977d70a4e894d28027a65fc58c81bd3656e9
7
+ data.tar.gz: b8014ba3d553d0e55a0757ace66bc0c546723de4cd1cfa6b11899528bfc38c3746d6b0f7baa48d5574e25189a5aeda3f632414001c64e4029156104cf8283011
data/.gitignore CHANGED
@@ -17,3 +17,5 @@ test/version_tmp
17
17
  tmp
18
18
 
19
19
  .DS_Store
20
+
21
+ TODO
data/README.md CHANGED
@@ -30,6 +30,18 @@ Kristin.convert('document.pdf', 'document.html')
30
30
 
31
31
  # You can also convert a source file directly from a URL
32
32
  Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
33
+
34
+ # You can also specify options for fine-grained conversion:
35
+ Kristin.convert('document.pdf', 'document.html', { first_page: 2, last_page: 4, hdpi: 72, vdpi: 72})
36
+
37
+ # Available options:
38
+
39
+ # process_outline - show outline in HTML. Default: true
40
+ # first_page - first page to convert. Default: 1
41
+ # last_page - last page to convert. Default: 2147483647
42
+ # hdpi - horizontal resolution for graphics in DPI. Default: 144
43
+ # vdpi - vertical resolution for graphics in DPI. Default: 144
44
+
33
45
  ```
34
46
 
35
47
  ## Contributing
@@ -38,4 +50,4 @@ Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
38
50
  2. Create your feature branch (`git checkout -b my-new-feature`)
39
51
  3. Commit your changes (`git commit -am 'Add some feature'`)
40
52
  4. Push to the branch (`git push origin my-new-feature`)
41
- 5. Create new Pull Request
53
+ 5. Create new Pull Request
@@ -20,4 +20,5 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "nokogiri"
23
24
  end
@@ -3,60 +3,84 @@ require 'open-uri'
3
3
  require "net/http"
4
4
 
5
5
  module Kristin
6
- def self.convert(source, target)
7
- raise IOError, "Can't find pdf2htmlex executable in PATH" if not command_available?
8
- src = determine_source(source)
9
- cmd = "#{pdf2htmlex_command} #{src} #{target}"
10
- pid = Process.spawn(cmd, [:out, :err] => "/dev/null")
11
- Process.waitpid(pid)
12
- ## TODO: Grab error message from pdf2htmlex and raise a better error
13
- raise IOError, "Could not convert #{src}" if $?.exitstatus != 0
14
- end
6
+ class Converter
7
+ def initialize(source, target, options = {})
8
+ @options = options
9
+ @source = source
10
+ @target = target
11
+ end
15
12
 
16
- private
13
+ def convert
14
+ raise IOError, "Can't find pdf2htmlex executable in PATH" if not command_available?
15
+ src = determine_source(@source)
16
+ opts = process_options
17
+ cmd = "#{pdf2htmlex_command} #{opts} #{src} #{@target}"
18
+ pid = Process.spawn(cmd, [:out, :err] => "/dev/null")
19
+ Process.waitpid(pid)
20
+
21
+ ## TODO: Grab error message from pdf2htmlex and raise a better error
22
+ raise IOError, "Could not convert #{src}" if $?.exitstatus != 0
23
+ end
17
24
 
18
- def self.command_available?
19
- pdf2htmlex_command
20
- end
25
+ private
21
26
 
22
- def self.pdf2htmlex_command
23
- cmd = nil
24
- cmd = "pdf2htmlex" if which("pdf2htmlex")
25
- cmd = "pdf2htmlEX" if which("pdf2htmlEX")
26
- end
27
+ def process_options
28
+ opts = []
29
+ opts.push("--process-outline 0") if @options[:process_outline] == false
30
+ opts.push("--first-page #{@options[:first_page]}") if @options[:first_page]
31
+ opts.push("--last-page #{@options[:last_page]}") if @options[:last_page]
32
+ opts.push("--hdpi #{@options[:hdpi]}") if @options[:hdpi]
33
+ opts.push("--vdpi #{@options[:vdpi]}") if @options[:vdpi]
34
+
35
+ opts.join(" ")
36
+ end
27
37
 
28
- def self.which(cmd)
29
- exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
30
- ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
31
- exts.each do |ext|
32
- exe = File.join(path, "#{cmd}#{ext}")
33
- return exe if File.executable? exe
34
- end
38
+ def command_available?
39
+ pdf2htmlex_command
35
40
  end
36
-
37
- return nil
38
- end
39
41
 
40
- def self.random_source_name
41
- rand(16**16).to_s(16)
42
- end
42
+ def pdf2htmlex_command
43
+ cmd = nil
44
+ cmd = "pdf2htmlex" if which("pdf2htmlex")
45
+ cmd = "pdf2htmlEX" if which("pdf2htmlEX")
46
+ end
47
+
48
+ def which(cmd)
49
+ exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
50
+ ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
51
+ exts.each do |ext|
52
+ exe = File.join(path, "#{cmd}#{ext}")
53
+ return exe if File.executable? exe
54
+ end
55
+ end
56
+ return nil
57
+ end
58
+
59
+ def random_source_name
60
+ rand(16**16).to_s(16)
61
+ end
43
62
 
44
- def self.download_file(source)
45
- tmp_file = "/tmp/#{random_source_name}.pdf"
46
- File.open(tmp_file, "wb") do |saved_file|
47
- open(source, 'rb') do |read_file|
48
- saved_file.write(read_file.read)
63
+ def download_file(source)
64
+ tmp_file = "/tmp/#{random_source_name}.pdf"
65
+ File.open(tmp_file, "wb") do |saved_file|
66
+ open(source, 'rb') do |read_file|
67
+ saved_file.write(read_file.read)
68
+ end
49
69
  end
70
+
71
+ tmp_file
50
72
  end
51
73
 
52
- tmp_file
74
+ def determine_source(source)
75
+ is_file = File.exists?(source) && !File.directory?(source)
76
+ is_http = (URI(source).scheme == "http" || URI(source).scheme == "https") && Net::HTTP.get_response(URI(source)).is_a?(Net::HTTPSuccess)
77
+ raise IOError, "Source (#{source}) is neither a file nor an URL." unless is_file || is_http
78
+
79
+ is_file ? source : download_file(source)
80
+ end
53
81
  end
54
82
 
55
- def self.determine_source(source)
56
- is_file = File.exists?(source) && !File.directory?(source)
57
- is_http = (URI(source).scheme == "http" || URI(source).scheme == "https") && Net::HTTP.get_response(URI(source)).is_a?(Net::HTTPSuccess)
58
- raise IOError, "Source (#{source}) is neither a file nor an URL." unless is_file || is_http
59
-
60
- is_file ? source : download_file(source)
83
+ def self.convert(source, target, options = {})
84
+ Converter.new(source, target, options).convert
61
85
  end
62
86
  end
@@ -1,3 +1,3 @@
1
1
  module Kristin
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
Binary file
@@ -6,49 +6,125 @@ describe Kristin do
6
6
  @one_page_pdf = file_path("one.pdf")
7
7
  @multi_page_pdf = file_path("multi.pdf")
8
8
  @no_pdf = file_path("image.png")
9
+ @large_pdf = file_path("large.pdf")
9
10
  @target_path = "tmp/kristin"
11
+ end
12
+
13
+ before(:each) do
10
14
  FileUtils.mkdir_p @target_path
11
15
  end
12
16
 
13
- after(:all) do
17
+ after(:each) do
14
18
  FileUtils.rm_rf @target_path
15
19
  end
16
20
 
17
- describe ".convert" do
18
- it "should raise error if source file does not exists" do
19
- lambda { Kristin.convert("nonsense.pdf", "nonsense.html") }.should raise_error(IOError)
20
- end
21
+ describe "#convert" do
22
+ describe "with no options" do
23
+ it "should raise error if source file does not exists" do
24
+ c = Kristin::Converter.new("nonsense.pdf", "nonsense.html")
25
+ lambda { c.convert }.should raise_error(IOError)
26
+ end
21
27
 
22
- it "should convert a one page pdf to one html file" do
23
- target = @target_path + "/one.html"
24
- Kristin.convert(@one_page_pdf, target)
25
- File.exists?(target).should == true
26
- end
28
+ it "should convert a one page pdf to one html file" do
29
+ target = @target_path + "/one.html"
30
+ Kristin::Converter.new(@one_page_pdf, target).convert
31
+ File.exists?(target).should == true
32
+ end
27
33
 
28
- it "should convert a multi page pdf to one html file" do
29
- target = @target_path + "/multi.html"
30
- Kristin.convert(@multi_page_pdf, target)
31
- File.exists?(target).should == true
32
- end
34
+ it "should convert a multi page pdf to one html file" do
35
+ target = @target_path + "/multi.html"
36
+ Kristin::Converter.new(@multi_page_pdf, target).convert
37
+ File.exists?(target).should == true
38
+ end
39
+
40
+ it "should raise error if pdf is not a real pdf" do
41
+ lambda { Kristin::Converter.new(@no_pdf, "nonsense.html").convert }.should raise_error(IOError)
42
+ end
43
+
44
+ it "should convert a pdf from an URL" do
45
+ target = @target_path + "/from_url.html"
46
+ Kristin::Converter.new("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target).convert
47
+ File.exists?(target).should == true
48
+ end
49
+
50
+ it "should raise an error if URL does not exist" do
51
+ target = @target_path + "/from_url.html"
52
+ lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/donotexist.pdf", target).convert }.should raise_error(IOError)
53
+ end
33
54
 
34
- it "should raise error if pdf is not a real pdf" do
35
- lambda { Kristin.convert(@no_pdf, "nonsense.html") }.should raise_error(IOError)
55
+ it "should raise an error if URL file is not a real pdf" do
56
+ target = @target_path + "/from_url.html"
57
+ lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target).convert }.should raise_error(IOError)
58
+ end
36
59
  end
37
60
 
38
- it "should convert a pdf from an URL" do
39
- target = @target_path + "/from_url.html"
40
- Kristin.convert("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target)
41
- File.exists?(target).should == true
61
+ describe "options" do
62
+ #TODO: Only convert file once for performance
63
+
64
+ it "should process outline by default" do
65
+ target = @target_path + "/large.html"
66
+ Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
67
+ doc = Nokogiri::HTML(File.open(target))
68
+ el = doc.css("#pdf-outline").first
69
+ el.children.should_not be_empty
70
+ end
71
+
72
+ it "should be possible to disable outline" do
73
+ target = @target_path + "/large.html"
74
+ Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
75
+ doc = Nokogiri::HTML(File.open(target))
76
+ el = doc.css("#pdf-outline").first
77
+ el.children.first.text.strip.should be_empty
78
+ end
79
+
80
+ it "should be possible to specify first page" do
81
+ target = @target_path + "/multi.html"
82
+ Kristin::Converter.new(@multi_page_pdf, target, { first_page: 2 }).convert
83
+ doc = Nokogiri::HTML(File.open(target))
84
+ # Content only present on page 1
85
+ content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
86
+ # Content only present on page 2
87
+ content_from_page_2 = doc.search("//span").map(&:content).select {|c| c.include? "Generating functions"}
88
+ content_from_page_1.should be_empty
89
+ content_from_page_2.should_not be_empty
90
+ end
91
+
92
+ it "should be possible to specify last page" do
93
+ target = @target_path + "/multi.html"
94
+ Kristin::Converter.new(@multi_page_pdf, target, { last_page: 9 }).convert
95
+ doc = Nokogiri::HTML(File.open(target))
96
+ # Content only present on page 1
97
+ content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
98
+ # Content only present on page 10
99
+ content_from_page_10 = doc.search("//span").map(&:content).select {|c| c.include? "William Blake"}
100
+ content_from_page_1.should_not be_empty
101
+ content_from_page_10.should be_empty
102
+ end
103
+
104
+ it "should be possible to specify hdpi and vdpi" do
105
+ target = @target_path + "/one.html"
106
+ Kristin::Converter.new(@one_page_pdf, target, { hdpi: 1, vdpi: 1 }).convert
107
+ doc = Nokogiri::HTML(File.open(target))
108
+ doc.xpath("//img[@class='bi']/@src").first.content.size.should == 538 # The size you get when hdpi and vdpi is 1 on @one_page_pdf
109
+ end
110
+
111
+ it "should be possible to specify vdpi" do
112
+
113
+ end
42
114
  end
115
+ end
43
116
 
44
- it "should raise an error if URL does not exist" do
45
- target = @target_path + "/from_url.html"
46
- lambda { Kristin.convert("https://www.filepicker.io/api/file/donotexist.pdf", target) }.should raise_error(IOError)
117
+ describe ".convert" do
118
+ it "should convert without options" do
119
+ target = @target_path + "/one.html"
120
+ Kristin.convert(@one_page_pdf, target)
121
+ File.exists?(target).should == true
47
122
  end
48
123
 
49
- it "should raise an error if URL file is not a real pdf" do
50
- target = @target_path + "/from_url.html"
51
- lambda { Kristin.convert("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target) }.should raise_error(IOError)
124
+ it "should convert with options" do
125
+ target = @target_path + "/one.html"
126
+ Kristin.convert(@one_page_pdf, target, { hdpi: 1, vdpi: 1 })
127
+ File.exists?(target).should == true
52
128
  end
53
129
  end
54
130
  end
@@ -1,4 +1,5 @@
1
1
  require 'kristin'
2
+ require 'nokogiri'
2
3
 
3
4
  def file_path( *paths )
4
5
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kristin
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Nyström
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-31 00:00:00.000000000 Z
11
+ date: 2013-04-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: ' Convert PDF docs to beautiful HTML files without losing text or format.
42
56
  This gem uses pdf2htmlEX to do the conversion.'
43
57
  email:
@@ -56,6 +70,7 @@ files:
56
70
  - lib/kristin.rb
57
71
  - lib/kristin/version.rb
58
72
  - spec/fixtures/image.png
73
+ - spec/fixtures/large.pdf
59
74
  - spec/fixtures/multi.pdf
60
75
  - spec/fixtures/one.pdf
61
76
  - spec/kristin_spec.rb
@@ -80,13 +95,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
95
  version: '0'
81
96
  requirements: []
82
97
  rubyforge_project:
83
- rubygems_version: 2.0.0
98
+ rubygems_version: 2.0.3
84
99
  signing_key:
85
100
  specification_version: 4
86
101
  summary: Convert PDF docs to beautiful HTML files without losing text or format. This
87
102
  gem uses pdf2htmlEX to do the conversion.
88
103
  test_files:
89
104
  - spec/fixtures/image.png
105
+ - spec/fixtures/large.pdf
90
106
  - spec/fixtures/multi.pdf
91
107
  - spec/fixtures/one.pdf
92
108
  - spec/kristin_spec.rb