kristin 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 65a0699a6c0e9749bac8e60eb9d2ff371349c976
4
- data.tar.gz: c84b34baf200cba7e91f79c27540cd2efd2cbc63
3
+ metadata.gz: 422f4e6c184d84f8cdcf0053cbf10758de9a7b29
4
+ data.tar.gz: 2d2ea0e029670d144066ec395710312bb3963527
5
5
  SHA512:
6
- metadata.gz: ea335a10193a365eba695b94d928fb98a24b4ae578d926cd0f26ffb410cda20b8d90ab4ae191b68e2e1e0dc901292822e19f873c2be17ee588155fd2271349d5
7
- data.tar.gz: b2db188faed3fecdc17be055e3e665a139ccf34881fe636f9a1935720a9db031aba9ba3b26a82c3cf536c7eab4cbfc90ea6311cea5f28ed5c6b8e50d8e9aa079
6
+ metadata.gz: 2b250bdf7abafd32f9a755a5eeec912024deeb1205c6a9876f0ca5a51dacb0d82cc4e078b803171c46d5c3611c9d977d70a4e894d28027a65fc58c81bd3656e9
7
+ data.tar.gz: b8014ba3d553d0e55a0757ace66bc0c546723de4cd1cfa6b11899528bfc38c3746d6b0f7baa48d5574e25189a5aeda3f632414001c64e4029156104cf8283011
data/.gitignore CHANGED
@@ -17,3 +17,5 @@ test/version_tmp
17
17
  tmp
18
18
 
19
19
  .DS_Store
20
+
21
+ TODO
data/README.md CHANGED
@@ -30,6 +30,18 @@ Kristin.convert('document.pdf', 'document.html')
30
30
 
31
31
  # You can also convert a source file directly from a URL
32
32
  Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
33
+
34
+ # You can also specify options for fine-grained conversion:
35
+ Kristin.convert('document.pdf', 'document.html', { first_page: 2, last_page: 4, hdpi: 72, vdpi: 72})
36
+
37
+ # Available options:
38
+
39
+ # process_outline - show outline in HTML. Default: true
40
+ # first_page - first page to convert. Default: 1
41
+ # last_page - last page to convert. Default: 2147483647
42
+ # hdpi - horizontal resolution for graphics in DPI. Default: 144
43
+ # vdpi - vertical resolution for graphics in DPI. Default: 144
44
+
33
45
  ```
34
46
 
35
47
  ## Contributing
@@ -38,4 +50,4 @@ Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
38
50
  2. Create your feature branch (`git checkout -b my-new-feature`)
39
51
  3. Commit your changes (`git commit -am 'Add some feature'`)
40
52
  4. Push to the branch (`git push origin my-new-feature`)
41
- 5. Create new Pull Request
53
+ 5. Create new Pull Request
@@ -20,4 +20,5 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "nokogiri"
23
24
  end
@@ -3,60 +3,84 @@ require 'open-uri'
3
3
  require "net/http"
4
4
 
5
5
  module Kristin
6
- def self.convert(source, target)
7
- raise IOError, "Can't find pdf2htmlex executable in PATH" if not command_available?
8
- src = determine_source(source)
9
- cmd = "#{pdf2htmlex_command} #{src} #{target}"
10
- pid = Process.spawn(cmd, [:out, :err] => "/dev/null")
11
- Process.waitpid(pid)
12
- ## TODO: Grab error message from pdf2htmlex and raise a better error
13
- raise IOError, "Could not convert #{src}" if $?.exitstatus != 0
14
- end
6
+ class Converter
7
+ def initialize(source, target, options = {})
8
+ @options = options
9
+ @source = source
10
+ @target = target
11
+ end
15
12
 
16
- private
13
+ def convert
14
+ raise IOError, "Can't find pdf2htmlex executable in PATH" if not command_available?
15
+ src = determine_source(@source)
16
+ opts = process_options
17
+ cmd = "#{pdf2htmlex_command} #{opts} #{src} #{@target}"
18
+ pid = Process.spawn(cmd, [:out, :err] => "/dev/null")
19
+ Process.waitpid(pid)
20
+
21
+ ## TODO: Grab error message from pdf2htmlex and raise a better error
22
+ raise IOError, "Could not convert #{src}" if $?.exitstatus != 0
23
+ end
17
24
 
18
- def self.command_available?
19
- pdf2htmlex_command
20
- end
25
+ private
21
26
 
22
- def self.pdf2htmlex_command
23
- cmd = nil
24
- cmd = "pdf2htmlex" if which("pdf2htmlex")
25
- cmd = "pdf2htmlEX" if which("pdf2htmlEX")
26
- end
27
+ def process_options
28
+ opts = []
29
+ opts.push("--process-outline 0") if @options[:process_outline] == false
30
+ opts.push("--first-page #{@options[:first_page]}") if @options[:first_page]
31
+ opts.push("--last-page #{@options[:last_page]}") if @options[:last_page]
32
+ opts.push("--hdpi #{@options[:hdpi]}") if @options[:hdpi]
33
+ opts.push("--vdpi #{@options[:vdpi]}") if @options[:vdpi]
34
+
35
+ opts.join(" ")
36
+ end
27
37
 
28
- def self.which(cmd)
29
- exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
30
- ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
31
- exts.each do |ext|
32
- exe = File.join(path, "#{cmd}#{ext}")
33
- return exe if File.executable? exe
34
- end
38
+ def command_available?
39
+ pdf2htmlex_command
35
40
  end
36
-
37
- return nil
38
- end
39
41
 
40
- def self.random_source_name
41
- rand(16**16).to_s(16)
42
- end
42
+ def pdf2htmlex_command
43
+ cmd = nil
44
+ cmd = "pdf2htmlex" if which("pdf2htmlex")
45
+ cmd = "pdf2htmlEX" if which("pdf2htmlEX")
46
+ end
47
+
48
+ def which(cmd)
49
+ exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
50
+ ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
51
+ exts.each do |ext|
52
+ exe = File.join(path, "#{cmd}#{ext}")
53
+ return exe if File.executable? exe
54
+ end
55
+ end
56
+ return nil
57
+ end
58
+
59
+ def random_source_name
60
+ rand(16**16).to_s(16)
61
+ end
43
62
 
44
- def self.download_file(source)
45
- tmp_file = "/tmp/#{random_source_name}.pdf"
46
- File.open(tmp_file, "wb") do |saved_file|
47
- open(source, 'rb') do |read_file|
48
- saved_file.write(read_file.read)
63
+ def download_file(source)
64
+ tmp_file = "/tmp/#{random_source_name}.pdf"
65
+ File.open(tmp_file, "wb") do |saved_file|
66
+ open(source, 'rb') do |read_file|
67
+ saved_file.write(read_file.read)
68
+ end
49
69
  end
70
+
71
+ tmp_file
50
72
  end
51
73
 
52
- tmp_file
74
+ def determine_source(source)
75
+ is_file = File.exists?(source) && !File.directory?(source)
76
+ is_http = (URI(source).scheme == "http" || URI(source).scheme == "https") && Net::HTTP.get_response(URI(source)).is_a?(Net::HTTPSuccess)
77
+ raise IOError, "Source (#{source}) is neither a file nor an URL." unless is_file || is_http
78
+
79
+ is_file ? source : download_file(source)
80
+ end
53
81
  end
54
82
 
55
- def self.determine_source(source)
56
- is_file = File.exists?(source) && !File.directory?(source)
57
- is_http = (URI(source).scheme == "http" || URI(source).scheme == "https") && Net::HTTP.get_response(URI(source)).is_a?(Net::HTTPSuccess)
58
- raise IOError, "Source (#{source}) is neither a file nor an URL." unless is_file || is_http
59
-
60
- is_file ? source : download_file(source)
83
+ def self.convert(source, target, options = {})
84
+ Converter.new(source, target, options).convert
61
85
  end
62
86
  end
@@ -1,3 +1,3 @@
1
1
  module Kristin
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
Binary file
@@ -6,49 +6,125 @@ describe Kristin do
6
6
  @one_page_pdf = file_path("one.pdf")
7
7
  @multi_page_pdf = file_path("multi.pdf")
8
8
  @no_pdf = file_path("image.png")
9
+ @large_pdf = file_path("large.pdf")
9
10
  @target_path = "tmp/kristin"
11
+ end
12
+
13
+ before(:each) do
10
14
  FileUtils.mkdir_p @target_path
11
15
  end
12
16
 
13
- after(:all) do
17
+ after(:each) do
14
18
  FileUtils.rm_rf @target_path
15
19
  end
16
20
 
17
- describe ".convert" do
18
- it "should raise error if source file does not exists" do
19
- lambda { Kristin.convert("nonsense.pdf", "nonsense.html") }.should raise_error(IOError)
20
- end
21
+ describe "#convert" do
22
+ describe "with no options" do
23
+ it "should raise error if source file does not exists" do
24
+ c = Kristin::Converter.new("nonsense.pdf", "nonsense.html")
25
+ lambda { c.convert }.should raise_error(IOError)
26
+ end
21
27
 
22
- it "should convert a one page pdf to one html file" do
23
- target = @target_path + "/one.html"
24
- Kristin.convert(@one_page_pdf, target)
25
- File.exists?(target).should == true
26
- end
28
+ it "should convert a one page pdf to one html file" do
29
+ target = @target_path + "/one.html"
30
+ Kristin::Converter.new(@one_page_pdf, target).convert
31
+ File.exists?(target).should == true
32
+ end
27
33
 
28
- it "should convert a multi page pdf to one html file" do
29
- target = @target_path + "/multi.html"
30
- Kristin.convert(@multi_page_pdf, target)
31
- File.exists?(target).should == true
32
- end
34
+ it "should convert a multi page pdf to one html file" do
35
+ target = @target_path + "/multi.html"
36
+ Kristin::Converter.new(@multi_page_pdf, target).convert
37
+ File.exists?(target).should == true
38
+ end
39
+
40
+ it "should raise error if pdf is not a real pdf" do
41
+ lambda { Kristin::Converter.new(@no_pdf, "nonsense.html").convert }.should raise_error(IOError)
42
+ end
43
+
44
+ it "should convert a pdf from an URL" do
45
+ target = @target_path + "/from_url.html"
46
+ Kristin::Converter.new("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target).convert
47
+ File.exists?(target).should == true
48
+ end
49
+
50
+ it "should raise an error if URL does not exist" do
51
+ target = @target_path + "/from_url.html"
52
+ lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/donotexist.pdf", target).convert }.should raise_error(IOError)
53
+ end
33
54
 
34
- it "should raise error if pdf is not a real pdf" do
35
- lambda { Kristin.convert(@no_pdf, "nonsense.html") }.should raise_error(IOError)
55
+ it "should raise an error if URL file is not a real pdf" do
56
+ target = @target_path + "/from_url.html"
57
+ lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target).convert }.should raise_error(IOError)
58
+ end
36
59
  end
37
60
 
38
- it "should convert a pdf from an URL" do
39
- target = @target_path + "/from_url.html"
40
- Kristin.convert("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target)
41
- File.exists?(target).should == true
61
+ describe "options" do
62
+ #TODO: Only convert file once for performance
63
+
64
+ it "should process outline by default" do
65
+ target = @target_path + "/large.html"
66
+ Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
67
+ doc = Nokogiri::HTML(File.open(target))
68
+ el = doc.css("#pdf-outline").first
69
+ el.children.should_not be_empty
70
+ end
71
+
72
+ it "should be possible to disable outline" do
73
+ target = @target_path + "/large.html"
74
+ Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
75
+ doc = Nokogiri::HTML(File.open(target))
76
+ el = doc.css("#pdf-outline").first
77
+ el.children.first.text.strip.should be_empty
78
+ end
79
+
80
+ it "should be possible to specify first page" do
81
+ target = @target_path + "/multi.html"
82
+ Kristin::Converter.new(@multi_page_pdf, target, { first_page: 2 }).convert
83
+ doc = Nokogiri::HTML(File.open(target))
84
+ # Content only present on page 1
85
+ content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
86
+ # Content only present on page 2
87
+ content_from_page_2 = doc.search("//span").map(&:content).select {|c| c.include? "Generating functions"}
88
+ content_from_page_1.should be_empty
89
+ content_from_page_2.should_not be_empty
90
+ end
91
+
92
+ it "should be possible to specify last page" do
93
+ target = @target_path + "/multi.html"
94
+ Kristin::Converter.new(@multi_page_pdf, target, { last_page: 9 }).convert
95
+ doc = Nokogiri::HTML(File.open(target))
96
+ # Content only present on page 1
97
+ content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
98
+ # Content only present on page 10
99
+ content_from_page_10 = doc.search("//span").map(&:content).select {|c| c.include? "William Blake"}
100
+ content_from_page_1.should_not be_empty
101
+ content_from_page_10.should be_empty
102
+ end
103
+
104
+ it "should be possible to specify hdpi and vdpi" do
105
+ target = @target_path + "/one.html"
106
+ Kristin::Converter.new(@one_page_pdf, target, { hdpi: 1, vdpi: 1 }).convert
107
+ doc = Nokogiri::HTML(File.open(target))
108
+ doc.xpath("//img[@class='bi']/@src").first.content.size.should == 538 # The size you get when hdpi and vdpi is 1 on @one_page_pdf
109
+ end
110
+
111
+ it "should be possible to specify vdpi" do
112
+
113
+ end
42
114
  end
115
+ end
43
116
 
44
- it "should raise an error if URL does not exist" do
45
- target = @target_path + "/from_url.html"
46
- lambda { Kristin.convert("https://www.filepicker.io/api/file/donotexist.pdf", target) }.should raise_error(IOError)
117
+ describe ".convert" do
118
+ it "should convert without options" do
119
+ target = @target_path + "/one.html"
120
+ Kristin.convert(@one_page_pdf, target)
121
+ File.exists?(target).should == true
47
122
  end
48
123
 
49
- it "should raise an error if URL file is not a real pdf" do
50
- target = @target_path + "/from_url.html"
51
- lambda { Kristin.convert("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target) }.should raise_error(IOError)
124
+ it "should convert with options" do
125
+ target = @target_path + "/one.html"
126
+ Kristin.convert(@one_page_pdf, target, { hdpi: 1, vdpi: 1 })
127
+ File.exists?(target).should == true
52
128
  end
53
129
  end
54
130
  end
@@ -1,4 +1,5 @@
1
1
  require 'kristin'
2
+ require 'nokogiri'
2
3
 
3
4
  def file_path( *paths )
4
5
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kristin
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Richard Nyström
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-31 00:00:00.000000000 Z
11
+ date: 2013-04-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: ' Convert PDF docs to beautiful HTML files without losing text or format.
42
56
  This gem uses pdf2htmlEX to do the conversion.'
43
57
  email:
@@ -56,6 +70,7 @@ files:
56
70
  - lib/kristin.rb
57
71
  - lib/kristin/version.rb
58
72
  - spec/fixtures/image.png
73
+ - spec/fixtures/large.pdf
59
74
  - spec/fixtures/multi.pdf
60
75
  - spec/fixtures/one.pdf
61
76
  - spec/kristin_spec.rb
@@ -80,13 +95,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
95
  version: '0'
81
96
  requirements: []
82
97
  rubyforge_project:
83
- rubygems_version: 2.0.0
98
+ rubygems_version: 2.0.3
84
99
  signing_key:
85
100
  specification_version: 4
86
101
  summary: Convert PDF docs to beautiful HTML files without losing text or format. This
87
102
  gem uses pdf2htmlEX to do the conversion.
88
103
  test_files:
89
104
  - spec/fixtures/image.png
105
+ - spec/fixtures/large.pdf
90
106
  - spec/fixtures/multi.pdf
91
107
  - spec/fixtures/one.pdf
92
108
  - spec/kristin_spec.rb