kristin 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +13 -1
- data/kristin.gemspec +1 -0
- data/lib/kristin.rb +67 -43
- data/lib/kristin/version.rb +1 -1
- data/spec/fixtures/large.pdf +0 -0
- data/spec/kristin_spec.rb +103 -27
- data/spec/spec_helper.rb +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 422f4e6c184d84f8cdcf0053cbf10758de9a7b29
|
4
|
+
data.tar.gz: 2d2ea0e029670d144066ec395710312bb3963527
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2b250bdf7abafd32f9a755a5eeec912024deeb1205c6a9876f0ca5a51dacb0d82cc4e078b803171c46d5c3611c9d977d70a4e894d28027a65fc58c81bd3656e9
|
7
|
+
data.tar.gz: b8014ba3d553d0e55a0757ace66bc0c546723de4cd1cfa6b11899528bfc38c3746d6b0f7baa48d5574e25189a5aeda3f632414001c64e4029156104cf8283011
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -30,6 +30,18 @@ Kristin.convert('document.pdf', 'document.html')
|
|
30
30
|
|
31
31
|
# You can also convert a source file directly from a URL
|
32
32
|
Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
|
33
|
+
|
34
|
+
# You can also specify options for fine-grained conversion:
|
35
|
+
Kristin.convert('document.pdf', 'document.html', { first_page: 2, last_page: 4, hdpi: 72, vdpi: 72})
|
36
|
+
|
37
|
+
# Available options:
|
38
|
+
|
39
|
+
# process_outline - show outline in HTML. Default: true
|
40
|
+
# first_page - first page to convert. Default: 1
|
41
|
+
# last_page - last page to convert. Default: 2147483647
|
42
|
+
# hdpi - horizontal resolution for graphics in DPI. Default: 144
|
43
|
+
# vdpi - vertical resolution for graphics in DPI. Default: 144
|
44
|
+
|
33
45
|
```
|
34
46
|
|
35
47
|
## Contributing
|
@@ -38,4 +50,4 @@ Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
|
|
38
50
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
39
51
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
40
52
|
4. Push to the branch (`git push origin my-new-feature`)
|
41
|
-
5. Create new Pull Request
|
53
|
+
5. Create new Pull Request
|
data/kristin.gemspec
CHANGED
data/lib/kristin.rb
CHANGED
@@ -3,60 +3,84 @@ require 'open-uri'
|
|
3
3
|
require "net/http"
|
4
4
|
|
5
5
|
module Kristin
  # Converts one PDF document to a single HTML file by running the external
  # pdf2htmlEX command line tool.
  #
  # The source may be a path to a local file or an http/https URL. A URL is
  # downloaded to a temporary file first, and that file is removed again once
  # the conversion has finished (successfully or not).
  class Converter
    # Options that take a value, mapped to the pdf2htmlEX flag they produce.
    # Insertion order is the order in which the flags are emitted.
    VALUE_FLAGS = {
      first_page: "--first-page",
      last_page: "--last-page",
      hdpi: "--hdpi",
      vdpi: "--vdpi"
    }.freeze

    # source  - path to a local PDF file, or an http/https URL pointing to one.
    # target  - path of the HTML file to write.
    # options - Hash of conversion options (all optional):
    #           :process_outline - pass false to omit the outline
    #           :first_page      - first page to convert
    #           :last_page       - last page to convert
    #           :hdpi            - horizontal resolution for graphics in DPI
    #           :vdpi            - vertical resolution for graphics in DPI
    def initialize(source, target, options = {})
      @options = options
      @source = source
      @target = target
    end

    # Runs the conversion.
    #
    # Returns nil on success.
    # Raises IOError if the pdf2htmlEX executable cannot be found in PATH, if
    # the source is neither an existing file nor a reachable http/https URL,
    # or if pdf2htmlEX exits with a non-zero status.
    def convert
      raise IOError, "Can't find pdf2htmlex executable in PATH" unless command_available?

      src = determine_source(@source)
      # determine_source hands back the very same object for local files, so a
      # different object means we created a temporary download we must clean up.
      downloaded = !src.equal?(@source)

      begin
        # Passing the arguments as separate strings bypasses the shell, so
        # paths containing spaces or shell metacharacters are handled safely.
        pid = Process.spawn(pdf2htmlex_command, *option_args, src.to_s, @target.to_s,
                            [:out, :err] => File::NULL)
        _, status = Process.wait2(pid)

        ## TODO: Grab error message from pdf2htmlex and raise a better error
        raise IOError, "Could not convert #{src}" unless status.success?
      ensure
        File.delete(src) if downloaded && File.exist?(src)
      end
    end

    private

    # Returns the command line flags for the configured options as one
    # space-separated String (empty when no option is set).
    def process_options
      option_args.join(" ")
    end

    # Returns the command line flags for the configured options as an Array of
    # separate argument Strings, suitable for Process.spawn without a shell.
    def option_args
      args = []
      # Only an explicit false disables the outline; nil keeps the tool default.
      args.push("--process-outline", "0") if @options[:process_outline] == false
      VALUE_FLAGS.each do |key, flag|
        args.push(flag, @options[key].to_s) if @options[key]
      end
      args
    end

    # Returns a truthy value when a pdf2htmlEX executable is present in PATH.
    def command_available?
      pdf2htmlex_command
    end

    # Returns the name of the executable to run: "pdf2htmlEX" when present,
    # otherwise "pdf2htmlex", or nil when neither can be found in PATH.
    def pdf2htmlex_command
      %w[pdf2htmlEX pdf2htmlex].find { |name| which(name) }
    end

    # Looks up cmd in the directories listed in PATH, trying every extension
    # from PATHEXT when that variable is set.
    #
    # Returns the full path of the first executable regular file found, or nil.
    def which(cmd)
      exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
      ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |dir|
        exts.each do |ext|
          exe = File.join(dir, "#{cmd}#{ext}")
          # Directories usually carry the executable bit too; they are not commands.
          return exe if File.executable?(exe) && !File.directory?(exe)
        end
      end
      nil
    end

    # Returns a random hexadecimal String used as temporary file base name.
    def random_source_name
      rand(16**16).to_s(16)
    end

    # Downloads the document at the URL source into a file below /tmp.
    #
    # Returns the path of the downloaded file.
    def download_file(source)
      tmp_file = "/tmp/#{random_source_name}.pdf"
      File.open(tmp_file, "wb") do |saved_file|
        # URI#open comes from open-uri; unlike Kernel#open it only ever opens
        # URLs and keeps working on Ruby 3, where Kernel#open stopped doing so.
        URI(source).open("rb") do |read_file|
          saved_file.write(read_file.read)
        end
      end

      tmp_file
    end

    # Resolves source to a local file path.
    #
    # Returns source itself when it names an existing regular file, otherwise
    # the path of a freshly downloaded copy of the URL.
    # Raises IOError when source is neither a file nor a reachable URL.
    def determine_source(source)
      # Checked first so local paths are never parsed as URIs (a file name
      # containing a space is not a valid URI).
      return source if File.exist?(source) && !File.directory?(source)
      raise IOError, "Source (#{source}) is neither a file nor an URL." unless reachable_http_url?(source)

      download_file(source)
    end

    # Returns true when source is an http/https URL that answers a GET request
    # with a success status; false otherwise, including for unparsable URIs.
    def reachable_http_url?(source)
      uri = URI(source)
      %w[http https].include?(uri.scheme) && Net::HTTP.get_response(uri).is_a?(Net::HTTPSuccess)
    rescue URI::InvalidURIError
      false
    end
  end

  # Convenience entry point: converts source to the HTML file target.
  # See Converter#initialize for the arguments and Converter#convert for the
  # errors raised.
  def self.convert(source, target, options = {})
    Converter.new(source, target, options).convert
  end
end
|
data/lib/kristin/version.rb
CHANGED
Binary file
|
data/spec/kristin_spec.rb
CHANGED
@@ -6,49 +6,125 @@ describe Kristin do
|
|
6
6
|
@one_page_pdf = file_path("one.pdf")
|
7
7
|
@multi_page_pdf = file_path("multi.pdf")
|
8
8
|
@no_pdf = file_path("image.png")
|
9
|
+
@large_pdf = file_path("large.pdf")
|
9
10
|
@target_path = "tmp/kristin"
|
11
|
+
end
|
12
|
+
|
13
|
+
before(:each) do
|
10
14
|
FileUtils.mkdir_p @target_path
|
11
15
|
end
|
12
16
|
|
13
|
-
after(:
|
17
|
+
after(:each) do
|
14
18
|
FileUtils.rm_rf @target_path
|
15
19
|
end
|
16
20
|
|
17
|
-
describe "
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
+
describe "#convert" do
|
22
|
+
describe "with no options" do
|
23
|
+
it "should raise error if source file does not exists" do
|
24
|
+
c = Kristin::Converter.new("nonsense.pdf", "nonsense.html")
|
25
|
+
lambda { c.convert }.should raise_error(IOError)
|
26
|
+
end
|
21
27
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
28
|
+
it "should convert a one page pdf to one html file" do
|
29
|
+
target = @target_path + "/one.html"
|
30
|
+
Kristin::Converter.new(@one_page_pdf, target).convert
|
31
|
+
File.exists?(target).should == true
|
32
|
+
end
|
27
33
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
34
|
+
it "should convert a multi page pdf to one html file" do
|
35
|
+
target = @target_path + "/multi.html"
|
36
|
+
Kristin::Converter.new(@multi_page_pdf, target).convert
|
37
|
+
File.exists?(target).should == true
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should raise error if pdf is not a real pdf" do
|
41
|
+
lambda { Kristin::Converter.new(@no_pdf, "nonsense.html").convert }.should raise_error(IOError)
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should convert a pdf from an URL" do
|
45
|
+
target = @target_path + "/from_url.html"
|
46
|
+
Kristin::Converter.new("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target).convert
|
47
|
+
File.exists?(target).should == true
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should raise an error if URL does not exist" do
|
51
|
+
target = @target_path + "/from_url.html"
|
52
|
+
lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/donotexist.pdf", target).convert }.should raise_error(IOError)
|
53
|
+
end
|
33
54
|
|
34
|
-
|
35
|
-
|
55
|
+
it "should raise an error if URL file is not a real pdf" do
|
56
|
+
target = @target_path + "/from_url.html"
|
57
|
+
lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target).convert }.should raise_error(IOError)
|
58
|
+
end
|
36
59
|
end
|
37
60
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
61
|
+
describe "options" do
|
62
|
+
#TODO: Only convert file once for performance
|
63
|
+
|
64
|
+
it "should process outline by default" do
  target = @target_path + "/large.html"
  # Deliberately no options: this example verifies the default behaviour,
  # so the outline must not be switched off here.
  Kristin::Converter.new(@large_pdf, target).convert
  doc = Nokogiri::HTML(File.open(target))
  el = doc.css("#pdf-outline").first
  el.children.should_not be_empty
end
|
71
|
+
|
72
|
+
it "should be possible to disable outline" do
|
73
|
+
target = @target_path + "/large.html"
|
74
|
+
Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
|
75
|
+
doc = Nokogiri::HTML(File.open(target))
|
76
|
+
el = doc.css("#pdf-outline").first
|
77
|
+
el.children.first.text.strip.should be_empty
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should be possible to specify first page" do
|
81
|
+
target = @target_path + "/multi.html"
|
82
|
+
Kristin::Converter.new(@multi_page_pdf, target, { first_page: 2 }).convert
|
83
|
+
doc = Nokogiri::HTML(File.open(target))
|
84
|
+
# Content only present on page 1
|
85
|
+
content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
|
86
|
+
# Content only present on page 2
|
87
|
+
content_from_page_2 = doc.search("//span").map(&:content).select {|c| c.include? "Generating functions"}
|
88
|
+
content_from_page_1.should be_empty
|
89
|
+
content_from_page_2.should_not be_empty
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should be possible to specify last page" do
|
93
|
+
target = @target_path + "/multi.html"
|
94
|
+
Kristin::Converter.new(@multi_page_pdf, target, { last_page: 9 }).convert
|
95
|
+
doc = Nokogiri::HTML(File.open(target))
|
96
|
+
# Content only present on page 1
|
97
|
+
content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
|
98
|
+
# Content only present on page 10
|
99
|
+
content_from_page_10 = doc.search("//span").map(&:content).select {|c| c.include? "William Blake"}
|
100
|
+
content_from_page_1.should_not be_empty
|
101
|
+
content_from_page_10.should be_empty
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should be possible to specify hdpi and vdpi" do
|
105
|
+
target = @target_path + "/one.html"
|
106
|
+
Kristin::Converter.new(@one_page_pdf, target, { hdpi: 1, vdpi: 1 }).convert
|
107
|
+
doc = Nokogiri::HTML(File.open(target))
|
108
|
+
doc.xpath("//img[@class='bi']/@src").first.content.size.should == 538 # The size you get when hdpi and vdpi is 1 on @one_page_pdf
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should be possible to specify vdpi" do
|
112
|
+
|
113
|
+
end
|
42
114
|
end
|
115
|
+
end
|
43
116
|
|
44
|
-
|
45
|
-
|
46
|
-
|
117
|
+
describe ".convert" do
|
118
|
+
it "should convert without options" do
|
119
|
+
target = @target_path + "/one.html"
|
120
|
+
Kristin.convert(@one_page_pdf, target)
|
121
|
+
File.exists?(target).should == true
|
47
122
|
end
|
48
123
|
|
49
|
-
it "should
|
50
|
-
target = @target_path + "/
|
51
|
-
|
124
|
+
it "should convert with options" do
|
125
|
+
target = @target_path + "/one.html"
|
126
|
+
Kristin.convert(@one_page_pdf, target, { hdpi: 1, vdpi: 1 })
|
127
|
+
File.exists?(target).should == true
|
52
128
|
end
|
53
129
|
end
|
54
130
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kristin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03
|
11
|
+
date: 2013-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: ' Convert PDF docs to beautiful HTML files without losing text or format.
|
42
56
|
This gem uses pdf2htmlEX to do the conversion.'
|
43
57
|
email:
|
@@ -56,6 +70,7 @@ files:
|
|
56
70
|
- lib/kristin.rb
|
57
71
|
- lib/kristin/version.rb
|
58
72
|
- spec/fixtures/image.png
|
73
|
+
- spec/fixtures/large.pdf
|
59
74
|
- spec/fixtures/multi.pdf
|
60
75
|
- spec/fixtures/one.pdf
|
61
76
|
- spec/kristin_spec.rb
|
@@ -80,13 +95,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
95
|
version: '0'
|
81
96
|
requirements: []
|
82
97
|
rubyforge_project:
|
83
|
-
rubygems_version: 2.0.
|
98
|
+
rubygems_version: 2.0.3
|
84
99
|
signing_key:
|
85
100
|
specification_version: 4
|
86
101
|
summary: Convert PDF docs to beautiful HTML files without losing text or format. This
|
87
102
|
gem uses pdf2htmlEX to do the conversion.
|
88
103
|
test_files:
|
89
104
|
- spec/fixtures/image.png
|
105
|
+
- spec/fixtures/large.pdf
|
90
106
|
- spec/fixtures/multi.pdf
|
91
107
|
- spec/fixtures/one.pdf
|
92
108
|
- spec/kristin_spec.rb
|