kristin 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +13 -1
- data/kristin.gemspec +1 -0
- data/lib/kristin.rb +67 -43
- data/lib/kristin/version.rb +1 -1
- data/spec/fixtures/large.pdf +0 -0
- data/spec/kristin_spec.rb +103 -27
- data/spec/spec_helper.rb +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 422f4e6c184d84f8cdcf0053cbf10758de9a7b29
|
4
|
+
data.tar.gz: 2d2ea0e029670d144066ec395710312bb3963527
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2b250bdf7abafd32f9a755a5eeec912024deeb1205c6a9876f0ca5a51dacb0d82cc4e078b803171c46d5c3611c9d977d70a4e894d28027a65fc58c81bd3656e9
|
7
|
+
data.tar.gz: b8014ba3d553d0e55a0757ace66bc0c546723de4cd1cfa6b11899528bfc38c3746d6b0f7baa48d5574e25189a5aeda3f632414001c64e4029156104cf8283011
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -30,6 +30,18 @@ Kristin.convert('document.pdf', 'document.html')
|
|
30
30
|
|
31
31
|
# You can also convert a source file directly from a URL
|
32
32
|
Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
|
33
|
+
|
34
|
+
# You can also specify options for fine-grained conversion:
|
35
|
+
Kristin.convert('document.pdf', 'document.html', { first_page: 2, last_page: 4, hdpi: 72, vdpi: 72})
|
36
|
+
|
37
|
+
# Available options:
|
38
|
+
|
39
|
+
# process_outline - show outline in HTML. Default: true
|
40
|
+
# first_page - first page to convert. Default: 1
|
41
|
+
# last_page - last page to convert. Default: 2147483647
|
42
|
+
# hdpi - horizontal resolution for graphics in DPI. Default: 144
|
43
|
+
# vdpi - vertical resolution for graphics in DPI. Default: 144
|
44
|
+
|
33
45
|
```
|
34
46
|
|
35
47
|
## Contributing
|
@@ -38,4 +50,4 @@ Kristin.convert('http://myserver.com/123/document.pdf', 'document.html')
|
|
38
50
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
39
51
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
40
52
|
4. Push to the branch (`git push origin my-new-feature`)
|
41
|
-
5. Create new Pull Request
|
53
|
+
5. Create new Pull Request
|
data/kristin.gemspec
CHANGED
data/lib/kristin.rb
CHANGED
@@ -3,60 +3,84 @@ require 'open-uri'
|
|
3
3
|
require "net/http"
|
4
4
|
|
5
5
|
module Kristin
  # Converts one PDF document to HTML by invoking the external pdf2htmlEX
  # command line tool.
  #
  # The source may be the path of a local file or an http/https URL; a URL is
  # downloaded to a temporary file before conversion.
  class Converter
    # @param source [String] path of a local PDF, or an http/https URL
    # @param target [String] path of the HTML file to write
    # @param options [Hash] optional settings; recognised keys are
    #   :process_outline, :first_page, :last_page, :hdpi and :vdpi
    def initialize(source, target, options = {})
      # Tolerate an explicit nil so option lookups never hit NoMethodError.
      @options = options || {}
      @source = source
      @target = target
    end

    # Runs the conversion.
    #
    # @return [nil] on success
    # @raise [IOError] if the executable cannot be found, the source is
    #   neither a local file nor a reachable http/https URL, or the converter
    #   exits with a non-zero status
    def convert
      command = pdf2htmlex_command
      raise IOError, "Can't find pdf2htmlex executable in PATH" unless command

      src = determine_source(@source)
      begin
        # Every argument is passed as its own argv entry, so no shell is
        # involved: paths containing spaces work and nothing in the source,
        # target or option values can be interpreted as shell syntax.
        pid = Process.spawn(command, *process_options, src.to_s, @target.to_s,
                            [:out, :err] => File::NULL)
        _, status = Process.wait2(pid)

        ## TODO: Grab error message from pdf2htmlex and raise a better error
        raise IOError, "Could not convert #{src}" unless status.success?
      ensure
        # A source that differs from what the caller passed is the temporary
        # download made by #download_file; remove it so it does not pile up.
        File.delete(src) if src != @source && File.exist?(src)
      end
    end

    private

    # Translates the options hash into pdf2htmlEX command line arguments.
    #
    # @return [Array<String>] flags and values, one argv entry each; empty
    #   when no recognised option is set
    def process_options
      args = []
      # The outline is only switched off on an explicit false; nil keeps the
      # tool's own default.
      args.push("--process-outline", "0") if @options[:process_outline] == false

      [[:first_page, "--first-page"],
       [:last_page, "--last-page"],
       [:hdpi, "--hdpi"],
       [:vdpi, "--vdpi"]].each do |key, flag|
        value = @options[key]
        args.push(flag, value.to_s) if value
      end

      args
    end

    # @return [Boolean] whether a pdf2htmlEX executable was found in PATH
    def command_available?
      !pdf2htmlex_command.nil?
    end

    # Finds the name under which pdf2htmlEX is installed.
    #
    # @return [String, nil] "pdf2htmlEX" if present, otherwise "pdf2htmlex"
    #   if present, otherwise nil
    def pdf2htmlex_command
      %w[pdf2htmlEX pdf2htmlex].find { |name| which(name) }
    end

    # Looks an executable up in PATH, honouring PATHEXT when it is set.
    #
    # @param cmd [String] bare command name
    # @return [String, nil] full path of the first match, or nil
    def which(cmd)
      exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
      ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |path|
        exts.each do |ext|
          exe = File.join(path, "#{cmd}#{ext}")
          # Directories also report as "executable"; they are not commands.
          return exe if File.executable?(exe) && !File.directory?(exe)
        end
      end
      nil
    end

    # @return [String] random hexadecimal name for a temporary file
    def random_source_name
      rand(16**16).to_s(16)
    end

    # Downloads a URL to a temporary file under /tmp.
    #
    # @param source [String] http/https URL
    # @return [String] path of the downloaded file
    def download_file(source)
      tmp_file = "/tmp/#{random_source_name}.pdf"
      File.open(tmp_file, "wb") do |saved_file|
        # Go through the parsed URI instead of Kernel#open: Kernel#open no
        # longer handles URLs on Ruby 3.0+ and would run a command for a
        # string starting with "|".
        URI.parse(source.to_s).open('rb') do |read_file|
          IO.copy_stream(read_file, saved_file)
        end
      end

      tmp_file
    end

    # Resolves the source to a local path, downloading it when it is a URL.
    #
    # @param source [String] local path or http/https URL
    # @return [String] the given path for a local file, otherwise the path
    #   of the downloaded temporary file
    # @raise [IOError] if the source is neither an existing file nor an
    #   http/https URL answering with a success status
    def determine_source(source)
      # Check the filesystem first so a local path is never parsed as a URI
      # (a path containing spaces is not a valid URI) and needs no network.
      return source if File.exist?(source) && !File.directory?(source)

      raise IOError, "Source (#{source}) is neither a file nor an URL." unless reachable_http_url?(source)

      download_file(source)
    end

    # @param source [String] candidate URL
    # @return [Boolean] true when source is an http/https URL whose GET
    #   response is a Net::HTTPSuccess
    def reachable_http_url?(source)
      uri = URI(source.to_s)
      %w[http https].include?(uri.scheme) && Net::HTTP.get_response(uri).is_a?(Net::HTTPSuccess)
    rescue URI::InvalidURIError
      false
    end
  end

  # Convenience wrapper around Converter#convert.
  #
  # @param source [String] path of a local PDF, or an http/https URL
  # @param target [String] path of the HTML file to write
  # @param options [Hash] see Converter#initialize
  # @return [nil] on success
  # @raise [IOError] see Converter#convert
  def self.convert(source, target, options = {})
    Converter.new(source, target, options).convert
  end
end
|
data/lib/kristin/version.rb
CHANGED
Binary file
|
data/spec/kristin_spec.rb
CHANGED
@@ -6,49 +6,125 @@ describe Kristin do
|
|
6
6
|
@one_page_pdf = file_path("one.pdf")
|
7
7
|
@multi_page_pdf = file_path("multi.pdf")
|
8
8
|
@no_pdf = file_path("image.png")
|
9
|
+
@large_pdf = file_path("large.pdf")
|
9
10
|
@target_path = "tmp/kristin"
|
11
|
+
end
|
12
|
+
|
13
|
+
before(:each) do
|
10
14
|
FileUtils.mkdir_p @target_path
|
11
15
|
end
|
12
16
|
|
13
|
-
after(:
|
17
|
+
after(:each) do
|
14
18
|
FileUtils.rm_rf @target_path
|
15
19
|
end
|
16
20
|
|
17
|
-
describe "
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
+
describe "#convert" do
|
22
|
+
describe "with no options" do
|
23
|
+
it "should raise error if source file does not exists" do
|
24
|
+
c = Kristin::Converter.new("nonsense.pdf", "nonsense.html")
|
25
|
+
lambda { c.convert }.should raise_error(IOError)
|
26
|
+
end
|
21
27
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
28
|
+
it "should convert a one page pdf to one html file" do
|
29
|
+
target = @target_path + "/one.html"
|
30
|
+
Kristin::Converter.new(@one_page_pdf, target).convert
|
31
|
+
File.exists?(target).should == true
|
32
|
+
end
|
27
33
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
34
|
+
it "should convert a multi page pdf to one html file" do
|
35
|
+
target = @target_path + "/multi.html"
|
36
|
+
Kristin::Converter.new(@multi_page_pdf, target).convert
|
37
|
+
File.exists?(target).should == true
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should raise error if pdf is not a real pdf" do
|
41
|
+
lambda { Kristin::Converter.new(@no_pdf, "nonsense.html").convert }.should raise_error(IOError)
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should convert a pdf from an URL" do
|
45
|
+
target = @target_path + "/from_url.html"
|
46
|
+
Kristin::Converter.new("https://www.filepicker.io/api/file/vR0btUfRQiCF9ntRkW6Q", target).convert
|
47
|
+
File.exists?(target).should == true
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should raise an error if URL does not exist" do
|
51
|
+
target = @target_path + "/from_url.html"
|
52
|
+
lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/donotexist.pdf", target).convert }.should raise_error(IOError)
|
53
|
+
end
|
33
54
|
|
34
|
-
|
35
|
-
|
55
|
+
it "should raise an error if URL file is not a real pdf" do
|
56
|
+
target = @target_path + "/from_url.html"
|
57
|
+
lambda { Kristin::Converter.new("https://www.filepicker.io/api/file/agxKeTfQSWKvMR4CDXMq", target).convert }.should raise_error(IOError)
|
58
|
+
end
|
36
59
|
end
|
37
60
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
61
|
+
describe "options" do
|
62
|
+
#TODO: Only convert file once for performance
|
63
|
+
|
64
|
+
it "should process outline by default" do
  target = @target_path + "/large.html"
  # No options on purpose: this example covers the default behaviour, so
  # process_outline must not be passed (it was previously set to false,
  # duplicating the "disable outline" example).
  Kristin::Converter.new(@large_pdf, target).convert
  doc = Nokogiri::HTML(File.open(target))
  el = doc.css("#pdf-outline").first
  el.children.should_not be_empty
end
|
71
|
+
|
72
|
+
it "should be possible to disable outline" do
|
73
|
+
target = @target_path + "/large.html"
|
74
|
+
Kristin::Converter.new(@large_pdf, target, { process_outline: false }).convert
|
75
|
+
doc = Nokogiri::HTML(File.open(target))
|
76
|
+
el = doc.css("#pdf-outline").first
|
77
|
+
el.children.first.text.strip.should be_empty
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should be possible to specify first page" do
|
81
|
+
target = @target_path + "/multi.html"
|
82
|
+
Kristin::Converter.new(@multi_page_pdf, target, { first_page: 2 }).convert
|
83
|
+
doc = Nokogiri::HTML(File.open(target))
|
84
|
+
# Content only present on page 1
|
85
|
+
content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
|
86
|
+
# Content only present on page 2
|
87
|
+
content_from_page_2 = doc.search("//span").map(&:content).select {|c| c.include? "Generating functions"}
|
88
|
+
content_from_page_1.should be_empty
|
89
|
+
content_from_page_2.should_not be_empty
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should be possible to specify last page" do
|
93
|
+
target = @target_path + "/multi.html"
|
94
|
+
Kristin::Converter.new(@multi_page_pdf, target, { last_page: 9 }).convert
|
95
|
+
doc = Nokogiri::HTML(File.open(target))
|
96
|
+
# Content only present on page 1
|
97
|
+
content_from_page_1 = doc.search("//span").map(&:content).select {|c| c.include? "Geometric series"}
|
98
|
+
# Content only present on page 10
|
99
|
+
content_from_page_10 = doc.search("//span").map(&:content).select {|c| c.include? "William Blake"}
|
100
|
+
content_from_page_1.should_not be_empty
|
101
|
+
content_from_page_10.should be_empty
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should be possible to specify hdpi and vdpi" do
|
105
|
+
target = @target_path + "/one.html"
|
106
|
+
Kristin::Converter.new(@one_page_pdf, target, { hdpi: 1, vdpi: 1 }).convert
|
107
|
+
doc = Nokogiri::HTML(File.open(target))
|
108
|
+
doc.xpath("//img[@class='bi']/@src").first.content.size.should == 538 # The size you get when hdpi and vdpi is 1 on @one_page_pdf
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should be possible to specify vdpi" do
|
112
|
+
|
113
|
+
end
|
42
114
|
end
|
115
|
+
end
|
43
116
|
|
44
|
-
|
45
|
-
|
46
|
-
|
117
|
+
describe ".convert" do
|
118
|
+
it "should convert without options" do
|
119
|
+
target = @target_path + "/one.html"
|
120
|
+
Kristin.convert(@one_page_pdf, target)
|
121
|
+
File.exists?(target).should == true
|
47
122
|
end
|
48
123
|
|
49
|
-
it "should
|
50
|
-
target = @target_path + "/
|
51
|
-
|
124
|
+
it "should convert with options" do
|
125
|
+
target = @target_path + "/one.html"
|
126
|
+
Kristin.convert(@one_page_pdf, target, { hdpi: 1, vdpi: 1 })
|
127
|
+
File.exists?(target).should == true
|
52
128
|
end
|
53
129
|
end
|
54
130
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kristin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03
|
11
|
+
date: 2013-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: ' Convert PDF docs to beautiful HTML files without losing text or format.
|
42
56
|
This gem uses pdf2htmlEX to do the conversion.'
|
43
57
|
email:
|
@@ -56,6 +70,7 @@ files:
|
|
56
70
|
- lib/kristin.rb
|
57
71
|
- lib/kristin/version.rb
|
58
72
|
- spec/fixtures/image.png
|
73
|
+
- spec/fixtures/large.pdf
|
59
74
|
- spec/fixtures/multi.pdf
|
60
75
|
- spec/fixtures/one.pdf
|
61
76
|
- spec/kristin_spec.rb
|
@@ -80,13 +95,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
95
|
version: '0'
|
81
96
|
requirements: []
|
82
97
|
rubyforge_project:
|
83
|
-
rubygems_version: 2.0.
|
98
|
+
rubygems_version: 2.0.3
|
84
99
|
signing_key:
|
85
100
|
specification_version: 4
|
86
101
|
summary: Convert PDF docs to beautiful HTML files without losing text or format. This
|
87
102
|
gem uses pdf2htmlEX to do the conversion.
|
88
103
|
test_files:
|
89
104
|
- spec/fixtures/image.png
|
105
|
+
- spec/fixtures/large.pdf
|
90
106
|
- spec/fixtures/multi.pdf
|
91
107
|
- spec/fixtures/one.pdf
|
92
108
|
- spec/kristin_spec.rb
|