pdftohtmlr 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +6 -5
- data/Rakefile +3 -3
- data/lib/pdftohtmlr.rb +37 -10
- data/test/pdftohtmlr_test.rb +50 -11
- metadata +2 -2
data/README.textile
CHANGED
@@ -15,15 +15,16 @@ h1. install
|
|
15
15
|
<pre><code>gem install pdftohtmlr</code></pre>
|
16
16
|
|
17
17
|
h1. using
|
18
|
+
"gist examples":http://gist.github.com/254556.js?file=pdftohtmlr_example.rb"
|
19
|
+
|
18
20
|
<pre><code lang="ruby">require 'pdftohtmlr'
|
19
21
|
require 'nokogiri'
|
20
|
-
file = PdfFile.new([Path to Source PDF],
|
21
|
-
[Target File (not implemented yet)],
|
22
|
-
[user password],
|
23
|
-
[owner password])
|
22
|
+
file = PdfFilePath.new([Path to Source PDF])
|
24
23
|
string = file.convert
|
25
24
|
doc = file.convert_to_document()</code></pre>
|
26
25
|
|
26
|
+
See included test cases for more usage examples, including passwords and URL fetching.
|
27
|
+
|
27
28
|
h1. license
|
28
29
|
|
29
|
-
MIT
|
30
|
+
MIT (See included MIT-LICENSE)
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ desc "Clean generated files"
|
|
18
18
|
task :clean do
|
19
19
|
rm FileList['test/output/*.png']
|
20
20
|
rm_rf 'pkg'
|
21
|
-
rm_rf '
|
21
|
+
rm_rf 'rdoc'
|
22
22
|
end
|
23
23
|
|
24
24
|
desc 'Test the pdftohtmlr gem.'
|
@@ -32,8 +32,8 @@ desc 'Generate documentation for the pdftohtmlr gem.'
|
|
32
32
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
33
33
|
rdoc.rdoc_dir = 'rdoc'
|
34
34
|
rdoc.title = 'pdftohtmlr'
|
35
|
-
rdoc.options << '--line-numbers
|
36
|
-
rdoc.rdoc_files.include('README')
|
35
|
+
rdoc.options << '--line-numbers'
|
36
|
+
rdoc.rdoc_files.include('README.textile')
|
37
37
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
38
38
|
end
|
39
39
|
|
data/lib/pdftohtmlr.rb
CHANGED
@@ -12,13 +12,16 @@
|
|
12
12
|
require 'rubygems'
|
13
13
|
require 'open3'
|
14
14
|
require 'nokogiri'
|
15
|
+
require 'uri'
|
16
|
+
require 'open-uri'
|
17
|
+
require 'tempfile'
|
15
18
|
|
16
19
|
module PDFToHTMLR
|
17
20
|
|
18
21
|
# Simple local error abstraction
|
19
22
|
class PDFToHTMLRError < RuntimeError; end
|
20
23
|
|
21
|
-
VERSION = '0.3.0'
|
24
|
+
VERSION = '0.3.1'
|
22
25
|
|
23
26
|
# Provides facilities for converting PDFs to HTML from Ruby code.
|
24
27
|
class PdfFile
|
@@ -27,17 +30,11 @@ module PDFToHTMLR
|
|
27
30
|
attr :user_pwd
|
28
31
|
attr :owner_pwd
|
29
32
|
|
30
|
-
def initialize(input_path, target_path, user_pwd, owner_pwd)
|
33
|
+
def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
31
34
|
@path = input_path
|
32
35
|
@target = target_path
|
33
36
|
@user_pwd = user_pwd
|
34
|
-
@owner_pwd = owner_pwd
|
35
|
-
|
36
|
-
# check to make sure file is legit
|
37
|
-
if (!File.exist?(@path))
|
38
|
-
raise PDFToHTMLRError, "invalid file path"
|
39
|
-
end
|
40
|
-
|
37
|
+
@owner_pwd = owner_pwd
|
41
38
|
end
|
42
39
|
|
43
40
|
# Convert the PDF document to HTML. Returns a string
|
@@ -60,7 +57,7 @@ module PDFToHTMLR
|
|
60
57
|
end
|
61
58
|
|
62
59
|
if (errors != "")
|
63
|
-
raise PDFToHTMLRError, errors.to_s
|
60
|
+
raise PDFToHTMLRError, errors.first.to_s.chomp
|
64
61
|
else
|
65
62
|
return output
|
66
63
|
end
|
@@ -72,4 +69,34 @@ module PDFToHTMLR
|
|
72
69
|
end
|
73
70
|
|
74
71
|
end
|
72
|
+
|
73
|
+
# Handle a string-based local path as input, extends PdfFile
|
74
|
+
class PdfFilePath < PdfFile
|
75
|
+
def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
76
|
+
# check to make sure file is legit
|
77
|
+
if (!File.exist?(input_path))
|
78
|
+
raise PDFToHTMLRError, "invalid file path"
|
79
|
+
end
|
80
|
+
|
81
|
+
super(input_path, target_path, user_pwd, owner_pwd)
|
82
|
+
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
# Handle a URI as a remote path to a PDF, extends PdfFile
|
87
|
+
class PdfFileUrl < PdfFile
|
88
|
+
def initialize(input_url, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
89
|
+
# check to make sure file is legit
|
90
|
+
begin
|
91
|
+
if ((input_url =~ URI::regexp).nil?)
|
92
|
+
raise PDFToHTMLRError, "invalid file url"
|
93
|
+
end
|
94
|
+
tempfile = Tempfile.new('pdftohtmlr')
|
95
|
+
File.open(tempfile.path, 'w') {|f| f.write(open(input_url).read) }
|
96
|
+
super(tempfile.path, target_path, user_pwd, owner_pwd)
|
97
|
+
rescue => bang
|
98
|
+
raise PDFToHTMLRError, bang.to_s
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
75
102
|
end
|
data/test/pdftohtmlr_test.rb
CHANGED
@@ -9,53 +9,92 @@ class PdfFileTest < Test::Unit::TestCase
|
|
9
9
|
TEST_PWD_PDF_PATH = CURRENT_DIR + "test_pw.pdf"
|
10
10
|
TEST_BAD_PATH = "blah.pdf"
|
11
11
|
TEST_NON_PDF = CURRENT_DIR + "pdftohtmlr_test.rb"
|
12
|
-
|
12
|
+
TEST_URL_PDF =
|
13
|
+
"http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf"
|
14
|
+
TEST_URL_NON_PDF =
|
15
|
+
"http://github.com/kitplummer/pdftohtmlr/raw/master/test/pdftohtmlr_test.rb"
|
13
16
|
def test_pdffile_new
|
14
|
-
file =
|
17
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
15
18
|
assert file
|
16
19
|
end
|
17
20
|
|
18
21
|
def test_invalid_pdffile
|
19
22
|
e = assert_raise PDFToHTMLRError do
|
20
|
-
file =
|
23
|
+
file = PdfFilePath.new(TEST_NON_PDF, ".", nil, nil)
|
21
24
|
file.convert
|
22
25
|
end
|
26
|
+
assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
|
23
27
|
end
|
24
28
|
|
25
29
|
def test_bad_pdffile_new
|
26
|
-
assert_raise PDFToHTMLRError do
|
27
|
-
file =
|
30
|
+
e = assert_raise PDFToHTMLRError do
|
31
|
+
file = PdfFilePath.new(TEST_BAD_PATH, ".", nil, nil)
|
28
32
|
end
|
33
|
+
assert_equal "invalid file path", e.to_s
|
29
34
|
end
|
30
35
|
|
31
36
|
def test_string_from_pdffile
|
32
|
-
file =
|
37
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
33
38
|
assert_equal "String", file.convert().class.to_s
|
34
39
|
assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
|
35
40
|
end
|
36
41
|
|
37
42
|
def test_invalid_pwd_pdffile
|
38
|
-
assert_raise PDFToHTMLRError do
|
39
|
-
file =
|
43
|
+
e = assert_raise PDFToHTMLRError do
|
44
|
+
file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "blah", nil)
|
40
45
|
file.convert
|
41
46
|
end
|
47
|
+
assert_equal "Error: Incorrect password", e.to_s
|
42
48
|
end
|
43
49
|
|
44
50
|
def test_valid_pwd_pdffile
|
45
|
-
file =
|
51
|
+
file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "user", nil)
|
46
52
|
assert_equal "String", file.convert().class.to_s
|
47
53
|
assert_equal `pdftohtml -stdout -upw user #{TEST_PWD_PDF_PATH}`,
|
48
54
|
file.convert()
|
49
55
|
end
|
50
56
|
|
51
57
|
def test_return_document
|
52
|
-
file =
|
58
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
53
59
|
assert_equal "Nokogiri::HTML::Document",
|
54
60
|
file.convert_to_document().class.to_s
|
55
61
|
assert_equal Nokogiri::HTML.parse(
|
56
|
-
`pdftohtml -stdout
|
62
|
+
`pdftohtml -stdout #{TEST_PDF_PATH}`
|
57
63
|
).css('body').first.to_s,
|
58
64
|
file.convert_to_document().css('body').first.to_s
|
59
65
|
end
|
60
66
|
|
67
|
+
def test_invalid_URL_pdffile
|
68
|
+
e = assert_raise PDFToHTMLRError do
|
69
|
+
file = PdfFileUrl.new("blah", ".", nil, nil)
|
70
|
+
end
|
71
|
+
assert_equal "invalid file url", e.to_s
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_invalid_URL_resource_pdffile
|
75
|
+
e = assert_raise PDFToHTMLRError do
|
76
|
+
file = PdfFileUrl.new("http://github.com/kitplummer/blah", ".", nil, nil)
|
77
|
+
end
|
78
|
+
assert_equal "404 Not Found", e.to_s
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_invalid_URL_pdf_pdffile
|
82
|
+
e = assert_raise PDFToHTMLRError do
|
83
|
+
file = PdfFileUrl.new(TEST_URL_NON_PDF, ".", nil, nil)
|
84
|
+
file.convert
|
85
|
+
end
|
86
|
+
assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_valid_URL_pdffile
|
90
|
+
# http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf
|
91
|
+
file = PdfFileUrl.new(TEST_URL_PDF, ".", nil, nil)
|
92
|
+
assert_equal "String", file.convert().class.to_s
|
93
|
+
assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_args
|
97
|
+
file = PdfFileUrl.new(TEST_URL_PDF)
|
98
|
+
assert_equal "String", file.convert().class.to_s
|
99
|
+
end
|
61
100
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftohtmlr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kit Plummer
|
@@ -9,7 +9,7 @@ autorequire: pdftohtml
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-12-
|
12
|
+
date: 2009-12-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|