pdftohtmlr 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +6 -5
- data/Rakefile +3 -3
- data/lib/pdftohtmlr.rb +37 -10
- data/test/pdftohtmlr_test.rb +50 -11
- metadata +2 -2
data/README.textile
CHANGED
@@ -15,15 +15,16 @@ h1. install
|
|
15
15
|
<pre><code>gem install pdftohtmlr</code></pre>
|
16
16
|
|
17
17
|
h1. using
|
18
|
+
"gist examples":http://gist.github.com/254556.js?file=pdftohtmlr_example.rb"
|
19
|
+
|
18
20
|
<pre><code lang="ruby">require 'pdftohtmlr'
|
19
21
|
require 'nokogiri'
|
20
|
-
file =
|
21
|
-
[Target File (not implemented yet)],
|
22
|
-
[user password],
|
23
|
-
[owner password])
|
22
|
+
file = PdfFilePath.new([Path to Source PDF])
|
24
23
|
string = file.convert
|
25
24
|
doc = file.convert_to_document()</code></pre>
|
26
25
|
|
26
|
+
See included test cases for more usage examples, including passwords and URL fetching.
|
27
|
+
|
27
28
|
h1. license
|
28
29
|
|
29
|
-
MIT
|
30
|
+
MIT (See included MIT-LICENSE)
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ desc "Clean generated files"
|
|
18
18
|
task :clean do
|
19
19
|
rm FileList['test/output/*.png']
|
20
20
|
rm_rf 'pkg'
|
21
|
-
rm_rf '
|
21
|
+
rm_rf 'rdoc'
|
22
22
|
end
|
23
23
|
|
24
24
|
desc 'Test the pdftohtmlr gem.'
|
@@ -32,8 +32,8 @@ desc 'Generate documentation for the pdftohtmlr gem.'
|
|
32
32
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
33
33
|
rdoc.rdoc_dir = 'rdoc'
|
34
34
|
rdoc.title = 'pdftohtmlr'
|
35
|
-
rdoc.options << '--line-numbers
|
36
|
-
rdoc.rdoc_files.include('README')
|
35
|
+
rdoc.options << '--line-numbers'
|
36
|
+
rdoc.rdoc_files.include('README.textile')
|
37
37
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
38
38
|
end
|
39
39
|
|
data/lib/pdftohtmlr.rb
CHANGED
@@ -12,13 +12,16 @@
|
|
12
12
|
require 'rubygems'
|
13
13
|
require 'open3'
|
14
14
|
require 'nokogiri'
|
15
|
+
require 'uri'
|
16
|
+
require 'open-uri'
|
17
|
+
require 'tempfile'
|
15
18
|
|
16
19
|
module PDFToHTMLR
|
17
20
|
|
18
21
|
# Simple local error abstraction
|
19
22
|
class PDFToHTMLRError < RuntimeError; end
|
20
23
|
|
21
|
-
VERSION = '0.3.0'
|
24
|
+
VERSION = '0.3.1'
|
22
25
|
|
23
26
|
# Provides facilities for converting PDFs to HTML from Ruby code.
|
24
27
|
class PdfFile
|
@@ -27,17 +30,11 @@ module PDFToHTMLR
|
|
27
30
|
attr :user_pwd
|
28
31
|
attr :owner_pwd
|
29
32
|
|
30
|
-
def initialize(input_path, target_path, user_pwd, owner_pwd)
|
33
|
+
def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
31
34
|
@path = input_path
|
32
35
|
@target = target_path
|
33
36
|
@user_pwd = user_pwd
|
34
|
-
@owner_pwd = owner_pwd
|
35
|
-
|
36
|
-
# check to make sure file is legit
|
37
|
-
if (!File.exist?(@path))
|
38
|
-
raise PDFToHTMLRError, "invalid file path"
|
39
|
-
end
|
40
|
-
|
37
|
+
@owner_pwd = owner_pwd
|
41
38
|
end
|
42
39
|
|
43
40
|
# Convert the PDF document to HTML. Returns a string
|
@@ -60,7 +57,7 @@ module PDFToHTMLR
|
|
60
57
|
end
|
61
58
|
|
62
59
|
if (errors != "")
|
63
|
-
raise PDFToHTMLRError, errors.to_s
|
60
|
+
raise PDFToHTMLRError, errors.first.to_s.chomp
|
64
61
|
else
|
65
62
|
return output
|
66
63
|
end
|
@@ -72,4 +69,34 @@ module PDFToHTMLR
|
|
72
69
|
end
|
73
70
|
|
74
71
|
end
|
72
|
+
|
73
|
+
# Handle a string-based local path as input, extends PdfFile
|
74
|
+
class PdfFilePath < PdfFile
|
75
|
+
def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
76
|
+
# check to make sure file is legit
|
77
|
+
if (!File.exist?(input_path))
|
78
|
+
raise PDFToHTMLRError, "invalid file path"
|
79
|
+
end
|
80
|
+
|
81
|
+
super(input_path, target_path, user_pwd, owner_pwd)
|
82
|
+
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
# Handle a URI as a remote path to a PDF, extends PdfFile
|
87
|
+
class PdfFileUrl < PdfFile
|
88
|
+
def initialize(input_url, target_path=nil, user_pwd=nil, owner_pwd=nil)
|
89
|
+
# check to make sure file is legit
|
90
|
+
begin
|
91
|
+
if ((input_url =~ URI::regexp).nil?)
|
92
|
+
raise PDFToHTMLRError, "invalid file url"
|
93
|
+
end
|
94
|
+
tempfile = Tempfile.new('pdftohtmlr')
|
95
|
+
File.open(tempfile.path, 'w') {|f| f.write(open(input_url).read) }
|
96
|
+
super(tempfile.path, target_path, user_pwd, owner_pwd)
|
97
|
+
rescue => bang
|
98
|
+
raise PDFToHTMLRError, bang.to_s
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
75
102
|
end
|
data/test/pdftohtmlr_test.rb
CHANGED
@@ -9,53 +9,92 @@ class PdfFileTest < Test::Unit::TestCase
|
|
9
9
|
TEST_PWD_PDF_PATH = CURRENT_DIR + "test_pw.pdf"
|
10
10
|
TEST_BAD_PATH = "blah.pdf"
|
11
11
|
TEST_NON_PDF = CURRENT_DIR + "pdftohtmlr_test.rb"
|
12
|
-
|
12
|
+
TEST_URL_PDF =
|
13
|
+
"http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf"
|
14
|
+
TEST_URL_NON_PDF =
|
15
|
+
"http://github.com/kitplummer/pdftohtmlr/raw/master/test/pdftohtmlr_test.rb"
|
13
16
|
def test_pdffile_new
|
14
|
-
file =
|
17
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
15
18
|
assert file
|
16
19
|
end
|
17
20
|
|
18
21
|
def test_invalid_pdffile
|
19
22
|
e = assert_raise PDFToHTMLRError do
|
20
|
-
file =
|
23
|
+
file = PdfFilePath.new(TEST_NON_PDF, ".", nil, nil)
|
21
24
|
file.convert
|
22
25
|
end
|
26
|
+
assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
|
23
27
|
end
|
24
28
|
|
25
29
|
def test_bad_pdffile_new
|
26
|
-
assert_raise PDFToHTMLRError do
|
27
|
-
file =
|
30
|
+
e = assert_raise PDFToHTMLRError do
|
31
|
+
file = PdfFilePath.new(TEST_BAD_PATH, ".", nil, nil)
|
28
32
|
end
|
33
|
+
assert_equal "invalid file path", e.to_s
|
29
34
|
end
|
30
35
|
|
31
36
|
def test_string_from_pdffile
|
32
|
-
file =
|
37
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
33
38
|
assert_equal "String", file.convert().class.to_s
|
34
39
|
assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
|
35
40
|
end
|
36
41
|
|
37
42
|
def test_invalid_pwd_pdffile
|
38
|
-
assert_raise PDFToHTMLRError do
|
39
|
-
file =
|
43
|
+
e = assert_raise PDFToHTMLRError do
|
44
|
+
file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "blah", nil)
|
40
45
|
file.convert
|
41
46
|
end
|
47
|
+
assert_equal "Error: Incorrect password", e.to_s
|
42
48
|
end
|
43
49
|
|
44
50
|
def test_valid_pwd_pdffile
|
45
|
-
file =
|
51
|
+
file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "user", nil)
|
46
52
|
assert_equal "String", file.convert().class.to_s
|
47
53
|
assert_equal `pdftohtml -stdout -upw user #{TEST_PWD_PDF_PATH}`,
|
48
54
|
file.convert()
|
49
55
|
end
|
50
56
|
|
51
57
|
def test_return_document
|
52
|
-
file =
|
58
|
+
file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
|
53
59
|
assert_equal "Nokogiri::HTML::Document",
|
54
60
|
file.convert_to_document().class.to_s
|
55
61
|
assert_equal Nokogiri::HTML.parse(
|
56
|
-
`pdftohtml -stdout
|
62
|
+
`pdftohtml -stdout #{TEST_PDF_PATH}`
|
57
63
|
).css('body').first.to_s,
|
58
64
|
file.convert_to_document().css('body').first.to_s
|
59
65
|
end
|
60
66
|
|
67
|
+
def test_invalid_URL_pdffile
|
68
|
+
e = assert_raise PDFToHTMLRError do
|
69
|
+
file = PdfFileUrl.new("blah", ".", nil, nil)
|
70
|
+
end
|
71
|
+
assert_equal "invalid file url", e.to_s
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_invalid_URL_resource_pdffile
|
75
|
+
e = assert_raise PDFToHTMLRError do
|
76
|
+
file = PdfFileUrl.new("http://github.com/kitplummer/blah", ".", nil, nil)
|
77
|
+
end
|
78
|
+
assert_equal "404 Not Found", e.to_s
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_invalid_URL_pdf_pdffile
|
82
|
+
e = assert_raise PDFToHTMLRError do
|
83
|
+
file = PdfFileUrl.new(TEST_URL_NON_PDF, ".", nil, nil)
|
84
|
+
file.convert
|
85
|
+
end
|
86
|
+
assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_valid_URL_pdffile
|
90
|
+
# http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf
|
91
|
+
file = PdfFileUrl.new(TEST_URL_PDF, ".", nil, nil)
|
92
|
+
assert_equal "String", file.convert().class.to_s
|
93
|
+
assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_args
|
97
|
+
file = PdfFileUrl.new(TEST_URL_PDF)
|
98
|
+
assert_equal "String", file.convert().class.to_s
|
99
|
+
end
|
61
100
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftohtmlr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kit Plummer
|
@@ -9,7 +9,7 @@ autorequire: pdftohtml
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-12-
|
12
|
+
date: 2009-12-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|