pdftohtmlr 0.3.0 → 0.3.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -15,15 +15,16 @@ h1. install
15
15
  <pre><code>gem install pdftohtmlr</code></pre>
16
16
 
17
17
  h1. using
18
+ "gist examples":http://gist.github.com/254556.js?file=pdftohtmlr_example.rb
19
+
18
20
  <pre><code lang="ruby">require 'pdftohtmlr'
19
21
  require 'nokogiri'
20
- file = PdfFile.new([Path to Source PDF],
21
- [Target File (not implemented yet)],
22
- [user password],
23
- [owner password])
22
+ file = PdfFilePath.new([Path to Source PDF])
24
23
  string = file.convert
25
24
  doc = file.convert_to_document()</code></pre>
26
25
 
26
+ See included test cases for more usage examples, including passwords and URL fetching.
27
+
27
28
  h1. license
28
29
 
29
- MIT
30
+ MIT (See included MIT-LICENSE)
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ desc "Clean generated files"
18
18
  task :clean do
19
19
  rm FileList['test/output/*.png']
20
20
  rm_rf 'pkg'
21
- rm_rf 'doc'
21
+ rm_rf 'rdoc'
22
22
  end
23
23
 
24
24
  desc 'Test the pdftohtmlr gem.'
@@ -32,8 +32,8 @@ desc 'Generate documentation for the pdftohtmlr gem.'
32
32
  Rake::RDocTask.new(:rdoc) do |rdoc|
33
33
  rdoc.rdoc_dir = 'rdoc'
34
34
  rdoc.title = 'pdftohtmlr'
35
- rdoc.options << '--line-numbers --inline-source'
36
- rdoc.rdoc_files.include('README')
35
+ rdoc.options << '--line-numbers'
36
+ rdoc.rdoc_files.include('README.textile')
37
37
  rdoc.rdoc_files.include('lib/**/*.rb')
38
38
  end
39
39
 
data/lib/pdftohtmlr.rb CHANGED
@@ -12,13 +12,16 @@
12
12
  require 'rubygems'
13
13
  require 'open3'
14
14
  require 'nokogiri'
15
+ require 'uri'
16
+ require 'open-uri'
17
+ require 'tempfile'
15
18
 
16
19
  module PDFToHTMLR
17
20
 
18
21
  # Simple local error abstraction
19
22
  class PDFToHTMLRError < RuntimeError; end
20
23
 
21
- VERSION = '0.3.0'
24
+ VERSION = '0.3.1'
22
25
 
23
26
  # Provides facilities for converting PDFs to HTML from Ruby code.
24
27
  class PdfFile
@@ -27,17 +30,11 @@ module PDFToHTMLR
27
30
  attr :user_pwd
28
31
  attr :owner_pwd
29
32
 
30
- def initialize(input_path, target_path, user_pwd, owner_pwd)
33
+ def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
31
34
  @path = input_path
32
35
  @target = target_path
33
36
  @user_pwd = user_pwd
34
- @owner_pwd = owner_pwd
35
-
36
- # check to make sure file is legit
37
- if (!File.exist?(@path))
38
- raise PDFToHTMLRError, "invalid file path"
39
- end
40
-
37
+ @owner_pwd = owner_pwd
41
38
  end
42
39
 
43
40
  # Convert the PDF document to HTML. Returns a string
@@ -60,7 +57,7 @@ module PDFToHTMLR
60
57
  end
61
58
 
62
59
  if (errors != "")
63
- raise PDFToHTMLRError, errors.to_s
60
+ raise PDFToHTMLRError, errors.first.to_s.chomp
64
61
  else
65
62
  return output
66
63
  end
@@ -72,4 +69,34 @@ module PDFToHTMLR
72
69
  end
73
70
 
74
71
  end
72
+
73
+ # Handles a string-based local path as input; extends PdfFile.
74
+ class PdfFilePath < PdfFile
75
+ def initialize(input_path, target_path=nil, user_pwd=nil, owner_pwd=nil)
76
+ # check to make sure file is legit
77
+ if (!File.exist?(input_path))
78
+ raise PDFToHTMLRError, "invalid file path"
79
+ end
80
+
81
+ super(input_path, target_path, user_pwd, owner_pwd)
82
+
83
+ end
84
+ end
85
+
86
+ # Handles a URI as a remote path to a PDF; extends PdfFile.
87
+ class PdfFileUrl < PdfFile
88
+ def initialize(input_url, target_path=nil, user_pwd=nil, owner_pwd=nil)
89
+ # check to make sure file is legit
90
+ begin
91
+ if ((input_url =~ URI::regexp).nil?)
92
+ raise PDFToHTMLRError, "invalid file url"
93
+ end
94
+ tempfile = Tempfile.new('pdftohtmlr')
95
+ File.open(tempfile.path, 'w') {|f| f.write(open(input_url).read) }
96
+ super(tempfile.path, target_path, user_pwd, owner_pwd)
97
+ rescue => bang
98
+ raise PDFToHTMLRError, bang.to_s
99
+ end
100
+ end
101
+ end
75
102
  end
@@ -9,53 +9,92 @@ class PdfFileTest < Test::Unit::TestCase
9
9
  TEST_PWD_PDF_PATH = CURRENT_DIR + "test_pw.pdf"
10
10
  TEST_BAD_PATH = "blah.pdf"
11
11
  TEST_NON_PDF = CURRENT_DIR + "pdftohtmlr_test.rb"
12
-
12
+ TEST_URL_PDF =
13
+ "http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf"
14
+ TEST_URL_NON_PDF =
15
+ "http://github.com/kitplummer/pdftohtmlr/raw/master/test/pdftohtmlr_test.rb"
13
16
  def test_pdffile_new
14
- file = PdfFile.new(TEST_PDF_PATH, ".", nil, nil)
17
+ file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
15
18
  assert file
16
19
  end
17
20
 
18
21
  def test_invalid_pdffile
19
22
  e = assert_raise PDFToHTMLRError do
20
- file = PdfFile.new(TEST_NON_PDF, ".", nil, nil)
23
+ file = PdfFilePath.new(TEST_NON_PDF, ".", nil, nil)
21
24
  file.convert
22
25
  end
26
+ assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
23
27
  end
24
28
 
25
29
  def test_bad_pdffile_new
26
- assert_raise PDFToHTMLRError do
27
- file = PdfFile.new(TEST_BAD_PATH, ".", nil, nil)
30
+ e = assert_raise PDFToHTMLRError do
31
+ file = PdfFilePath.new(TEST_BAD_PATH, ".", nil, nil)
28
32
  end
33
+ assert_equal "invalid file path", e.to_s
29
34
  end
30
35
 
31
36
  def test_string_from_pdffile
32
- file = PdfFile.new(TEST_PDF_PATH, ".", nil, nil)
37
+ file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
33
38
  assert_equal "String", file.convert().class.to_s
34
39
  assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
35
40
  end
36
41
 
37
42
  def test_invalid_pwd_pdffile
38
- assert_raise PDFToHTMLRError do
39
- file = PdfFile.new(TEST_PWD_PDF_PATH, ".", "blah", nil)
43
+ e = assert_raise PDFToHTMLRError do
44
+ file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "blah", nil)
40
45
  file.convert
41
46
  end
47
+ assert_equal "Error: Incorrect password", e.to_s
42
48
  end
43
49
 
44
50
  def test_valid_pwd_pdffile
45
- file = PdfFile.new(TEST_PWD_PDF_PATH, ".", "user", nil)
51
+ file = PdfFilePath.new(TEST_PWD_PDF_PATH, ".", "user", nil)
46
52
  assert_equal "String", file.convert().class.to_s
47
53
  assert_equal `pdftohtml -stdout -upw user #{TEST_PWD_PDF_PATH}`,
48
54
  file.convert()
49
55
  end
50
56
 
51
57
  def test_return_document
52
- file = PdfFile.new(TEST_PDF_PATH, ".", nil, nil)
58
+ file = PdfFilePath.new(TEST_PDF_PATH, ".", nil, nil)
53
59
  assert_equal "Nokogiri::HTML::Document",
54
60
  file.convert_to_document().class.to_s
55
61
  assert_equal Nokogiri::HTML.parse(
56
- `pdftohtml -stdout -upw user #{TEST_PWD_PDF_PATH}`
62
+ `pdftohtml -stdout #{TEST_PDF_PATH}`
57
63
  ).css('body').first.to_s,
58
64
  file.convert_to_document().css('body').first.to_s
59
65
  end
60
66
 
67
+ def test_invalid_URL_pdffile
68
+ e = assert_raise PDFToHTMLRError do
69
+ file = PdfFileUrl.new("blah", ".", nil, nil)
70
+ end
71
+ assert_equal "invalid file url", e.to_s
72
+ end
73
+
74
+ def test_invalid_URL_resource_pdffile
75
+ e = assert_raise PDFToHTMLRError do
76
+ file = PdfFileUrl.new("http://github.com/kitplummer/blah", ".", nil, nil)
77
+ end
78
+ assert_equal "404 Not Found", e.to_s
79
+ end
80
+
81
+ def test_invalid_URL_pdf_pdffile
82
+ e = assert_raise PDFToHTMLRError do
83
+ file = PdfFileUrl.new(TEST_URL_NON_PDF, ".", nil, nil)
84
+ file.convert
85
+ end
86
+ assert_equal "Error: May not be a PDF file (continuing anyway)", e.to_s
87
+ end
88
+
89
+ def test_valid_URL_pdffile
90
+ # http://github.com/kitplummer/pdftohtmlr/raw/master/test/test.pdf
91
+ file = PdfFileUrl.new(TEST_URL_PDF, ".", nil, nil)
92
+ assert_equal "String", file.convert().class.to_s
93
+ assert_equal `pdftohtml -stdout #{TEST_PDF_PATH}`, file.convert()
94
+ end
95
+
96
+ def test_args
97
+ file = PdfFileUrl.new(TEST_URL_PDF)
98
+ assert_equal "String", file.convert().class.to_s
99
+ end
61
100
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftohtmlr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kit Plummer
@@ -9,7 +9,7 @@ autorequire: pdftohtml
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-13 00:00:00 -07:00
12
+ date: 2009-12-14 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15