antiwordr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2009 kitplummer@gmail.com
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.textile ADDED
@@ -0,0 +1,22 @@
1
+ h1. antiwordr
2
+
3
+ Wrapper around the command line tool antiword, which converts Word documents (97/2003) to text or DocBook.
4
+
5
+ h1. requirements
6
+
7
+ Just antiword, Ruby (1.8.6+ as far as I know) and a few gems.
8
+
9
+ h1. using
10
+
11
+ <pre><code lang="ruby">require 'antiwordr'
12
+ require 'nokogiri'
13
+ file = AntiWordR::DocFilePath.new([Path to Source Word Document])
14
+ string = file.convert()
15
+ xml = file.convert_to_docbook()
16
+ doc = file.convert_to_docbook_document()</code></pre>
17
+
18
+ See included test cases for more usage examples.
19
+
20
+ h1. license
21
+
22
+ MIT (See included MIT-LICENSE)
data/Rakefile ADDED
@@ -0,0 +1,67 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/packagetask'
5
+ require 'rake/gempackagetask'
6
+
7
+ $:.unshift(File.dirname(__FILE__) + "/lib")
8
+ require 'antiwordr'
9
+
10
+ PKG_NAME = 'antiwordr'
11
+ PKG_VERSION = AntiWordR::VERSION
12
+ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
13
+
14
+ desc 'Default: run unit tests.'
15
+ task :default => :test
16
+
17
+ desc "Clean generated files"
18
+ task :clean do
19
+ rm_rf 'pkg'
20
+ rm_rf 'rdoc'
21
+ end
22
+
23
+ desc 'Test the antiwordr gem.'
24
+ Rake::TestTask.new(:test) do |t|
25
+ t.libs << 'lib'
26
+ t.pattern = 'test/**/*_test.rb'
27
+ t.verbose = true
28
+ end
29
+
30
+ desc 'Generate documentation for the antiwordr gem.'
31
+ Rake::RDocTask.new(:rdoc) do |rdoc|
32
+ rdoc.rdoc_dir = 'rdoc'
33
+ rdoc.title = 'antiwordr'
34
+ rdoc.options << '--line-numbers'
35
+ rdoc.rdoc_files.include('README.textile')
36
+ rdoc.rdoc_files.include('lib/**/*.rb')
37
+ end
38
+
39
+
40
+ # Create compressed packages
41
+ spec = Gem::Specification.new do |s|
42
+ s.platform = Gem::Platform::RUBY
43
+ s.name = PKG_NAME
44
+ s.summary = "Convert Word Docs to text."
45
+ s.description = %q{Uses command-line antiword tools to convert Docs to text.}
46
+ s.version = PKG_VERSION
47
+
48
+ s.author = "Kit Plummer"
49
+ s.email = "kitplummer@gmail.com"
50
+ s.rubyforge_project = PKG_NAME
51
+ s.homepage = "http://github.com/kitplummer/antiwordr"
52
+
53
+ s.has_rdoc = true
54
+ s.requirements << 'none'
55
+ s.require_path = 'lib'
56
+ s.autorequire = 'antiwordr'
57
+ s.add_dependency("nokogiri", ">= 1.3.3")
58
+ s.files = [ "Rakefile", "README.textile", "MIT-LICENSE" ]
59
+ s.files = s.files + Dir.glob( "lib/**/*" ).delete_if { |item| item.include?( "\.svn" ) }
60
+ s.files = s.files + Dir.glob( "test/**/*" ).delete_if { |item| item.include?( "\.svn" ) || item.include?("\.png") }
61
+ end
62
+
63
+ Rake::GemPackageTask.new(spec) do |p|
64
+ p.gem_spec = spec
65
+ p.need_tar = false
66
+ p.need_zip = true
67
+ end
data/lib/antiwordr.rb ADDED
@@ -0,0 +1,96 @@
1
+ # The library wraps the antiword command line tool to convert Word documents
2
+ # into plain text or DocBook XML. A document is opened from a local path
3
+ # (DocFilePath) or a URL (DocFileUrl). The convert method returns the output
4
+ # as a string for further manipulation or loading into a Document.
5
+ #
6
+ # Requires that antiword be installed and on the path
7
+ #
8
+ # Author:: Kit Plummer (mailto:kitplummer@gmail.com)
9
+ # Copyright:: Copyright (c) 2010 Kit Plummer
10
+ # License:: MIT
11
+
12
+ require 'rubygems'
13
+ require 'nokogiri'
14
+ require 'uri'
15
+ require 'open-uri'
16
+ require 'tempfile'
17
+
18
+ module AntiWordR
19
+
20
+ # Simple local error abstraction
21
+ class AntiWordRError < RuntimeError; end
22
+
23
+ VERSION = '0.1.0'
24
+
25
+ # Provides facilities for converting Word Docs to Text from Ruby code.
26
+ class DocFile
27
+ attr :path
28
+ attr :target
29
+ attr :format
30
+
31
+ def initialize(input_path, target_path=nil)
32
+ @path = input_path
33
+ @target = target_path
34
+ end
35
+
36
+ # Convert the Word document to text. Returns a string
37
+ def convert()
38
+ errors = ""
39
+ output = ""
40
+
41
+ cmd = "antiword #{format}" + ' "' + @path + '"'
42
+
43
+ output = `#{cmd} 2>&1`
44
+
45
+ if (output.include?("command not found"))
46
+ raise AntiWordRError, "AntiWordR requires antiword to be installed"
47
+ elsif (output.include?("is not a Word Document"))
48
+ raise AntiWordRError, "Source document is not a Word Document"
49
+ elsif (output.include?("Error:"))
50
+ raise AntiWordRError, output.split("\n").first.to_s.chomp
51
+ else
52
+ return output
53
+ end
54
+ end
55
+
56
+ # Convert the Word document to DocBook. Returns a Nokogiri::XML::Document
57
+ def convert_to_docbook_document()
58
+ Nokogiri::XML.parse(convert_to_docbook())
59
+ end
60
+
61
+ def convert_to_docbook()
62
+ @format = "-x db"
63
+ convert()
64
+ end
65
+ end
66
+
67
+ # Handle a string-based local path as input, extends DocFile
68
+ class DocFilePath < DocFile
69
+ def initialize(input_path, target_path=nil)
70
+ # check to make sure file is legit
71
+ if (!File.exist?(input_path))
72
+ raise AntiWordRError, "invalid file path"
73
+ end
74
+
75
+ super(input_path, target_path)
76
+
77
+ end
78
+ end
79
+
80
+ # Handle a URI as a remote path to a Word document, extends DocFile
81
+ class DocFileUrl < DocFile
82
+ def initialize(input_url, target_path=nil)
83
+ # check to make sure file is legit
84
+ begin
85
+ if ((input_url =~ URI::regexp).nil?)
86
+ raise AntiWordRError, "invalid file url"
87
+ end
88
+ tempfile = Tempfile.new('antiwordr')
89
+ File.open(tempfile.path, 'wb') {|f| f.write(open(input_url).read) }
90
+ super(tempfile.path, target_path)
91
+ rescue => bang
92
+ raise AntiWordRError, bang.to_s
93
+ end
94
+ end
95
+ end
96
+ end
data/test/antiwordr_test.rb ADDED
@@ -0,0 +1,86 @@
1
+ require 'test/unit'
2
+ require File.join(File.dirname(__FILE__), '../lib/antiwordr')
3
+
4
+ class DocFileTest < Test::Unit::TestCase
5
+ include AntiWordR
6
+
7
+ CURRENT_DIR = File.dirname(File.expand_path(__FILE__)) + "/"
8
+ TEST_DOC_PATH = CURRENT_DIR + "test.doc"
9
+ TEST_BAD_PATH = "blah.doc"
10
+ TEST_NON_DOC = CURRENT_DIR + "antiwordr_test.rb"
11
+ TEST_URL_DOC =
12
+ "http://github.com/kitplummer/antiwordr/raw/master/test/test.doc"
13
+ TEST_URL_NON_DOC =
14
+ "http://github.com/kitplummer/antiwordr/raw/master/test/antiwordr_test.rb"
15
+
16
+ def test_docfile_new
17
+ file = DocFilePath.new(TEST_DOC_PATH, ".")
18
+ assert file
19
+ end
20
+
21
+ def test_invalid_docfile
22
+ e = assert_raise AntiWordRError do
23
+ file = DocFilePath.new(TEST_NON_DOC, ".")
24
+ file.convert
25
+ end
26
+ assert_equal "Source document is not a Word Document", e.to_s
27
+ end
28
+
29
+ def test_bad_docfile_new
30
+ e = assert_raise AntiWordRError do
31
+ file = DocFilePath.new(TEST_BAD_PATH, ".")
32
+ end
33
+ assert_equal "invalid file path", e.to_s
34
+ end
35
+
36
+ def test_string_from_docfile
37
+ file = DocFilePath.new(TEST_DOC_PATH, ".")
38
+ assert_equal "String", file.convert().class.to_s
39
+ assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
40
+ end
41
+
42
+ def test_return_docbook
43
+ file = DocFilePath.new(TEST_DOC_PATH, ".")
44
+ assert_equal "String", file.convert_to_docbook().class.to_s
45
+ end
46
+
47
+ def test_return_docbook_document
48
+ file = DocFilePath.new(TEST_DOC_PATH, ".")
49
+ assert_equal "Nokogiri::XML::Document",
50
+ file.convert_to_docbook_document().class.to_s
51
+ assert_equal Nokogiri::XML.parse(
52
+ `antiword -x db "#{TEST_DOC_PATH}"`
53
+ ).css('para').first.to_s,
54
+ file.convert_to_docbook_document().css('para').first.to_s
55
+ end
56
+
57
+ def test_invalid_URL_docfile
58
+ e = assert_raise AntiWordRError do
59
+ file = DocFileUrl.new("blah", ".")
60
+ end
61
+ assert_equal "invalid file url", e.to_s
62
+ end
63
+
64
+ def test_invalid_URL_resource_docfile
65
+ e = assert_raise AntiWordRError do
66
+ file = DocFileUrl.new("http://github.com/kitplummer/blah", ".")
67
+ end
68
+ assert_equal "404 Not Found", e.to_s
69
+ end
70
+
71
+ def test_invalid_URL_docfile
72
+ e = assert_raise AntiWordRError do
73
+ file = DocFileUrl.new(TEST_URL_NON_DOC, ".")
74
+ file.convert
75
+ end
76
+ assert_equal "Source document is not a Word Document", e.to_s
77
+ end
78
+
79
+ def test_valid_URL_docfile
80
+ # http://github.com/kitplummer/antiwordr/raw/master/test/test.doc
81
+ file = DocFileUrl.new(TEST_URL_DOC)
82
+ assert_equal "String", file.convert().class.to_s
83
+ assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
84
+ end
85
+
86
+ end
data/test/test.doc ADDED
Binary file
data/test/test.docx ADDED
Binary file
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: antiwordr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kit Plummer
8
+ autorequire: antiwordr
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-07 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.3.3
24
+ version:
25
+ description: Uses command-line antiword tools to convert Docs to text.
26
+ email: kitplummer@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - Rakefile
35
+ - README.textile
36
+ - MIT-LICENSE
37
+ - lib/antiwordr.rb
38
+ - test/antiwordr_test.rb
39
+ - test/test.doc
40
+ - test/test.docx
41
+ has_rdoc: true
42
+ homepage: http://github.com/kitplummer/antiwordr
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements:
63
+ - none
64
+ rubyforge_project: antiwordr
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: Convert Word Docs to text.
69
+ test_files: []
70
+