antiwordr 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2009 kitplummer@gmail.com
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
data/README.textile ADDED
@@ -0,0 +1,22 @@
1
+ h1. antiwordr
2
+
3
+ Wrapper around the command line tool antiword, which converts Word documents (97/2003) to plain text or DocBook.
4
+
5
+ h1. requirements
6
+
7
+ Just antiword, Ruby (1.8.6+ as far as I know) and the nokogiri gem.
8
+
9
+ h1. using
10
+
11
+ <pre><code lang="ruby">require 'antiwordr'
12
+ require 'nokogiri'
13
+ file = AntiWordR::DocFilePath.new([Path to Source Word document])
14
+ string = file.convert()
15
+ xml = file.convert_to_docbook()
16
+ doc = file.convert_to_docbook_document()</code></pre>
17
+
18
+ See included test cases for more usage examples.
19
+
20
+ h1. license
21
+
22
+ MIT (See included MIT-LICENSE)
data/Rakefile ADDED
@@ -0,0 +1,67 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/packagetask'
5
+ require 'rake/gempackagetask'
6
+
7
+ $:.unshift(File.dirname(__FILE__) + "/lib")
8
+ require 'antiwordr'
9
+
10
# Package identity; the version is read from the library itself.
PKG_NAME      = 'antiwordr'
PKG_VERSION   = AntiWordR::VERSION
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

desc 'Default: run unit tests.'
task :default => :test

desc "Clean generated files"
task :clean do
  # Remove the output directories of the package and rdoc tasks.
  %w[pkg rdoc].each { |dir| rm_rf dir }
end

desc 'Test the antiwordr gem.'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

desc 'Generate documentation for the antiwordr gem.'
Rake::RDocTask.new(:rdoc) do |doc|
  doc.rdoc_dir = 'rdoc'
  doc.title = 'antiwordr'
  doc.options << '--line-numbers'
  doc.rdoc_files.include('README.textile')
  doc.rdoc_files.include('lib/**/*.rb')
end

# Gem specification used by the package task below.
spec = Gem::Specification.new do |gem|
  # Identity
  gem.name = PKG_NAME
  gem.version = PKG_VERSION
  gem.platform = Gem::Platform::RUBY
  gem.summary = "Convert Word Docs to text."
  gem.description = %q{Uses command-line antiword tools to convert Docs to text.}

  # Ownership
  gem.author = "Kit Plummer"
  gem.email = "kitplummer@gmail.com"
  gem.rubyforge_project = PKG_NAME
  gem.homepage = "http://github.com/kitplummer/antiwordr"

  # Loading and dependencies
  gem.has_rdoc = true
  gem.requirements << 'none'
  gem.require_path = 'lib'
  gem.autorequire = 'antiwordr'
  gem.add_dependency("nokogiri", ">= 1.3.3")

  # Packaged files: fixed top-level files, everything under lib/ and test/
  # except Subversion metadata, and no PNG images from test/.
  library_files = Dir.glob("lib/**/*").reject { |item| item.include?(".svn") }
  test_files = Dir.glob("test/**/*").reject do |item|
    item.include?(".svn") || item.include?(".png")
  end
  gem.files = ["Rakefile", "README.textile", "MIT-LICENSE"] + library_files + test_files
end

# Build a .gem and a .zip (no tarball) from the specification.
Rake::GemPackageTask.new(spec) do |package|
  package.gem_spec = spec
  package.need_tar = false
  package.need_zip = true
end
data/lib/antiwordr.rb ADDED
@@ -0,0 +1,96 @@
1
+ # The library wraps the antiword command line tool to convert Word documents
2
+ # (97/2003) into plain text or DocBook XML. The convert method returns the
3
+ # plain text as a string; the DocBook variants return either the XML as a
4
+ # string or a parsed Nokogiri::XML::Document for further manipulation.
5
+ #
6
+ # Requires that antiword be installed and on the path
7
+ #
8
+ # Author:: Kit Plummer (mailto:kitplummer@gmail.com)
9
+ # Copyright:: Copyright (c) 2010 Kit Plummer
10
+ # License:: MIT
11
+
12
+ require 'rubygems'
13
+ require 'nokogiri'
14
+ require 'uri'
15
+ require 'open-uri'
16
+ require 'tempfile'
17
+
18
module AntiWordR

  # Simple local error abstraction: every failure reported by this library is
  # raised as an AntiWordRError.
  class AntiWordRError < RuntimeError; end

  VERSION = '0.1.0'

  # Provides facilities for converting Word documents to text or DocBook from
  # Ruby code by running the antiword command line tool.
  class DocFile
    # Command line switch that makes antiword emit DocBook XML.
    DOCBOOK_FORMAT = "-x db"

    attr_reader :path, :target, :format

    # input_path::  path of the Word document handed to antiword.
    # target_path:: stored and exposed through #target; not used by the
    #               conversion itself.
    def initialize(input_path, target_path=nil)
      @path = input_path
      @target = target_path
    end

    # Convert the Word document using the currently selected format (plain
    # text unless a format has been set). Returns antiword's output as a
    # String.
    #
    # Raises AntiWordRError when antiword is not installed, when the source is
    # not a Word document, or when antiword reports an "Error:" line.
    def convert()
      output = run_antiword(self.format.to_s.split + [@path.to_s])

      if output.include?("is not a Word Document")
        raise AntiWordRError, "Source document is not a Word Document"
      elsif output.include?("Error:")
        raise AntiWordRError, output.split("\n").first.to_s.chomp
      end

      output
    end

    # Convert the Word document to DocBook. Returns a Nokogiri::XML::Document.
    def convert_to_docbook_document()
      Nokogiri::XML.parse(convert_to_docbook())
    end

    # Convert the Word document to DocBook. Returns the XML as a String.
    def convert_to_docbook()
      previous_format = @format
      @format = DOCBOOK_FORMAT
      convert()
    ensure
      # Restore the format so a later plain convert() yields text again
      # instead of silently staying in DocBook mode.
      @format = previous_format
    end

    private

    # Runs antiword with the given argument list and returns its combined
    # stdout/stderr output. The command is executed without a shell, so
    # quotes or other shell metacharacters in the path are passed through
    # literally instead of being interpreted.
    def run_antiword(arguments)
      IO.popen(["antiword"] + arguments, :err => [:child, :out]) { |io| io.read }
    rescue Errno::ENOENT
      # The executable could not be found on the PATH.
      raise AntiWordRError, "AntiWordR requires antiword to be installed"
    end
  end

  # Handle a string-based local path as input, extends DocFile
  class DocFilePath < DocFile
    # Raises AntiWordRError ("invalid file path") when input_path does not
    # exist on disk.
    def initialize(input_path, target_path=nil)
      raise AntiWordRError, "invalid file path" unless File.exist?(input_path)

      super(input_path, target_path)
    end
  end

  # Handle a URI as a remote path to a Word document, extends DocFile
  class DocFileUrl < DocFile
    # Downloads input_url into a temporary file and uses that file as the
    # conversion source.
    #
    # Raises AntiWordRError ("invalid file url") when input_url does not look
    # like a fetchable URL, or an AntiWordRError carrying the underlying
    # message (e.g. "404 Not Found") when the download fails.
    def initialize(input_url, target_path=nil)
      if (input_url =~ URI.regexp).nil?
        raise AntiWordRError, "invalid file url"
      end

      # Parse explicitly and fetch through the URI object: Kernel#open would
      # run "| command" strings and no longer opens URLs on current Rubies.
      uri = URI.parse(input_url)
      raise AntiWordRError, "invalid file url" unless uri.respond_to?(:open)

      # Keep the Tempfile referenced for the lifetime of this object;
      # otherwise it may be garbage collected, and its file removed, before
      # convert is called.
      @tempfile = Tempfile.new('antiwordr')
      @tempfile.binmode
      uri.open { |remote| @tempfile.write(remote.read) }
      @tempfile.close

      super(@tempfile.path, target_path)
    rescue AntiWordRError
      raise
    rescue => bang
      raise AntiWordRError, bang.to_s
    end
  end
end
@@ -0,0 +1,86 @@
1
+ require 'test/unit'
2
+ require File.join(File.dirname(__FILE__), '../lib/antiwordr')
3
+
4
# Exercises DocFilePath and DocFileUrl against the bundled test.doc. Needs
# antiword on the PATH, and network access for the URL-based cases.
class DocFileTest < Test::Unit::TestCase
  include AntiWordR

  CURRENT_DIR = File.dirname(File.expand_path(__FILE__)) + "/"
  TEST_DOC_PATH = CURRENT_DIR + "test.doc"
  TEST_BAD_PATH = "blah.doc"
  TEST_NON_DOC = CURRENT_DIR + "antiwordr_test.rb"
  TEST_URL_DOC =
    "http://github.com/kitplummer/antiwordr/raw/master/test/test.doc"
  TEST_URL_NON_DOC =
    "http://github.com/kitplummer/antiwordr/raw/master/test/antiwordr_test.rb"

  def test_docfile_new
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert file
  end

  # An existing file that is not a Word document fails at conversion time.
  def test_invalid_docfile
    e = assert_raise AntiWordRError do
      DocFilePath.new(TEST_NON_DOC, ".").convert
    end
    assert_equal "Source document is not a Word Document", e.to_s
  end

  # A missing file is rejected by the constructor.
  def test_bad_docfile_new
    e = assert_raise AntiWordRError do
      DocFilePath.new(TEST_BAD_PATH, ".")
    end
    assert_equal "invalid file path", e.to_s
  end

  def test_string_from_docfile
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "String", file.convert().class.to_s
    assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
  end

  def test_return_docbook
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "String", file.convert_to_docbook().class.to_s
  end

  def test_return_docbook_document
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "Nokogiri::XML::Document",
      file.convert_to_docbook_document().class.to_s
    assert_equal Nokogiri::XML.parse(
        `antiword -x db "#{TEST_DOC_PATH}"`
      ).css('para').first.to_s,
      file.convert_to_docbook_document().css('para').first.to_s
  end

  # A string that is not a URL at all is rejected by the constructor.
  def test_invalid_URL_docfile
    e = assert_raise AntiWordRError do
      DocFileUrl.new("blah", ".")
    end
    assert_equal "invalid file url", e.to_s
  end

  # A well-formed URL pointing at a missing resource reports the HTTP error.
  def test_invalid_URL_resource_docfile
    e = assert_raise AntiWordRError do
      DocFileUrl.new("http://github.com/kitplummer/blah", ".")
    end
    assert_equal "404 Not Found", e.to_s
  end

  # A reachable URL whose content is not a Word document fails at conversion
  # time. This test used to share the name test_invalid_URL_docfile with the
  # case above, which silently replaced that test so it never ran.
  def test_non_doc_URL_docfile
    e = assert_raise AntiWordRError do
      DocFileUrl.new(TEST_URL_NON_DOC, ".").convert
    end
    assert_equal "Source document is not a Word Document", e.to_s
  end

  # The remote copy of test.doc must convert to the same text as the local one.
  def test_valid_URL_docfile
    file = DocFileUrl.new(TEST_URL_DOC)
    assert_equal "String", file.convert().class.to_s
    assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
  end

end
data/test/test.doc ADDED
Binary file
data/test/test.docx ADDED
Binary file
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: antiwordr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kit Plummer
8
+ autorequire: antiwordr
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-07 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.3.3
24
+ version:
25
+ description: Uses command-line antiword tools to convert Docs to text.
26
+ email: kitplummer@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - Rakefile
35
+ - README.textile
36
+ - MIT-LICENSE
37
+ - lib/antiwordr.rb
38
+ - test/antiwordr_test.rb
39
+ - test/test.doc
40
+ - test/test.docx
41
+ has_rdoc: true
42
+ homepage: http://github.com/kitplummer/antiwordr
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements:
63
+ - none
64
+ rubyforge_project: antiwordr
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: Convert Word Docs to text.
69
+ test_files: []
70
+