antiwordr 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +21 -0
- data/README.textile +22 -0
- data/Rakefile +67 -0
- data/lib/antiwordr.rb +96 -0
- data/test/antiwordr_test.rb +86 -0
- data/test/test.doc +0 -0
- data/test/test.docx +0 -0
- metadata +70 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2009 kitplummer@gmail.com
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.textile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
h1. antiwordr
|
2
|
+
|
3
|
+
Wrapper around the command-line tool antiword, which converts Word documents (97/2003) to plain text or DocBook XML.
|
4
|
+
|
5
|
+
h1. requirements
|
6
|
+
|
7
|
+
Just antiword, Ruby (1.8.6+ as far as I know) and the nokogiri gem.
|
8
|
+
|
9
|
+
h1. using
|
10
|
+
|
11
|
+
<pre><code lang="ruby">require 'antiwordr'
|
12
|
+
require 'nokogiri'
|
13
|
+
file = AntiWordR::DocFilePath.new([Path to source Word document])
|
14
|
+
string = file.convert()
|
15
|
+
xml = file.convert_to_docbook()
|
16
|
+
doc = file.convert_to_docbook_document()</code></pre>
|
17
|
+
|
18
|
+
See included test cases for more usage examples.
|
19
|
+
|
20
|
+
h1. license
|
21
|
+
|
22
|
+
MIT (See included MIT-LICENSE)
|
data/Rakefile
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rake/packagetask'
|
5
|
+
require 'rake/gempackagetask'
|
6
|
+
|
7
|
+
$:.unshift(File.dirname(__FILE__) + "/lib")
|
8
|
+
require 'antiwordr'
|
9
|
+
|
10
|
+
# Package identity. The version is read from the library itself so that it is
# declared in exactly one place.
PKG_NAME = 'antiwordr'
PKG_VERSION = AntiWordR::VERSION
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

desc 'Default: run unit tests.'
task :default => :test

desc "Clean generated files"
task :clean do
  # Remove the package output first, then the generated documentation.
  %w[pkg rdoc].each { |generated_dir| rm_rf generated_dir }
end

desc 'Test the antiwordr gem.'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs << 'lib'
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = true
end

desc 'Generate documentation for the antiwordr gem.'
Rake::RDocTask.new(:rdoc) do |doc_task|
  doc_task.rdoc_dir = 'rdoc'
  doc_task.title = 'antiwordr'
  doc_task.options << '--line-numbers'
  doc_task.rdoc_files.include('README.textile')
  doc_task.rdoc_files.include('lib/**/*.rb')
end

# Gem specification used to create compressed packages.
spec = Gem::Specification.new do |gem|
  gem.platform = Gem::Platform::RUBY
  gem.name = PKG_NAME
  gem.summary = "Convert Word Docs to text."
  gem.description = "Uses command-line antiword tools to convert Docs to text."
  gem.version = PKG_VERSION

  gem.author = "Kit Plummer"
  gem.email = "kitplummer@gmail.com"
  gem.rubyforge_project = PKG_NAME
  gem.homepage = "http://github.com/kitplummer/antiwordr"

  gem.has_rdoc = true
  gem.requirements << 'none'
  gem.require_path = 'lib'
  gem.autorequire = 'antiwordr'
  gem.add_dependency("nokogiri", ">= 1.3.3")

  # Ship the top-level files plus everything under lib/ and test/, leaving out
  # Subversion metadata and (for test/) PNG files.
  top_level_files = ["Rakefile", "README.textile", "MIT-LICENSE"]
  lib_files = Dir.glob("lib/**/*").reject { |item| item.include?(".svn") }
  test_files = Dir.glob("test/**/*").reject do |item|
    item.include?(".svn") || item.include?(".png")
  end
  gem.files = top_level_files + lib_files + test_files
end

# Build the .gem and a zip archive, but no tarball.
Rake::GemPackageTask.new(spec) do |package|
  package.gem_spec = spec
  package.need_tar = false
  package.need_zip = true
end
|
data/lib/antiwordr.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# Wrapper around the antiword command-line tool for converting Word documents
|
2
|
+
# (97/2003) to plain text or DocBook XML. A document is given either as a
|
3
|
+
# local path (DocFilePath) or as a URL (DocFileUrl). The convert method returns
|
4
|
+
# the text as a string for further manipulation or loading into a Document.
|
5
|
+
#
|
6
|
+
# Requires that antiword be installed and on the path
|
7
|
+
#
|
8
|
+
# Author:: Kit Plummer (mailto:kitplummer@gmail.com)
|
9
|
+
# Copyright:: Copyright (c) 2010 Kit Plummer
|
10
|
+
# License:: MIT
|
11
|
+
|
12
|
+
require 'rubygems'
|
13
|
+
require 'nokogiri'
|
14
|
+
require 'uri'
|
15
|
+
require 'open-uri'
|
16
|
+
require 'tempfile'
|
17
|
+
|
18
|
+
module AntiWordR

  # Error type raised for every failure this library reports: antiword is not
  # installed, the source is not a Word document, antiword printed an error,
  # the local path does not exist, or the URL is invalid or cannot be fetched.
  class AntiWordRError < RuntimeError; end

  VERSION = '0.1.0'

  # Provides facilities for converting Word documents to text or DocBook XML
  # from Ruby code by running the antiword command-line tool.
  class DocFile
    # antiword arguments that select DocBook XML output.
    DOCBOOK_FORMAT = "-x db"

    # path::   location of the Word document handed to antiword
    # target:: target path given to the constructor; stored only, none of the
    #          conversion methods in this class read it
    # format:: extra antiword arguments in effect (nil for plain text)
    attr_reader :path, :target, :format

    # input_path::  path of the Word document to convert
    # target_path:: optional target location (kept in +target+)
    def initialize(input_path, target_path=nil)
      @path = input_path
      @target = target_path
    end

    # Convert the Word document with antiword. Returns antiword's combined
    # standard output and standard error as a String.
    #
    # antiword is started directly from an argument list rather than through a
    # shell, so quotes, spaces, "$(...)" and similar characters in the path are
    # passed through literally instead of being interpreted.
    #
    # Raises AntiWordRError when antiword cannot be found, when the output says
    # the source is not a Word document, or when the output contains "Error:".
    def convert()
      command = ["antiword"] + @format.to_s.split + [@path]

      begin
        # Fold stderr into stdout so antiword's diagnostics can be inspected.
        output = IO.popen(command, :err => [:child, :out]) { |io| io.read }
      rescue Errno::ENOENT
        # No shell is involved, so a missing binary surfaces as ENOENT instead
        # of a shell-specific "command not found" message.
        raise AntiWordRError, "AntiWordR requires antiword to be installed"
      end

      # NOTE(review): these checks scan the merged output, so a document whose
      # own text contains one of these phrases is also reported as an error.
      if output.include?("command not found")
        raise AntiWordRError, "AntiWordR requires antiword to be installed"
      elsif output.include?("is not a Word Document")
        raise AntiWordRError, "Source document is not a Word Document"
      elsif output.include?("Error:")
        raise AntiWordRError, output.split("\n").first.to_s.chomp
      end

      output
    end

    # Convert the Word document to DocBook and parse it. Returns a
    # Nokogiri::XML::Document.
    def convert_to_docbook_document()
      Nokogiri::XML.parse(convert_to_docbook())
    end

    # Convert the Word document to DocBook XML. Returns a String.
    #
    # The DocBook format is selected only for the duration of this call; the
    # previous format is restored afterwards so that a later call to +convert+
    # still returns plain text.
    def convert_to_docbook()
      previous_format = @format
      @format = DOCBOOK_FORMAT
      convert()
    ensure
      @format = previous_format
    end
  end

  # Handle a string-based local path as input, extends DocFile
  class DocFilePath < DocFile
    # Raises AntiWordRError ("invalid file path") when nothing exists at
    # +input_path+.
    def initialize(input_path, target_path=nil)
      raise AntiWordRError, "invalid file path" unless File.exist?(input_path)

      super(input_path, target_path)
    end
  end

  # Handle a URI as a remote path to a Word document, extends DocFile
  class DocFileUrl < DocFile
    # Downloads +input_url+ into a temporary file and uses that file as the
    # document path.
    #
    # Raises AntiWordRError ("invalid file url") when +input_url+ is not a URL
    # that open-uri can fetch, and AntiWordRError carrying the underlying
    # message (for example "404 Not Found") when the download fails.
    def initialize(input_url, target_path=nil)
      uri = begin
        URI.parse(input_url)
      rescue URI::InvalidURIError
        nil
      end
      # Only URI classes that open-uri extends can be fetched. Going through
      # the parsed URI rather than Kernel#open also means a string such as
      # "| command" can never be run as a pipe.
      raise AntiWordRError, "invalid file url" unless uri.is_a?(OpenURI::OpenRead)

      # Keep the Tempfile object itself, not just its path: the file is
      # removed when the object is garbage collected, which would otherwise
      # be possible before the conversion runs.
      @tempfile = Tempfile.new('antiwordr')
      @tempfile.binmode
      uri.open { |remote| @tempfile.write(remote.read) }
      @tempfile.close # flush to disk; the file stays until @tempfile goes away

      super(@tempfile.path, target_path)
    rescue => bang
      raise AntiWordRError, bang.to_s
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require File.join(File.dirname(__FILE__), '../lib/antiwordr')
|
3
|
+
|
4
|
+
# Unit tests for the AntiWordR document classes.
#
# The conversion tests shell out to antiword, and the URL tests fetch the
# fixtures named by TEST_URL_DOC / TEST_URL_NON_DOC over the network.
class DocFileTest < Test::Unit::TestCase
  include AntiWordR

  CURRENT_DIR = File.dirname(File.expand_path(__FILE__)) + "/"
  TEST_DOC_PATH = CURRENT_DIR + "test.doc"
  TEST_BAD_PATH = "blah.doc"
  # This test file itself doubles as the "exists but is not a Word document" fixture.
  TEST_NON_DOC = CURRENT_DIR + "antiwordr_test.rb"
  TEST_URL_DOC =
    "http://github.com/kitplummer/antiwordr/raw/master/test/test.doc"
  TEST_URL_NON_DOC =
    "http://github.com/kitplummer/antiwordr/raw/master/test/antiwordr_test.rb"

  # An existing local file can be wrapped.
  def test_docfile_new
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert file
  end

  # Converting an existing file that is not a Word document fails.
  def test_invalid_docfile
    e = assert_raise AntiWordRError do
      file = DocFilePath.new(TEST_NON_DOC, ".")
      file.convert
    end
    assert_equal "Source document is not a Word Document", e.to_s
  end

  # A path that does not exist is rejected by the constructor.
  def test_bad_docfile_new
    e = assert_raise AntiWordRError do
      DocFilePath.new(TEST_BAD_PATH, ".")
    end
    assert_equal "invalid file path", e.to_s
  end

  # Plain conversion returns the same text as running antiword directly.
  def test_string_from_docfile
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "String", file.convert().class.to_s
    assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
  end

  # DocBook conversion returns a String.
  def test_return_docbook
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "String", file.convert_to_docbook().class.to_s
  end

  # The parsed DocBook document matches what antiword -x db produces.
  def test_return_docbook_document
    file = DocFilePath.new(TEST_DOC_PATH, ".")
    assert_equal "Nokogiri::XML::Document",
                 file.convert_to_docbook_document().class.to_s
    assert_equal Nokogiri::XML.parse(
                   `antiword -x db "#{TEST_DOC_PATH}"`
                 ).css('para').first.to_s,
                 file.convert_to_docbook_document().css('para').first.to_s
  end

  # A string that is not a URL is rejected by the constructor.
  def test_invalid_URL_docfile
    e = assert_raise AntiWordRError do
      DocFileUrl.new("blah", ".")
    end
    assert_equal "invalid file url", e.to_s
  end

  # A URL whose resource does not exist reports the HTTP failure.
  def test_invalid_URL_resource_docfile
    e = assert_raise AntiWordRError do
      DocFileUrl.new("http://github.com/kitplummer/blah", ".")
    end
    assert_equal "404 Not Found", e.to_s
  end

  # A reachable URL that is not a Word document fails on conversion.
  # This test used to share the name test_invalid_URL_docfile with the test
  # above; the later definition replaced the earlier one, so the
  # "invalid file url" case never ran.
  def test_invalid_URL_non_docfile
    e = assert_raise AntiWordRError do
      file = DocFileUrl.new(TEST_URL_NON_DOC, ".")
      file.convert
    end
    assert_equal "Source document is not a Word Document", e.to_s
  end

  # A remote Word document converts to the same text as the local fixture.
  def test_valid_URL_docfile
    file = DocFileUrl.new(TEST_URL_DOC)
    assert_equal "String", file.convert().class.to_s
    assert_equal `antiword "#{TEST_DOC_PATH}"`, file.convert()
  end

end
|
data/test/test.doc
ADDED
Binary file
|
data/test/test.docx
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: antiwordr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kit Plummer
|
8
|
+
autorequire: antiwordr
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-07 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.3.3
|
24
|
+
version:
|
25
|
+
description: Uses command-line antiword tools to convert Docs to text.
|
26
|
+
email: kitplummer@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- Rakefile
|
35
|
+
- README.textile
|
36
|
+
- MIT-LICENSE
|
37
|
+
- lib/antiwordr.rb
|
38
|
+
- test/antiwordr_test.rb
|
39
|
+
- test/test.doc
|
40
|
+
- test/test.docx
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://github.com/kitplummer/antiwordr
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
requirements:
|
63
|
+
- none
|
64
|
+
rubyforge_project: antiwordr
|
65
|
+
rubygems_version: 1.3.5
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: Convert Word Docs to text.
|
69
|
+
test_files: []
|
70
|
+
|