docpdftotext 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +17 -0
- data/README.rdoc +1 -1
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/docpdftotext.gemspec +57 -0
- data/lib/DocumentConverter.py +231 -0
- data/lib/docpdftotext.rb +82 -0
- data/test/docpdftotext_test.rb +5 -6
- data/test/test.doc +0 -0
- data/test/test.docx +0 -0
- data/test/test.pdf +0 -0
- metadata +14 -12
data/.document
ADDED
data/.gitignore
ADDED
data/README.rdoc
CHANGED
@@ -9,7 +9,7 @@ This gem enables you to interact with document conversion libraries through Rail
|
|
9
9
|
|
10
10
|
== Requirements
|
11
11
|
* Antiword: http://www.winfield.demon.nl/
|
12
|
-
*
|
12
|
+
* pdftotext: http://packages.ubuntu.com/hardy/poppler-utils
|
13
13
|
* OdfConverter: http://www.oooninja.com/2008/01/convert-openxml-docx-etc-in-linux-using.html
|
14
14
|
* Openoffice-headless: http://wiki.alfresco.com/wiki/Running_OpenOffice_From_Terminal
|
15
15
|
* DocumentConverter.py (included): http://artofsolving.com/opensource/pyodconverter
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Packaging tasks are provided by Jeweler. When Jeweler cannot be loaded the
# remaining tasks (test, rcov, rdoc) are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "docpdftotext"
    gem.summary = %Q{Convert word to text in ruby}
    gem.description = %Q{wrappers for libraries to convert documents into text}
    gem.email = "eric@ericsilverberg.com"
    gem.homepage = "http://github.com/esilverberg/docpdftotext"
    gem.authors = ["esilverberg"]
    gem.add_development_dependency "thoughtbot-shoulda"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Runs every test/**/*_test.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task; replaced by a task that aborts with install instructions
# when rcov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Only hook :check_dependencies in when some library actually defined it;
# otherwise `rake test` would fail on an unknown prerequisite whenever the
# Jeweler load above was rescued.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Prefer the RDoc-provided task; fall back to the legacy Rake one where
# rdoc/task is not available.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip: the VERSION file ends with a newline that should not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "docpdftotext #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.3
|
data/docpdftotext.gemspec
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for docpdftotext 0.0.3: package metadata, the packaged
# file list and the thoughtbot-shoulda dependency.
Gem::Specification.new do |s|
  s.name = %q{docpdftotext}
  s.version = "0.0.3"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["esilverberg"]
  s.date = %q{2009-11-23}
  s.description = %q{wrappers for libraries to convert documents into text}
  s.email = %q{eric@ericsilverberg.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "docpdftotext.gemspec",
    "lib/DocumentConverter.py",
    "lib/docpdftotext.rb",
    "test/docpdftotext_test.rb",
    "test/test.doc",
    "test/test.docx",
    "test/test.pdf",
    "test/test_helper.rb"
  ]
  s.homepage = %q{http://github.com/esilverberg/docpdftotext}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Convert word to text in ruby}
  s.test_files = [
    "test/docpdftotext_test.rb",
    "test/test_helper.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion is not defined by current RubyGems; use Gem::VERSION
    # when it exists and keep the old constant as a fallback.
    running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
    else
      # RubyGems older than 1.2.0: register shoulda as a regular dependency instead.
      s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
    end
  else
    s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
  end
end
|
data/lib/DocumentConverter.py
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
#
|
2
|
+
# PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
|
3
|
+
#
|
4
|
+
# This script converts a document from one office format to another by
|
5
|
+
# connecting to an OpenOffice.org instance via Python-UNO bridge.
|
6
|
+
#
|
7
|
+
# Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
|
8
|
+
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
|
9
|
+
# - or any later version.
|
10
|
+
#
|
11
|
+
# Port used to connect to OpenOffice.org when DocumentConverter is created without an explicit port.
DEFAULT_OPENOFFICE_PORT = 8100
|
12
|
+
|
13
|
+
import uno
|
14
|
+
from os.path import abspath, isfile, splitext
|
15
|
+
from com.sun.star.beans import PropertyValue
|
16
|
+
from com.sun.star.task import ErrorCodeIOException
|
17
|
+
from com.sun.star.connection import NoConnectException
|
18
|
+
|
19
|
+
# Document family names; used as keys of the per-family tables below.
FAMILY_TEXT = "Text"
FAMILY_WEB = "Web"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

#---------------------#
# Configuration Start #
#---------------------#

# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter


def _filter(name, options=None):
    """Build one filter-properties dict: FilterName, plus FilterOptions when given."""
    properties = {"FilterName": name}
    if options is not None:
        properties["FilterOptions"] = options
    return properties


# most formats are auto-detected; only those requiring options are defined here
IMPORT_FILTER_MAP = {
    "txt": _filter("Text (encoded)", "utf8"),
    "csv": _filter("Text - txt - csv (StarCalc)", "44,34,0"),
}

# output extension -> document family -> properties passed when storing
EXPORT_FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: _filter("writer_pdf_Export"),
        FAMILY_WEB: _filter("writer_web_pdf_Export"),
        FAMILY_SPREADSHEET: _filter("calc_pdf_Export"),
        FAMILY_PRESENTATION: _filter("impress_pdf_Export"),
        FAMILY_DRAWING: _filter("draw_pdf_Export"),
    },
    "html": {
        FAMILY_TEXT: _filter("HTML (StarWriter)"),
        FAMILY_SPREADSHEET: _filter("HTML (StarCalc)"),
        FAMILY_PRESENTATION: _filter("impress_html_Export"),
    },
    "odt": {
        FAMILY_TEXT: _filter("writer8"),
        FAMILY_WEB: _filter("writerweb8_writer"),
    },
    "doc": {FAMILY_TEXT: _filter("MS Word 97")},
    "rtf": {FAMILY_TEXT: _filter("Rich Text Format")},
    "txt": {FAMILY_TEXT: _filter("Text", "utf8")},
    "ods": {FAMILY_SPREADSHEET: _filter("calc8")},
    "xls": {FAMILY_SPREADSHEET: _filter("MS Excel 97")},
    "csv": {FAMILY_SPREADSHEET: _filter("Text - txt - csv (StarCalc)", "44,34,0")},
    "odp": {FAMILY_PRESENTATION: _filter("impress8")},
    "ppt": {FAMILY_PRESENTATION: _filter("MS PowerPoint 97")},
    "swf": {
        FAMILY_DRAWING: _filter("draw_flash_Export"),
        FAMILY_PRESENTATION: _filter("impress_flash_Export"),
    },
}

# Page-style properties applied to every page style of a document of the given family.
PAGE_STYLE_OVERRIDE_PROPERTIES = {
    FAMILY_SPREADSHEET: {
        #--- Scale options: uncomment 1 of the 3 ---
        # a) 'Reduce / enlarge printout': 'Scaling factor'
        "PageScale": 100,
        # b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
        #"ScaleToPagesX": 1, "ScaleToPagesY": 1000,
        # c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
        #"ScaleToPages": 1,
        "PrintGrid": False
    }
}
|
108
|
+
|
109
|
+
#-------------------#
|
110
|
+
# Configuration End #
|
111
|
+
#-------------------#
|
112
|
+
|
113
|
+
class DocumentConversionException(Exception):
    """Error raised by DocumentConverter; carries a human-readable message."""

    def __init__(self, message):
        # Explicit base-class initialisation; the message stays available as .message.
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        """Return the message given at construction time."""
        return self.message
|
120
|
+
|
121
|
+
|
122
|
+
class DocumentConverter:
    """Converts a document from one office format to another through an
    OpenOffice.org instance reached over the Python-UNO bridge."""

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to OpenOffice.org on localhost at the given port.

        Raises DocumentConversionException when the connection is refused.
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            raise DocumentConversionException("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

    def convert(self, inputFile, outputFile):
        """Load inputFile and store it as outputFile.

        The output format is chosen from outputFile's extension. Raises
        DocumentConversionException for an unloadable input, an unknown output
        format, an unknown document family or an unsupported conversion.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)

        loadProperties = { "Hidden": True }
        inputExt = self._getFileExt(inputFile)
        if inputExt in IMPORT_FILTER_MAP:
            loadProperties.update(IMPORT_FILTER_MAP[inputExt])

        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
        # NOTE(review): loadComponentFromURL is understood to return None when the
        # file cannot be loaded - confirm against the UNO API documentation.
        if document is None:
            raise DocumentConversionException("failed to load input file: %s" % inputFile)

        # Everything after a successful load sits inside try/finally so the
        # document is closed even when family detection or filter lookup fails.
        try:
            try:
                document.refresh()
            except AttributeError:
                # not every document object offers refresh()
                pass

            family = self._detectFamily(document)
            self._overridePageStyleProperties(document, family)

            outputExt = self._getFileExt(outputFile)
            storeProperties = self._getStoreProperties(document, outputExt)

            document.storeToURL(outputUrl, self._toProperties(storeProperties))
        finally:
            document.close(True)

    def _overridePageStyleProperties(self, document, family):
        """Apply PAGE_STYLE_OVERRIDE_PROPERTIES[family], if any, to every page style."""
        if family in PAGE_STYLE_OVERRIDE_PROPERTIES:
            properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
            pageStyles = document.getStyleFamilies().getByName('PageStyles')
            for styleName in pageStyles.getElementNames():
                pageStyle = pageStyles.getByName(styleName)
                for name, value in properties.items():
                    pageStyle.setPropertyValue(name, value)

    def _getStoreProperties(self, document, outputExt):
        """Return the export properties for this document's family and outputExt.

        Raises DocumentConversionException when the extension is unknown or the
        family cannot be exported to it.
        """
        family = self._detectFamily(document)
        try:
            propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
        except KeyError:
            raise DocumentConversionException("unknown output format: '%s'" % outputExt)
        try:
            return propertiesByFamily[family]
        except KeyError:
            raise DocumentConversionException("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Return the FAMILY_* constant matching the services the document supports."""
        # WebDocument is tested before GenericTextDocument, as in the original order.
        if document.supportsService("com.sun.star.text.WebDocument"):
            return FAMILY_WEB
        if document.supportsService("com.sun.star.text.GenericTextDocument"):
            # must be TextDocument or GlobalDocument
            return FAMILY_TEXT
        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
            return FAMILY_SPREADSHEET
        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
            return FAMILY_PRESENTATION
        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
            return FAMILY_DRAWING
        raise DocumentConversionException("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return path's extension, lower-cased and without the dot ('' when absent)."""
        ext = splitext(path)[1]
        if ext is not None:
            return ext[1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of path."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, dict):
        """Turn a name -> value mapping into a tuple of PropertyValue structs."""
        # The parameter name shadows the builtin; kept unchanged for callers.
        props = []
        for key in dict:
            prop = PropertyValue()
            prop.Name = key
            prop.Value = dict[key]
            props.append(prop)
        return tuple(props)
|
210
|
+
|
211
|
+
|
212
|
+
# Command-line entry point: python DocumentConverter.py <input-file> <output-file>
# Exit status: 255 on wrong usage, 1 on a missing input file or a failed conversion.
if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except DocumentConversionException as exception:
        print("ERROR! " + str(exception))
        exit(1)
    except ErrorCodeIOException as exception:
        print("ERROR! ErrorCodeIOException %d" % exception.ErrCode)
        exit(1)
|
231
|
+
|
data/lib/docpdftotext.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'shellwords'
require 'tempfile'
|
2
|
+
|
3
|
+
# Mixin that converts .doc, .docx, .pdf and .txt files to plain text by
# shelling out to external converters (antiword, OdfConverter, the bundled
# DocumentConverter.py and pdftotext).
module DocPdfToText
  # NOTE(review): the gem's VERSION file and gemspec say 0.0.3; this constant
  # is left unchanged here - confirm which value is intended.
  VERSION = "1.0.0"
  ANTIWORD_PATH = "antiword"
  ODF_CONVERTER_PATH = "OdfConverter"
  PYTHON_PATH = "python"
  DOC_CONVERTER_PATH = File.join(File.dirname(__FILE__), "DocumentConverter.py")
  PDFTOTEXT_PATH = "pdftotext"

  # Converts the file at +file_path+ to text, choosing the converter from the
  # file extension (.docx, .doc, .pdf or .txt; the match is case-sensitive).
  #
  # Returns the extracted text as a String.
  # Raises ArgumentError ("Unknown file") when the file does not exist and
  # ArgumentError ("Invalid file type") for any other extension.
  def file_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

    case File.extname(expanded_path)
    when ".docx" then docx_to_txt(file_path)
    when ".doc"  then doc_to_txt(file_path)
    when ".pdf"  then pdf_to_txt(file_path)
    when ".txt"  then read_txt_file(file_path)
    else raise ArgumentError, "Invalid file type"
    end
  end

  # Converts a .docx file to text in two steps: OdfConverter writes an .odt
  # file, then DocumentConverter.py turns that .odt into a .txt file.
  #
  # Raises ArgumentError when the file is missing, is not a .docx, or when no
  # output file was produced.
  def docx_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".docx"

    # The Tempfiles only reserve unique names; the converters write to the
    # same names with the extension they need appended.
    tmp_odt = Tempfile.new("docx")
    tmp_odt_path = tmp_odt.path + ".odt"
    tmp_odt.close

    tmp_final = Tempfile.new("txt")
    tmp_final_path = tmp_final.path + ".txt"
    tmp_final.close

    begin
      # Paths are shell-escaped so spaces and metacharacters cannot break
      # (or inject into) the command line.
      `#{ODF_CONVERTER_PATH} /LEVEL 4 /I #{Shellwords.escape(expanded_path)} /O #{Shellwords.escape(tmp_odt_path)}`
      `#{PYTHON_PATH} #{Shellwords.escape(DOC_CONVERTER_PATH)} #{Shellwords.escape(tmp_odt_path)} #{Shellwords.escape(tmp_final_path)}`

      read_txt_file(tmp_final_path)
    ensure
      # The derived files are not owned by Tempfile, so remove them by hand.
      [tmp_odt_path, tmp_final_path].each { |path| File.delete(path) if File.exist?(path) }
      tmp_odt.unlink
      tmp_final.unlink
    end
  end

  # Returns the complete contents of the file at +file_path+.
  #
  # Raises ArgumentError ("Unknown file") when the file does not exist.
  def read_txt_file(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

    # Read the whole file; a single gets would return only the first line.
    File.read(expanded_path)
  end

  # Converts a .doc file to text and returns antiword's standard output.
  #
  # Raises ArgumentError when the file is missing or is not a .doc.
  def doc_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".doc"

    `#{ANTIWORD_PATH} #{Shellwords.escape(expanded_path)}`
  end

  # Converts a .pdf file to text by letting pdftotext write into a temporary
  # file, whose contents are returned.
  #
  # Raises ArgumentError when the file is missing or is not a .pdf.
  def pdf_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".pdf"

    tmp = Tempfile.new("pdf")
    tmp_path = tmp.path
    tmp.close # closed so pdftotext can write to it

    begin
      `#{PDFTOTEXT_PATH} #{Shellwords.escape(expanded_path)} #{Shellwords.escape(tmp_path)}`
      read_txt_file(tmp_path)
    ensure
      tmp.unlink
    end
  end
end
|
data/test/docpdftotext_test.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'test_helper'
|
2
|
-
require 'tempfile'
|
3
2
|
|
4
3
|
class DocPdfToTextTest < Test::Unit::TestCase
|
5
4
|
include DocPdfToText
|
6
5
|
|
6
|
+
should "Convert a pdf file" do
|
7
|
+
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
8
|
+
assert(file_to_txt(test_file).length > 0)
|
9
|
+
end
|
10
|
+
|
7
11
|
should "Convert a docx file" do
|
8
12
|
test_file = File.join(File.dirname(__FILE__), "test.docx")
|
9
13
|
assert(file_to_txt(test_file).length > 0)
|
@@ -14,11 +18,6 @@ class DocPdfToTextTest < Test::Unit::TestCase
|
|
14
18
|
assert(file_to_txt(test_file).length > 0)
|
15
19
|
end
|
16
20
|
|
17
|
-
should "Convert a pdf file" do
|
18
|
-
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
19
|
-
assert(file_to_txt(test_file).length > 0)
|
20
|
-
end
|
21
|
-
|
22
21
|
should "raise invalid file format" do
|
23
22
|
assert_raise ArgumentError do
|
24
23
|
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
data/test/test.doc
ADDED
Binary file
|
data/test/test.docx
ADDED
Binary file
|
data/test/test.pdf
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docpdftotext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- esilverberg
|
@@ -22,16 +22,6 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0"
|
24
24
|
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: pdf-reader
|
27
|
-
type: :runtime
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: "0"
|
34
|
-
version:
|
35
25
|
description: wrappers for libraries to convert documents into text
|
36
26
|
email: eric@ericsilverberg.com
|
37
27
|
executables: []
|
@@ -42,8 +32,20 @@ extra_rdoc_files:
|
|
42
32
|
- LICENSE
|
43
33
|
- README.rdoc
|
44
34
|
files:
|
35
|
+
- .document
|
36
|
+
- .gitignore
|
45
37
|
- LICENSE
|
46
38
|
- README.rdoc
|
39
|
+
- Rakefile
|
40
|
+
- VERSION
|
41
|
+
- docpdftotext.gemspec
|
42
|
+
- lib/DocumentConverter.py
|
43
|
+
- lib/docpdftotext.rb
|
44
|
+
- test/docpdftotext_test.rb
|
45
|
+
- test/test.doc
|
46
|
+
- test/test.docx
|
47
|
+
- test/test.pdf
|
48
|
+
- test/test_helper.rb
|
47
49
|
has_rdoc: true
|
48
50
|
homepage: http://github.com/esilverberg/docpdftotext
|
49
51
|
licenses: []
|
@@ -73,5 +75,5 @@ signing_key:
|
|
73
75
|
specification_version: 3
|
74
76
|
summary: Convert word to text in ruby
|
75
77
|
test_files:
|
76
|
-
- test/test_helper.rb
|
77
78
|
- test/docpdftotext_test.rb
|
79
|
+
- test/test_helper.rb
|