docpdftotext 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +17 -0
- data/README.rdoc +1 -1
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/docpdftotext.gemspec +57 -0
- data/lib/DocumentConverter.py +231 -0
- data/lib/docpdftotext.rb +82 -0
- data/test/docpdftotext_test.rb +5 -6
- data/test/test.doc +0 -0
- data/test/test.docx +0 -0
- data/test/test.pdf +0 -0
- metadata +14 -12
data/.document
ADDED
data/.gitignore
ADDED
data/README.rdoc
CHANGED
|
@@ -9,7 +9,7 @@ This gem enables you to interact with document conversion libraries through Rail
|
|
|
9
9
|
|
|
10
10
|
== Requirements
|
|
11
11
|
* Antiword: http://www.winfield.demon.nl/
|
|
12
|
-
*
|
|
12
|
+
* pdftotext: http://packages.ubuntu.com/hardy/poppler-utils
|
|
13
13
|
* OdfConverter: http://www.oooninja.com/2008/01/convert-openxml-docx-etc-in-linux-using.html
|
|
14
14
|
* Openoffice-headless: http://wiki.alfresco.com/wiki/Running_OpenOffice_From_Terminal
|
|
15
15
|
* DocumentConverter.py (included): http://artofsolving.com/opensource/pyodconverter
|
data/Rakefile
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'rake'
|
|
3
|
+
|
|
4
|
+
begin
|
|
5
|
+
require 'jeweler'
|
|
6
|
+
Jeweler::Tasks.new do |gem|
|
|
7
|
+
gem.name = "docpdftotext"
|
|
8
|
+
gem.summary = %Q{Convert word to text in ruby}
|
|
9
|
+
gem.description = %Q{wrappers for libraries to convert documents into text}
|
|
10
|
+
gem.email = "eric@ericsilverberg.com"
|
|
11
|
+
gem.homepage = "http://github.com/esilverberg/docpdftotext"
|
|
12
|
+
gem.authors = ["esilverberg"]
|
|
13
|
+
gem.add_development_dependency "thoughtbot-shoulda"
|
|
14
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
|
15
|
+
end
|
|
16
|
+
Jeweler::GemcutterTasks.new
|
|
17
|
+
rescue LoadError
|
|
18
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
require 'rake/testtask'
|
|
22
|
+
Rake::TestTask.new(:test) do |test|
|
|
23
|
+
test.libs << 'lib' << 'test'
|
|
24
|
+
test.pattern = 'test/**/*_test.rb'
|
|
25
|
+
test.verbose = true
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
begin
|
|
29
|
+
require 'rcov/rcovtask'
|
|
30
|
+
Rcov::RcovTask.new do |test|
|
|
31
|
+
test.libs << 'test'
|
|
32
|
+
test.pattern = 'test/**/*_test.rb'
|
|
33
|
+
test.verbose = true
|
|
34
|
+
end
|
|
35
|
+
rescue LoadError
|
|
36
|
+
task :rcov do
|
|
37
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
task :test => :check_dependencies
|
|
42
|
+
|
|
43
|
+
task :default => :test
|
|
44
|
+
|
|
45
|
+
require 'rake/rdoctask'
|
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
|
47
|
+
if File.exist?('VERSION')
|
|
48
|
+
version = File.read('VERSION')
|
|
49
|
+
else
|
|
50
|
+
version = ""
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
rdoc.rdoc_dir = 'rdoc'
|
|
54
|
+
rdoc.title = "antiword #{version}"
|
|
55
|
+
rdoc.rdoc_files.include('README*')
|
|
56
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
|
57
|
+
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.3
|
data/docpdftotext.gemspec
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
|
4
|
+
# -*- encoding: utf-8 -*-
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |s|
|
|
7
|
+
s.name = %q{docpdftotext}
|
|
8
|
+
s.version = "0.0.3"
|
|
9
|
+
|
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
|
+
s.authors = ["esilverberg"]
|
|
12
|
+
s.date = %q{2009-11-23}
|
|
13
|
+
s.description = %q{wrappers for libraries to convert documents into text}
|
|
14
|
+
s.email = %q{eric@ericsilverberg.com}
|
|
15
|
+
s.extra_rdoc_files = [
|
|
16
|
+
"LICENSE",
|
|
17
|
+
"README.rdoc"
|
|
18
|
+
]
|
|
19
|
+
s.files = [
|
|
20
|
+
".document",
|
|
21
|
+
".gitignore",
|
|
22
|
+
"LICENSE",
|
|
23
|
+
"README.rdoc",
|
|
24
|
+
"Rakefile",
|
|
25
|
+
"VERSION",
|
|
26
|
+
"docpdftotext.gemspec",
|
|
27
|
+
"lib/DocumentConverter.py",
|
|
28
|
+
"lib/docpdftotext.rb",
|
|
29
|
+
"test/docpdftotext_test.rb",
|
|
30
|
+
"test/test.doc",
|
|
31
|
+
"test/test.docx",
|
|
32
|
+
"test/test.pdf",
|
|
33
|
+
"test/test_helper.rb"
|
|
34
|
+
]
|
|
35
|
+
s.homepage = %q{http://github.com/esilverberg/docpdftotext}
|
|
36
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
|
37
|
+
s.require_paths = ["lib"]
|
|
38
|
+
s.rubygems_version = %q{1.3.5}
|
|
39
|
+
s.summary = %q{Convert word to text in ruby}
|
|
40
|
+
s.test_files = [
|
|
41
|
+
"test/docpdftotext_test.rb",
|
|
42
|
+
"test/test_helper.rb"
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
if s.respond_to? :specification_version then
|
|
46
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
|
47
|
+
s.specification_version = 3
|
|
48
|
+
|
|
49
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
|
50
|
+
s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
|
51
|
+
else
|
|
52
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
|
53
|
+
end
|
|
54
|
+
else
|
|
55
|
+
s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
|
|
56
|
+
end
|
|
57
|
+
end
|
data/lib/DocumentConverter.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
#
|
|
2
|
+
# PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
|
|
3
|
+
#
|
|
4
|
+
# This script converts a document from one office format to another by
|
|
5
|
+
# connecting to an OpenOffice.org instance via Python-UNO bridge.
|
|
6
|
+
#
|
|
7
|
+
# Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
|
|
8
|
+
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
|
|
9
|
+
# - or any later version.
|
|
10
|
+
#
|
|
11
|
+
DEFAULT_OPENOFFICE_PORT = 8100
|
|
12
|
+
|
|
13
|
+
import uno
|
|
14
|
+
from os.path import abspath, isfile, splitext
|
|
15
|
+
from com.sun.star.beans import PropertyValue
|
|
16
|
+
from com.sun.star.task import ErrorCodeIOException
|
|
17
|
+
from com.sun.star.connection import NoConnectException
|
|
18
|
+
|
|
19
|
+
FAMILY_TEXT = "Text"
|
|
20
|
+
FAMILY_WEB = "Web"
|
|
21
|
+
FAMILY_SPREADSHEET = "Spreadsheet"
|
|
22
|
+
FAMILY_PRESENTATION = "Presentation"
|
|
23
|
+
FAMILY_DRAWING = "Drawing"
|
|
24
|
+
|
|
25
|
+
#---------------------#
|
|
26
|
+
# Configuration Start #
|
|
27
|
+
#---------------------#
|
|
28
|
+
|
|
29
|
+
# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
|
|
30
|
+
|
|
31
|
+
# most formats are auto-detected; only those requiring options are defined here
|
|
32
|
+
IMPORT_FILTER_MAP = {
|
|
33
|
+
"txt": {
|
|
34
|
+
"FilterName": "Text (encoded)",
|
|
35
|
+
"FilterOptions": "utf8"
|
|
36
|
+
},
|
|
37
|
+
"csv": {
|
|
38
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
|
39
|
+
"FilterOptions": "44,34,0"
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
EXPORT_FILTER_MAP = {
|
|
44
|
+
"pdf": {
|
|
45
|
+
FAMILY_TEXT: { "FilterName": "writer_pdf_Export" },
|
|
46
|
+
FAMILY_WEB: { "FilterName": "writer_web_pdf_Export" },
|
|
47
|
+
FAMILY_SPREADSHEET: { "FilterName": "calc_pdf_Export" },
|
|
48
|
+
FAMILY_PRESENTATION: { "FilterName": "impress_pdf_Export" },
|
|
49
|
+
FAMILY_DRAWING: { "FilterName": "draw_pdf_Export" }
|
|
50
|
+
},
|
|
51
|
+
"html": {
|
|
52
|
+
FAMILY_TEXT: { "FilterName": "HTML (StarWriter)" },
|
|
53
|
+
FAMILY_SPREADSHEET: { "FilterName": "HTML (StarCalc)" },
|
|
54
|
+
FAMILY_PRESENTATION: { "FilterName": "impress_html_Export" }
|
|
55
|
+
},
|
|
56
|
+
"odt": {
|
|
57
|
+
FAMILY_TEXT: { "FilterName": "writer8" },
|
|
58
|
+
FAMILY_WEB: { "FilterName": "writerweb8_writer" }
|
|
59
|
+
},
|
|
60
|
+
"doc": {
|
|
61
|
+
FAMILY_TEXT: { "FilterName": "MS Word 97" }
|
|
62
|
+
},
|
|
63
|
+
"rtf": {
|
|
64
|
+
FAMILY_TEXT: { "FilterName": "Rich Text Format" }
|
|
65
|
+
},
|
|
66
|
+
"txt": {
|
|
67
|
+
FAMILY_TEXT: {
|
|
68
|
+
"FilterName": "Text",
|
|
69
|
+
"FilterOptions": "utf8"
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"ods": {
|
|
73
|
+
FAMILY_SPREADSHEET: { "FilterName": "calc8" }
|
|
74
|
+
},
|
|
75
|
+
"xls": {
|
|
76
|
+
FAMILY_SPREADSHEET: { "FilterName": "MS Excel 97" }
|
|
77
|
+
},
|
|
78
|
+
"csv": {
|
|
79
|
+
FAMILY_SPREADSHEET: {
|
|
80
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
|
81
|
+
"FilterOptions": "44,34,0"
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
"odp": {
|
|
85
|
+
FAMILY_PRESENTATION: { "FilterName": "impress8" }
|
|
86
|
+
},
|
|
87
|
+
"ppt": {
|
|
88
|
+
FAMILY_PRESENTATION: { "FilterName": "MS PowerPoint 97" }
|
|
89
|
+
},
|
|
90
|
+
"swf": {
|
|
91
|
+
FAMILY_DRAWING: { "FilterName": "draw_flash_Export" },
|
|
92
|
+
FAMILY_PRESENTATION: { "FilterName": "impress_flash_Export" }
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
PAGE_STYLE_OVERRIDE_PROPERTIES = {
|
|
97
|
+
FAMILY_SPREADSHEET: {
|
|
98
|
+
#--- Scale options: uncomment 1 of the 3 ---
|
|
99
|
+
# a) 'Reduce / enlarge printout': 'Scaling factor'
|
|
100
|
+
"PageScale": 100,
|
|
101
|
+
# b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
|
|
102
|
+
#"ScaleToPagesX": 1, "ScaleToPagesY": 1000,
|
|
103
|
+
# c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
|
|
104
|
+
#"ScaleToPages": 1,
|
|
105
|
+
"PrintGrid": False
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
#-------------------#
|
|
110
|
+
# Configuration End #
|
|
111
|
+
#-------------------#
|
|
112
|
+
|
|
113
|
+
class DocumentConversionException(Exception):
|
|
114
|
+
|
|
115
|
+
def __init__(self, message):
|
|
116
|
+
self.message = message
|
|
117
|
+
|
|
118
|
+
def __str__(self):
|
|
119
|
+
return self.message
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class DocumentConverter:
|
|
123
|
+
|
|
124
|
+
def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
|
|
125
|
+
localContext = uno.getComponentContext()
|
|
126
|
+
resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
|
|
127
|
+
try:
|
|
128
|
+
context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
|
|
129
|
+
except NoConnectException:
|
|
130
|
+
raise DocumentConversionException, "failed to connect to OpenOffice.org on port %s" % port
|
|
131
|
+
self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)
|
|
132
|
+
|
|
133
|
+
def convert(self, inputFile, outputFile):
|
|
134
|
+
|
|
135
|
+
inputUrl = self._toFileUrl(inputFile)
|
|
136
|
+
outputUrl = self._toFileUrl(outputFile)
|
|
137
|
+
|
|
138
|
+
loadProperties = { "Hidden": True }
|
|
139
|
+
inputExt = self._getFileExt(inputFile)
|
|
140
|
+
if IMPORT_FILTER_MAP.has_key(inputExt):
|
|
141
|
+
loadProperties.update(IMPORT_FILTER_MAP[inputExt])
|
|
142
|
+
|
|
143
|
+
document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
|
|
144
|
+
try:
|
|
145
|
+
document.refresh()
|
|
146
|
+
except AttributeError:
|
|
147
|
+
pass
|
|
148
|
+
|
|
149
|
+
family = self._detectFamily(document)
|
|
150
|
+
self._overridePageStyleProperties(document, family)
|
|
151
|
+
|
|
152
|
+
outputExt = self._getFileExt(outputFile)
|
|
153
|
+
storeProperties = self._getStoreProperties(document, outputExt)
|
|
154
|
+
|
|
155
|
+
try:
|
|
156
|
+
document.storeToURL(outputUrl, self._toProperties(storeProperties))
|
|
157
|
+
finally:
|
|
158
|
+
document.close(True)
|
|
159
|
+
|
|
160
|
+
def _overridePageStyleProperties(self, document, family):
|
|
161
|
+
if PAGE_STYLE_OVERRIDE_PROPERTIES.has_key(family):
|
|
162
|
+
properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
|
|
163
|
+
pageStyles = document.getStyleFamilies().getByName('PageStyles')
|
|
164
|
+
for styleName in pageStyles.getElementNames():
|
|
165
|
+
pageStyle = pageStyles.getByName(styleName)
|
|
166
|
+
for name, value in properties.items():
|
|
167
|
+
pageStyle.setPropertyValue(name, value)
|
|
168
|
+
|
|
169
|
+
def _getStoreProperties(self, document, outputExt):
|
|
170
|
+
family = self._detectFamily(document)
|
|
171
|
+
try:
|
|
172
|
+
propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
|
|
173
|
+
except KeyError:
|
|
174
|
+
raise DocumentConversionException, "unknown output format: '%s'" % outputExt
|
|
175
|
+
try:
|
|
176
|
+
return propertiesByFamily[family]
|
|
177
|
+
except KeyError:
|
|
178
|
+
raise DocumentConversionException, "unsupported conversion: from '%s' to '%s'" % (family, outputExt)
|
|
179
|
+
|
|
180
|
+
def _detectFamily(self, document):
|
|
181
|
+
if document.supportsService("com.sun.star.text.WebDocument"):
|
|
182
|
+
return FAMILY_WEB
|
|
183
|
+
if document.supportsService("com.sun.star.text.GenericTextDocument"):
|
|
184
|
+
# must be TextDocument or GlobalDocument
|
|
185
|
+
return FAMILY_TEXT
|
|
186
|
+
if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
|
|
187
|
+
return FAMILY_SPREADSHEET
|
|
188
|
+
if document.supportsService("com.sun.star.presentation.PresentationDocument"):
|
|
189
|
+
return FAMILY_PRESENTATION
|
|
190
|
+
if document.supportsService("com.sun.star.drawing.DrawingDocument"):
|
|
191
|
+
return FAMILY_DRAWING
|
|
192
|
+
raise DocumentConversionException, "unknown document family: %s" % document
|
|
193
|
+
|
|
194
|
+
def _getFileExt(self, path):
|
|
195
|
+
ext = splitext(path)[1]
|
|
196
|
+
if ext is not None:
|
|
197
|
+
return ext[1:].lower()
|
|
198
|
+
|
|
199
|
+
def _toFileUrl(self, path):
|
|
200
|
+
return uno.systemPathToFileUrl(abspath(path))
|
|
201
|
+
|
|
202
|
+
def _toProperties(self, dict):
|
|
203
|
+
props = []
|
|
204
|
+
for key in dict:
|
|
205
|
+
prop = PropertyValue()
|
|
206
|
+
prop.Name = key
|
|
207
|
+
prop.Value = dict[key]
|
|
208
|
+
props.append(prop)
|
|
209
|
+
return tuple(props)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
if __name__ == "__main__":
|
|
213
|
+
from sys import argv, exit
|
|
214
|
+
|
|
215
|
+
if len(argv) < 3:
|
|
216
|
+
print "USAGE: python %s <input-file> <output-file>" % argv[0]
|
|
217
|
+
exit(255)
|
|
218
|
+
if not isfile(argv[1]):
|
|
219
|
+
print "no such input file: %s" % argv[1]
|
|
220
|
+
exit(1)
|
|
221
|
+
|
|
222
|
+
try:
|
|
223
|
+
converter = DocumentConverter()
|
|
224
|
+
converter.convert(argv[1], argv[2])
|
|
225
|
+
except DocumentConversionException, exception:
|
|
226
|
+
print "ERROR! " + str(exception)
|
|
227
|
+
exit(1)
|
|
228
|
+
except ErrorCodeIOException, exception:
|
|
229
|
+
print "ERROR! ErrorCodeIOException %d" % exception.ErrCode
|
|
230
|
+
exit(1)
|
|
231
|
+
|
data/lib/docpdftotext.rb
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
require 'tempfile'
|
|
2
|
+
|
|
3
|
+
module DocPdfToText
|
|
4
|
+
VERSION = "1.0.0"
|
|
5
|
+
ANTIWORD_PATH = "antiword"
|
|
6
|
+
ODF_CONVERTER_PATH = "OdfConverter"
|
|
7
|
+
PYTHON_PATH = "python"
|
|
8
|
+
DOC_CONVERTER_PATH = File.join(File.dirname(__FILE__), "DocumentConverter.py")
|
|
9
|
+
PDFTOTEXT_PATH = "pdftotext"
|
|
10
|
+
|
|
11
|
+
def file_to_txt(file_path)
|
|
12
|
+
expanded_path = File.expand_path(file_path)
|
|
13
|
+
raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
|
|
14
|
+
return case File.extname(expanded_path)
|
|
15
|
+
when ".docx"
|
|
16
|
+
docx_to_txt(file_path)
|
|
17
|
+
when ".doc"
|
|
18
|
+
doc_to_txt(file_path)
|
|
19
|
+
when ".pdf"
|
|
20
|
+
pdf_to_txt(file_path)
|
|
21
|
+
when ".txt"
|
|
22
|
+
read_txt_file(file_path)
|
|
23
|
+
else
|
|
24
|
+
raise ArgumentError, "Invalid file type"
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def docx_to_txt(file_path)
|
|
29
|
+
expanded_path = File.expand_path(file_path)
|
|
30
|
+
raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
|
|
31
|
+
raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".docx"
|
|
32
|
+
|
|
33
|
+
tmp_odt = Tempfile.new("docx")
|
|
34
|
+
tmp_odt_path = tmp_odt.path + ".odt"
|
|
35
|
+
tmp_odt.close # so our script can write to it; it isn't deleted till gc
|
|
36
|
+
|
|
37
|
+
cmd = "#{ODF_CONVERTER_PATH} /LEVEL 4 /I #{expanded_path} /O #{tmp_odt_path}"
|
|
38
|
+
`#{cmd}`
|
|
39
|
+
|
|
40
|
+
tmp_final = Tempfile.new("txt")
|
|
41
|
+
tmp_final_path = tmp_final.path + ".txt"
|
|
42
|
+
tmp_final.close
|
|
43
|
+
|
|
44
|
+
cmd = "#{PYTHON_PATH} #{DOC_CONVERTER_PATH} #{tmp_odt_path} #{tmp_final_path}"
|
|
45
|
+
`#{cmd}`
|
|
46
|
+
|
|
47
|
+
return read_txt_file(tmp_final_path)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def read_txt_file(file_path)
|
|
51
|
+
expanded_path = File.expand_path(file_path)
|
|
52
|
+
raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
|
|
53
|
+
|
|
54
|
+
final = []
|
|
55
|
+
File.open(expanded_path, "r") do |infile|
|
|
56
|
+
final.push(infile.gets)
|
|
57
|
+
end
|
|
58
|
+
return final.join("\n")
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def doc_to_txt(file_path)
|
|
62
|
+
expanded_path = File.expand_path(file_path)
|
|
63
|
+
raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
|
|
64
|
+
raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".doc"
|
|
65
|
+
cmd = "#{ANTIWORD_PATH} #{expanded_path}"
|
|
66
|
+
return `#{cmd}`
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def pdf_to_txt(file_path)
|
|
70
|
+
expanded_path = File.expand_path(file_path)
|
|
71
|
+
raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
|
|
72
|
+
raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".pdf"
|
|
73
|
+
|
|
74
|
+
tmp = Tempfile.new("pdf")
|
|
75
|
+
tmp_path = tmp.path
|
|
76
|
+
tmp.close # so our script can write to it; it isn't deleted till gc
|
|
77
|
+
|
|
78
|
+
cmd = "#{PDFTOTEXT_PATH} #{expanded_path} #{tmp_path}"
|
|
79
|
+
`#{cmd}`
|
|
80
|
+
return read_txt_file(tmp_path)
|
|
81
|
+
end
|
|
82
|
+
end
|
data/test/docpdftotext_test.rb
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
require 'test_helper'
|
|
2
|
-
require 'tempfile'
|
|
3
2
|
|
|
4
3
|
class DocPdfToTextTest < Test::Unit::TestCase
|
|
5
4
|
include DocPdfToText
|
|
6
5
|
|
|
6
|
+
should "Convert a pdf file" do
|
|
7
|
+
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
|
8
|
+
assert(file_to_txt(test_file).length > 0)
|
|
9
|
+
end
|
|
10
|
+
|
|
7
11
|
should "Convert a docx file" do
|
|
8
12
|
test_file = File.join(File.dirname(__FILE__), "test.docx")
|
|
9
13
|
assert(file_to_txt(test_file).length > 0)
|
|
@@ -14,11 +18,6 @@ class DocPdfToTextTest < Test::Unit::TestCase
|
|
|
14
18
|
assert(file_to_txt(test_file).length > 0)
|
|
15
19
|
end
|
|
16
20
|
|
|
17
|
-
should "Convert a pdf file" do
|
|
18
|
-
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
|
19
|
-
assert(file_to_txt(test_file).length > 0)
|
|
20
|
-
end
|
|
21
|
-
|
|
22
21
|
should "raise invalid file format" do
|
|
23
22
|
assert_raise ArgumentError do
|
|
24
23
|
test_file = File.join(File.dirname(__FILE__), "test.pdf")
|
data/test/test.doc
ADDED
|
Binary file
|
data/test/test.docx
ADDED
|
Binary file
|
data/test/test.pdf
ADDED
|
Binary file
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: docpdftotext
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.2
|
|
4
|
+
version: 0.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- esilverberg
|
|
@@ -22,16 +22,6 @@ dependencies:
|
|
|
22
22
|
- !ruby/object:Gem::Version
|
|
23
23
|
version: "0"
|
|
24
24
|
version:
|
|
25
|
-
- !ruby/object:Gem::Dependency
|
|
26
|
-
name: pdf-reader
|
|
27
|
-
type: :runtime
|
|
28
|
-
version_requirement:
|
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
30
|
-
requirements:
|
|
31
|
-
- - ">="
|
|
32
|
-
- !ruby/object:Gem::Version
|
|
33
|
-
version: "0"
|
|
34
|
-
version:
|
|
35
25
|
description: wrappers for libraries to convert documents into text
|
|
36
26
|
email: eric@ericsilverberg.com
|
|
37
27
|
executables: []
|
|
@@ -42,8 +32,20 @@ extra_rdoc_files:
|
|
|
42
32
|
- LICENSE
|
|
43
33
|
- README.rdoc
|
|
44
34
|
files:
|
|
35
|
+
- .document
|
|
36
|
+
- .gitignore
|
|
45
37
|
- LICENSE
|
|
46
38
|
- README.rdoc
|
|
39
|
+
- Rakefile
|
|
40
|
+
- VERSION
|
|
41
|
+
- docpdftotext.gemspec
|
|
42
|
+
- lib/DocumentConverter.py
|
|
43
|
+
- lib/docpdftotext.rb
|
|
44
|
+
- test/docpdftotext_test.rb
|
|
45
|
+
- test/test.doc
|
|
46
|
+
- test/test.docx
|
|
47
|
+
- test/test.pdf
|
|
48
|
+
- test/test_helper.rb
|
|
47
49
|
has_rdoc: true
|
|
48
50
|
homepage: http://github.com/esilverberg/docpdftotext
|
|
49
51
|
licenses: []
|
|
@@ -73,5 +75,5 @@ signing_key:
|
|
|
73
75
|
specification_version: 3
|
|
74
76
|
summary: Convert word to text in ruby
|
|
75
77
|
test_files:
|
|
76
|
-
- test/test_helper.rb
|
|
77
78
|
- test/docpdftotext_test.rb
|
|
79
|
+
- test/test_helper.rb
|