docpdftotext 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,17 @@
1
+ log/*
2
+ tmp/**/*
3
+ doc/api
4
+ doc/app
5
+ doc/plugins
6
+ *~
7
+ config/keys
8
+ /public/images/Thumbs.db
9
+ Thumbs.db
10
+ public/system
11
+ public/demos
12
+ *.sw?
13
+ .DS_Store
14
+ coverage
15
+ rdoc
16
+ pkg
17
+ *~
@@ -9,7 +9,7 @@ This gem enables you to interact with document conversion libraries through Rail
9
9
 
10
10
  == Requirements
11
11
  * Antiword: http://www.winfield.demon.nl/
12
- * pdf-reader: http://github.com/yob/pdf-reader
12
+ * pdftotext: http://packages.ubuntu.com/hardy/poppler-utils
13
13
  * OdfConverter: http://www.oooninja.com/2008/01/convert-openxml-docx-etc-in-linux-using.html
14
14
  * Openoffice-headless: http://wiki.alfresco.com/wiki/Running_OpenOffice_From_Terminal
15
15
  * DocumentConverter.py (included): http://artofsolving.com/opensource/pyodconverter
@@ -0,0 +1,57 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print an installation hint so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "docpdftotext"
    gem.summary = %Q{Convert word to text in ruby}
    gem.description = %Q{wrappers for libraries to convert documents into text}
    gem.email = "eric@ericsilverberg.com"
    gem.homepage = "http://github.com/esilverberg/docpdftotext"
    gem.authors = ["esilverberg"]
    gem.add_development_dependency "thoughtbot-shoulda"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When RCov is missing, define a stub :rcov task that aborts
# with an installation hint instead of failing with an unknown-task error.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# NOTE(review): :check_dependencies is not defined in this file; presumably it
# is registered by Jeweler::Tasks above -- confirm it exists when Jeweler is absent.
task :test => :check_dependencies

task :default => :test

# API documentation. Newer RDoc ships its own rake task (rdoc/task); fall back
# to the legacy rake/rdoctask only when that is unavailable.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline of the VERSION file so the title stays on one line.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  # Title names this gem (the original said "antiword", left over from another project).
  rdoc.title = "docpdftotext #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,57 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{docpdftotext}
8
+ s.version = "0.0.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["esilverberg"]
12
+ s.date = %q{2009-11-23}
13
+ s.description = %q{wrappers for libraries to convert documents into text}
14
+ s.email = %q{eric@ericsilverberg.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "docpdftotext.gemspec",
27
+ "lib/DocumentConverter.py",
28
+ "lib/docpdftotext.rb",
29
+ "test/docpdftotext_test.rb",
30
+ "test/test.doc",
31
+ "test/test.docx",
32
+ "test/test.pdf",
33
+ "test/test_helper.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/esilverberg/docpdftotext}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.5}
39
+ s.summary = %q{Convert word to text in ruby}
40
+ s.test_files = [
41
+ "test/docpdftotext_test.rb",
42
+ "test/test_helper.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
50
+ s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
51
+ else
52
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
56
+ end
57
+ end
@@ -0,0 +1,231 @@
1
+ #
2
+ # PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
3
+ #
4
+ # This script converts a document from one office format to another by
5
+ # connecting to an OpenOffice.org instance via Python-UNO bridge.
6
+ #
7
+ # Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
8
+ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
9
+ # - or any later version.
10
+ #
11
+ DEFAULT_OPENOFFICE_PORT = 8100
12
+
13
+ import uno
14
+ from os.path import abspath, isfile, splitext
15
+ from com.sun.star.beans import PropertyValue
16
+ from com.sun.star.task import ErrorCodeIOException
17
+ from com.sun.star.connection import NoConnectException
18
+
19
+ FAMILY_TEXT = "Text"
20
+ FAMILY_WEB = "Web"
21
+ FAMILY_SPREADSHEET = "Spreadsheet"
22
+ FAMILY_PRESENTATION = "Presentation"
23
+ FAMILY_DRAWING = "Drawing"
24
+
25
+ #---------------------#
26
+ # Configuration Start #
27
+ #---------------------#
28
+
29
+ # see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
30
+
31
+ # most formats are auto-detected; only those requiring options are defined here
32
+ IMPORT_FILTER_MAP = {
33
+ "txt": {
34
+ "FilterName": "Text (encoded)",
35
+ "FilterOptions": "utf8"
36
+ },
37
+ "csv": {
38
+ "FilterName": "Text - txt - csv (StarCalc)",
39
+ "FilterOptions": "44,34,0"
40
+ }
41
+ }
42
+
43
+ EXPORT_FILTER_MAP = {
44
+ "pdf": {
45
+ FAMILY_TEXT: { "FilterName": "writer_pdf_Export" },
46
+ FAMILY_WEB: { "FilterName": "writer_web_pdf_Export" },
47
+ FAMILY_SPREADSHEET: { "FilterName": "calc_pdf_Export" },
48
+ FAMILY_PRESENTATION: { "FilterName": "impress_pdf_Export" },
49
+ FAMILY_DRAWING: { "FilterName": "draw_pdf_Export" }
50
+ },
51
+ "html": {
52
+ FAMILY_TEXT: { "FilterName": "HTML (StarWriter)" },
53
+ FAMILY_SPREADSHEET: { "FilterName": "HTML (StarCalc)" },
54
+ FAMILY_PRESENTATION: { "FilterName": "impress_html_Export" }
55
+ },
56
+ "odt": {
57
+ FAMILY_TEXT: { "FilterName": "writer8" },
58
+ FAMILY_WEB: { "FilterName": "writerweb8_writer" }
59
+ },
60
+ "doc": {
61
+ FAMILY_TEXT: { "FilterName": "MS Word 97" }
62
+ },
63
+ "rtf": {
64
+ FAMILY_TEXT: { "FilterName": "Rich Text Format" }
65
+ },
66
+ "txt": {
67
+ FAMILY_TEXT: {
68
+ "FilterName": "Text",
69
+ "FilterOptions": "utf8"
70
+ }
71
+ },
72
+ "ods": {
73
+ FAMILY_SPREADSHEET: { "FilterName": "calc8" }
74
+ },
75
+ "xls": {
76
+ FAMILY_SPREADSHEET: { "FilterName": "MS Excel 97" }
77
+ },
78
+ "csv": {
79
+ FAMILY_SPREADSHEET: {
80
+ "FilterName": "Text - txt - csv (StarCalc)",
81
+ "FilterOptions": "44,34,0"
82
+ }
83
+ },
84
+ "odp": {
85
+ FAMILY_PRESENTATION: { "FilterName": "impress8" }
86
+ },
87
+ "ppt": {
88
+ FAMILY_PRESENTATION: { "FilterName": "MS PowerPoint 97" }
89
+ },
90
+ "swf": {
91
+ FAMILY_DRAWING: { "FilterName": "draw_flash_Export" },
92
+ FAMILY_PRESENTATION: { "FilterName": "impress_flash_Export" }
93
+ }
94
+ }
95
+
96
+ PAGE_STYLE_OVERRIDE_PROPERTIES = {
97
+ FAMILY_SPREADSHEET: {
98
+ #--- Scale options: uncomment 1 of the 3 ---
99
+ # a) 'Reduce / enlarge printout': 'Scaling factor'
100
+ "PageScale": 100,
101
+ # b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
102
+ #"ScaleToPagesX": 1, "ScaleToPagesY": 1000,
103
+ # c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
104
+ #"ScaleToPages": 1,
105
+ "PrintGrid": False
106
+ }
107
+ }
108
+
109
+ #-------------------#
110
+ # Configuration End #
111
+ #-------------------#
112
+
113
class DocumentConversionException(Exception):
    """Raised when a document cannot be converted.

    The human-readable reason is available as ``message`` and is also what
    ``str()`` returns.
    """

    def __init__(self, message):
        # Initialise the base class too, so ``args`` (and therefore repr(),
        # pickling and logging) carry the message.
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return self.message
120
+
121
+
122
class DocumentConverter:
    """Converts documents between office formats through a running
    OpenOffice.org instance reached over the Python-UNO bridge.

    The syntax used here is valid on both Python 2.6+ and Python 3.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to OpenOffice.org listening on localhost at ``port``.

        Raises DocumentConversionException when the connection is refused.
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            raise DocumentConversionException("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

    def convert(self, inputFile, outputFile):
        """Load ``inputFile`` and store it as ``outputFile``.

        The output format is chosen from the extension of ``outputFile``.
        Raises DocumentConversionException when the document cannot be
        loaded, its family is unknown, or the target format is unsupported.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)

        loadProperties = { "Hidden": True }
        inputExt = self._getFileExt(inputFile)
        if inputExt in IMPORT_FILTER_MAP:
            loadProperties.update(IMPORT_FILTER_MAP[inputExt])

        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
        # NOTE(review): the UNO loader is documented to return NULL on failure;
        # report that clearly instead of failing later with AttributeError.
        if document is None:
            raise DocumentConversionException("failed to load input file: '%s'" % inputFile)

        # Everything after a successful load runs under try/finally so the
        # document is closed even when family detection or filter lookup fails.
        try:
            try:
                document.refresh()
            except AttributeError:
                # not every document type offers refresh()
                pass

            family = self._detectFamily(document)
            self._overridePageStyleProperties(document, family)

            outputExt = self._getFileExt(outputFile)
            storeProperties = self._getStoreProperties(document, outputExt)

            document.storeToURL(outputUrl, self._toProperties(storeProperties))
        finally:
            document.close(True)

    def _overridePageStyleProperties(self, document, family):
        """Apply PAGE_STYLE_OVERRIDE_PROPERTIES for ``family`` to every page style."""
        if family in PAGE_STYLE_OVERRIDE_PROPERTIES:
            properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
            pageStyles = document.getStyleFamilies().getByName('PageStyles')
            for styleName in pageStyles.getElementNames():
                pageStyle = pageStyles.getByName(styleName)
                for name, value in properties.items():
                    pageStyle.setPropertyValue(name, value)

    def _getStoreProperties(self, document, outputExt):
        """Return the export filter properties for this document and extension.

        Raises DocumentConversionException for an unknown extension or for a
        document family that cannot be exported to it.
        """
        family = self._detectFamily(document)
        try:
            propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
        except KeyError:
            raise DocumentConversionException("unknown output format: '%s'" % outputExt)
        try:
            return propertiesByFamily[family]
        except KeyError:
            raise DocumentConversionException("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Map the services the document supports to one of the FAMILY_* constants."""
        # WebDocument is tested before GenericTextDocument, as in the original order.
        if document.supportsService("com.sun.star.text.WebDocument"):
            return FAMILY_WEB
        if document.supportsService("com.sun.star.text.GenericTextDocument"):
            # must be TextDocument or GlobalDocument
            return FAMILY_TEXT
        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
            return FAMILY_SPREADSHEET
        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
            return FAMILY_PRESENTATION
        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
            return FAMILY_DRAWING
        raise DocumentConversionException("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of ``path`` without the dot ("" if none)."""
        # splitext always returns a string, so no None check is needed.
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of ``path``."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, dict):
        """Convert a name -> value mapping into a tuple of UNO PropertyValue objects."""
        props = []
        for key in dict:
            prop = PropertyValue()
            prop.Name = key
            prop.Value = dict[key]
            props.append(prop)
        return tuple(props)
210
+
211
+
212
# Command-line entry point: python DocumentConverter.py <input-file> <output-file>
# Exit status: 255 on bad usage, 1 on a missing input file or conversion error.
# The print(...) and "except ... as" forms below work on Python 2.6+ and Python 3.
if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except DocumentConversionException as exception:
        print("ERROR! " + str(exception))
        exit(1)
    except ErrorCodeIOException as exception:
        print("ERROR! ErrorCodeIOException %d" % exception.ErrCode)
        exit(1)
231
+
@@ -0,0 +1,82 @@
1
+ require 'tempfile'
2
+
3
# Mixin that extracts plain text from .docx, .doc, .pdf and .txt files by
# delegating to external command-line converters.
#
# Every public method raises ArgumentError ("Unknown file") when the path does
# not exist and ArgumentError ("Invalid file type") when the extension is not
# the one the method handles.
module DocPdfToText
  # NOTE(review): the gem's VERSION file says 0.0.3 while this constant says
  # 1.0.0 -- confirm which one is authoritative before changing either.
  VERSION = "1.0.0"
  ANTIWORD_PATH = "antiword"
  ODF_CONVERTER_PATH = "OdfConverter"
  PYTHON_PATH = "python"
  DOC_CONVERTER_PATH = File.join(File.dirname(__FILE__), "DocumentConverter.py")
  PDFTOTEXT_PATH = "pdftotext"

  # Converts +file_path+ to text, picking the converter from the extension.
  #
  # @param file_path [String] path to a .docx, .doc, .pdf or .txt file
  # @return [String] the extracted text
  # @raise [ArgumentError] if the file is missing or the extension is unsupported
  def file_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

    case File.extname(expanded_path)
    when ".docx" then docx_to_txt(file_path)
    when ".doc"  then doc_to_txt(file_path)
    when ".pdf"  then pdf_to_txt(file_path)
    when ".txt"  then read_txt_file(file_path)
    else raise ArgumentError, "Invalid file type"
    end
  end

  # Converts a .docx file to text in two steps: OdfConverter turns it into
  # .odt, then DocumentConverter.py turns the .odt into .txt.
  #
  # @param file_path [String] path to a .docx file
  # @return [String] the extracted text
  # @raise [ArgumentError] if the file is missing, is not .docx, or the
  #   converters produced no output file ("Unknown file")
  def docx_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".docx"

    # Tempfile only reserves unique names here; the converters write to
    # sibling paths carrying the extension they expect.
    tmp_odt = Tempfile.new("docx")
    tmp_odt_path = tmp_odt.path + ".odt"
    tmp_odt.close

    tmp_final = Tempfile.new("txt")
    tmp_final_path = tmp_final.path + ".txt"
    tmp_final.close

    begin
      docpdftotext_run(ODF_CONVERTER_PATH, "/LEVEL", "4", "/I", expanded_path, "/O", tmp_odt_path)
      docpdftotext_run(PYTHON_PATH, DOC_CONVERTER_PATH, tmp_odt_path, tmp_final_path)
      read_txt_file(tmp_final_path)
    ensure
      # These suffixed files are not managed by Tempfile, so remove them here.
      [tmp_odt_path, tmp_final_path].each do |path|
        File.delete(path) if File.exist?(path)
      end
    end
  end

  # Returns the complete contents of a text file.
  #
  # @param file_path [String] path to the file
  # @return [String] the whole file (the original returned only its first line)
  # @raise [ArgumentError] if the file does not exist
  def read_txt_file(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

    File.read(expanded_path)
  end

  # Converts a .doc file to text with antiword.
  #
  # @param file_path [String] path to a .doc file
  # @return [String] antiword's standard output
  # @raise [ArgumentError] if the file is missing or is not .doc
  def doc_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".doc"

    docpdftotext_run(ANTIWORD_PATH, expanded_path)
  end

  # Converts a .pdf file to text with pdftotext.
  #
  # @param file_path [String] path to a .pdf file
  # @return [String] the text pdftotext wrote to a temporary file
  # @raise [ArgumentError] if the file is missing or is not .pdf
  def pdf_to_txt(file_path)
    expanded_path = File.expand_path(file_path)
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".pdf"

    tmp = Tempfile.new("pdf")
    tmp_path = tmp.path
    tmp.close # closed so pdftotext can write to the same path

    begin
      docpdftotext_run(PDFTOTEXT_PATH, expanded_path, tmp_path)
      read_txt_file(tmp_path)
    ensure
      tmp.unlink # remove the temporary file now instead of waiting for GC
    end
  end

  private

  # Runs an external command and returns its standard output.
  #
  # The command is passed as an argument vector rather than one interpolated
  # string, so no shell is involved and paths containing spaces or shell
  # metacharacters reach the program unchanged.
  def docpdftotext_run(*argv)
    IO.popen(argv) { |io| io.read }
  end
end
@@ -1,9 +1,13 @@
1
1
  require 'test_helper'
2
- require 'tempfile'
3
2
 
4
3
  class DocPdfToTextTest < Test::Unit::TestCase
5
4
  include DocPdfToText
6
5
 
6
+ should "Convert a pdf file" do
7
+ test_file = File.join(File.dirname(__FILE__), "test.pdf")
8
+ assert(file_to_txt(test_file).length > 0)
9
+ end
10
+
7
11
  should "Convert a docx file" do
8
12
  test_file = File.join(File.dirname(__FILE__), "test.docx")
9
13
  assert(file_to_txt(test_file).length > 0)
@@ -14,11 +18,6 @@ class DocPdfToTextTest < Test::Unit::TestCase
14
18
  assert(file_to_txt(test_file).length > 0)
15
19
  end
16
20
 
17
- should "Convert a pdf file" do
18
- test_file = File.join(File.dirname(__FILE__), "test.pdf")
19
- assert(file_to_txt(test_file).length > 0)
20
- end
21
-
22
21
  should "raise invalid file format" do
23
22
  assert_raise ArgumentError do
24
23
  test_file = File.join(File.dirname(__FILE__), "test.pdf")
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docpdftotext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - esilverberg
@@ -22,16 +22,6 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: "0"
24
24
  version:
25
- - !ruby/object:Gem::Dependency
26
- name: pdf-reader
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: "0"
34
- version:
35
25
  description: wrappers for libraries to convert documents into text
36
26
  email: eric@ericsilverberg.com
37
27
  executables: []
@@ -42,8 +32,20 @@ extra_rdoc_files:
42
32
  - LICENSE
43
33
  - README.rdoc
44
34
  files:
35
+ - .document
36
+ - .gitignore
45
37
  - LICENSE
46
38
  - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - docpdftotext.gemspec
42
+ - lib/DocumentConverter.py
43
+ - lib/docpdftotext.rb
44
+ - test/docpdftotext_test.rb
45
+ - test/test.doc
46
+ - test/test.docx
47
+ - test/test.pdf
48
+ - test/test_helper.rb
47
49
  has_rdoc: true
48
50
  homepage: http://github.com/esilverberg/docpdftotext
49
51
  licenses: []
@@ -73,5 +75,5 @@ signing_key:
73
75
  specification_version: 3
74
76
  summary: Convert word to text in ruby
75
77
  test_files:
76
- - test/test_helper.rb
77
78
  - test/docpdftotext_test.rb
79
+ - test/test_helper.rb