docpdftotext 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,17 @@
1
+ log/*
2
+ tmp/**/*
3
+ doc/api
4
+ doc/app
5
+ doc/plugins
6
+ *~
7
+ config/keys
8
+ /public/images/Thumbs.db
9
+ Thumbs.db
10
+ public/system
11
+ public/demos
12
+ *.sw?
13
+ .DS_Store
14
+ coverage
15
+ rdoc
16
+ pkg
17
+ *~
@@ -9,7 +9,7 @@ This gem enables you to interact with document conversion libraries through Rail
9
9
 
10
10
  == Requirements
11
11
  * Antiword: http://www.winfield.demon.nl/
12
- * pdf-reader: http://github.com/yob/pdf-reader
12
+ * pdftotext: http://packages.ubuntu.com/hardy/poppler-utils
13
13
  * OdfConverter: http://www.oooninja.com/2008/01/convert-openxml-docx-etc-in-linux-using.html
14
14
  * Openoffice-headless: http://wiki.alfresco.com/wiki/Running_OpenOffice_From_Terminal
15
15
  * DocumentConverter.py (included): http://artofsolving.com/opensource/pyodconverter
@@ -0,0 +1,57 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint and carry on so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "docpdftotext"
    gem.summary = %Q{Convert word to text in ruby}
    gem.description = %Q{wrappers for libraries to convert documents into text}
    gem.email = "eric@ericsilverberg.com"
    gem.homepage = "http://github.com/esilverberg/docpdftotext"
    gem.authors = ["esilverberg"]
    gem.add_development_dependency "thoughtbot-shoulda"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub :rcov task explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined in this file (presumably it comes from
# Jeweler -- TODO confirm). Only hook it in when it exists, so `rake test`
# still works after the Jeweler LoadError branch above has been taken.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Prefer the RDoc-provided task; fall back to the legacy Rake one on old setups.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip: the VERSION file ends with a newline that must not leak into the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "docpdftotext #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,57 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{docpdftotext}
8
+ s.version = "0.0.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["esilverberg"]
12
+ s.date = %q{2009-11-23}
13
+ s.description = %q{wrappers for libraries to convert documents into text}
14
+ s.email = %q{eric@ericsilverberg.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "docpdftotext.gemspec",
27
+ "lib/DocumentConverter.py",
28
+ "lib/docpdftotext.rb",
29
+ "test/docpdftotext_test.rb",
30
+ "test/test.doc",
31
+ "test/test.docx",
32
+ "test/test.pdf",
33
+ "test/test_helper.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/esilverberg/docpdftotext}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.5}
39
+ s.summary = %q{Convert word to text in ruby}
40
+ s.test_files = [
41
+ "test/docpdftotext_test.rb",
42
+ "test/test_helper.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
50
+ s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
51
+ else
52
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
56
+ end
57
+ end
@@ -0,0 +1,231 @@
1
+ #
2
+ # PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
3
+ #
4
+ # This script converts a document from one office format to another by
5
+ # connecting to an OpenOffice.org instance via Python-UNO bridge.
6
+ #
7
+ # Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
8
+ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
9
+ # - or any later version.
10
+ #
11
+ DEFAULT_OPENOFFICE_PORT = 8100
12
+
13
+ import uno
14
+ from os.path import abspath, isfile, splitext
15
+ from com.sun.star.beans import PropertyValue
16
+ from com.sun.star.task import ErrorCodeIOException
17
+ from com.sun.star.connection import NoConnectException
18
+
19
# Document families; used as the second-level keys of the tables below.
FAMILY_TEXT = "Text"
FAMILY_WEB = "Web"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

#---------------------#
# Configuration Start #
#---------------------#

# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter

# Load-time properties keyed by input extension.
# most formats are auto-detected; only those requiring options are defined here
IMPORT_FILTER_MAP = {
    "txt": dict(FilterName="Text (encoded)", FilterOptions="utf8"),
    "csv": dict(FilterName="Text - txt - csv (StarCalc)", FilterOptions="44,34,0"),
}

# Store-time properties: output extension -> document family -> properties.
EXPORT_FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: dict(FilterName="writer_pdf_Export"),
        FAMILY_WEB: dict(FilterName="writer_web_pdf_Export"),
        FAMILY_SPREADSHEET: dict(FilterName="calc_pdf_Export"),
        FAMILY_PRESENTATION: dict(FilterName="impress_pdf_Export"),
        FAMILY_DRAWING: dict(FilterName="draw_pdf_Export"),
    },
    "html": {
        FAMILY_TEXT: dict(FilterName="HTML (StarWriter)"),
        FAMILY_SPREADSHEET: dict(FilterName="HTML (StarCalc)"),
        FAMILY_PRESENTATION: dict(FilterName="impress_html_Export"),
    },
    "odt": {
        FAMILY_TEXT: dict(FilterName="writer8"),
        FAMILY_WEB: dict(FilterName="writerweb8_writer"),
    },
    "doc": {FAMILY_TEXT: dict(FilterName="MS Word 97")},
    "rtf": {FAMILY_TEXT: dict(FilterName="Rich Text Format")},
    "txt": {FAMILY_TEXT: dict(FilterName="Text", FilterOptions="utf8")},
    "ods": {FAMILY_SPREADSHEET: dict(FilterName="calc8")},
    "xls": {FAMILY_SPREADSHEET: dict(FilterName="MS Excel 97")},
    "csv": {
        FAMILY_SPREADSHEET: dict(
            FilterName="Text - txt - csv (StarCalc)",
            FilterOptions="44,34,0",
        ),
    },
    "odp": {FAMILY_PRESENTATION: dict(FilterName="impress8")},
    "ppt": {FAMILY_PRESENTATION: dict(FilterName="MS PowerPoint 97")},
    "swf": {
        FAMILY_DRAWING: dict(FilterName="draw_flash_Export"),
        FAMILY_PRESENTATION: dict(FilterName="impress_flash_Export"),
    },
}

# Page-style properties applied to every page style of a document, per family.
PAGE_STYLE_OVERRIDE_PROPERTIES = {
    FAMILY_SPREADSHEET: dict(
        #--- Scale options: uncomment 1 of the 3 ---
        # a) 'Reduce / enlarge printout': 'Scaling factor'
        PageScale=100,
        # b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
        #ScaleToPagesX=1, ScaleToPagesY=1000,
        # c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
        #ScaleToPages=1,
        PrintGrid=False,
    ),
}
108
+
109
+ #-------------------#
110
+ # Configuration End #
111
+ #-------------------#
112
+
113
class DocumentConversionException(Exception):
    """Raised when a document cannot be loaded, identified or exported.

    The human-readable reason is available as ``message`` and via ``str()``.
    """

    def __init__(self, message):
        # Forward to the base class so ``args``, ``repr()`` and pickling carry
        # the message too, instead of an empty argument tuple.
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return self.message
120
+
121
+
122
class DocumentConverter:
    """Converts documents between office formats through an OpenOffice.org
    instance reached over a UNO socket on localhost.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to the office instance listening on ``port``.

        Raises DocumentConversionException when no connection can be made.
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            raise DocumentConversionException("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

    def convert(self, inputFile, outputFile):
        """Load ``inputFile`` and store it as ``outputFile``.

        The output format is chosen from the extension of ``outputFile``.
        Raises DocumentConversionException when the input cannot be loaded,
        the output format is unknown, or the conversion is unsupported for the
        detected document family.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)

        loadProperties = { "Hidden": True }
        inputExt = self._getFileExt(inputFile)
        if inputExt in IMPORT_FILTER_MAP:
            loadProperties.update(IMPORT_FILTER_MAP[inputExt])

        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
        if document is None:
            # Report a failed load as a conversion error rather than letting a
            # later attribute access on None blow up with AttributeError.
            raise DocumentConversionException("failed to load input document: %s" % inputFile)

        # Everything after a successful load is guarded, so the document is
        # closed even when family detection or filter lookup fails.
        try:
            try:
                document.refresh()
            except AttributeError:
                # not every document type offers refresh()
                pass

            family = self._detectFamily(document)
            self._overridePageStyleProperties(document, family)

            outputExt = self._getFileExt(outputFile)
            storeProperties = self._getStoreProperties(document, outputExt)

            document.storeToURL(outputUrl, self._toProperties(storeProperties))
        finally:
            document.close(True)

    def _overridePageStyleProperties(self, document, family):
        """Apply PAGE_STYLE_OVERRIDE_PROPERTIES[family], if any, to every page style."""
        if family in PAGE_STYLE_OVERRIDE_PROPERTIES:
            properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
            pageStyles = document.getStyleFamilies().getByName('PageStyles')
            for styleName in pageStyles.getElementNames():
                pageStyle = pageStyles.getByName(styleName)
                for name, value in properties.items():
                    pageStyle.setPropertyValue(name, value)

    def _getStoreProperties(self, document, outputExt):
        """Return the export properties for ``outputExt`` and the document's family.

        Raises DocumentConversionException for an unknown extension or a
        family/extension pair missing from EXPORT_FILTER_MAP.
        """
        family = self._detectFamily(document)
        try:
            propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
        except KeyError:
            raise DocumentConversionException("unknown output format: '%s'" % outputExt)
        try:
            return propertiesByFamily[family]
        except KeyError:
            raise DocumentConversionException("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Map the services the document supports onto one of the FAMILY_* constants."""
        # WebDocument is tested before GenericTextDocument, as in the original order
        if document.supportsService("com.sun.star.text.WebDocument"):
            return FAMILY_WEB
        if document.supportsService("com.sun.star.text.GenericTextDocument"):
            # must be TextDocument or GlobalDocument
            return FAMILY_TEXT
        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
            return FAMILY_SPREADSHEET
        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
            return FAMILY_PRESENTATION
        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
            return FAMILY_DRAWING
        raise DocumentConversionException("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of ``path`` without the dot ('' if none)."""
        # splitext always returns a string, so no None check is needed
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Turn a (possibly relative) filesystem path into a file URL."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, dict):
        """Convert a mapping of names to values into a tuple of PropertyValue objects."""
        props = []
        for key in dict:
            prop = PropertyValue()
            prop.Name = key
            prop.Value = dict[key]
            props.append(prop)
        return tuple(props)
210
+
211
+
212
# Command-line entry point: python DocumentConverter.py <input-file> <output-file>
# Exit status: 255 on wrong usage, 1 on a missing input file or a failed conversion.
if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 3:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except DocumentConversionException as exception:
        print("ERROR! " + str(exception))
        exit(1)
    except ErrorCodeIOException as exception:
        print("ERROR! ErrorCodeIOException %d" % exception.ErrCode)
        exit(1)
231
+
@@ -0,0 +1,82 @@
1
+ require 'tempfile'
2
+
3
# Mixin that extracts plain text from .docx, .doc, .pdf and .txt files by
# driving external command-line converters.
module DocPdfToText
  # NOTE(review): this does not match the gem version (0.0.3) recorded in the
  # VERSION file and gemspec -- confirm which one is authoritative.
  VERSION = "1.0.0"
  ANTIWORD_PATH = "antiword"
  ODF_CONVERTER_PATH = "OdfConverter"
  PYTHON_PATH = "python"
  DOC_CONVERTER_PATH = File.join(File.dirname(__FILE__), "DocumentConverter.py")
  PDFTOTEXT_PATH = "pdftotext"

  # Converts the file at +file_path+ to text, choosing the converter from the
  # file extension.
  #
  # Returns the extracted text as a String.
  # Raises ArgumentError ("Unknown file") when the file does not exist and
  # ArgumentError ("Invalid file type") for any other extension.
  def file_to_txt(file_path)
    expanded_path = validated_path(file_path)

    case File.extname(expanded_path)
    when ".docx" then docx_to_txt(file_path)
    when ".doc"  then doc_to_txt(file_path)
    when ".pdf"  then pdf_to_txt(file_path)
    when ".txt"  then read_txt_file(file_path)
    else raise ArgumentError, "Invalid file type"
    end
  end

  # Converts a .docx file to text in two steps: OdfConverter turns it into a
  # temporary .odt, then DocumentConverter.py turns that into a temporary .txt
  # whose content is returned. Both temporary files are removed afterwards.
  #
  # Raises ArgumentError for a missing file or a non-.docx extension.
  def docx_to_txt(file_path)
    expanded_path = validated_path(file_path, ".docx")

    with_scratch_path("docx", ".odt") do |odt_path|
      run_converter(ODF_CONVERTER_PATH, "/LEVEL", "4", "/I", expanded_path, "/O", odt_path)

      with_scratch_path("txt", ".txt") do |txt_path|
        run_converter(PYTHON_PATH, DOC_CONVERTER_PATH, odt_path, txt_path)
        read_txt_file(txt_path)
      end
    end
  end

  # Returns the complete content of the text file at +file_path+.
  #
  # Raises ArgumentError ("Unknown file") when the file does not exist.
  def read_txt_file(file_path)
    # Read the whole file: a single +gets+ would return only the first line.
    File.read(validated_path(file_path))
  end

  # Converts a .doc file to text and returns antiword's standard output.
  #
  # Raises ArgumentError for a missing file or a non-.doc extension.
  def doc_to_txt(file_path)
    run_converter(ANTIWORD_PATH, validated_path(file_path, ".doc"))
  end

  # Converts a .pdf file to text by having pdftotext write into a temporary
  # file, which is read back and then removed.
  #
  # Raises ArgumentError for a missing file or a non-.pdf extension.
  def pdf_to_txt(file_path)
    expanded_path = validated_path(file_path, ".pdf")

    with_scratch_path("pdf", ".txt") do |txt_path|
      run_converter(PDFTOTEXT_PATH, expanded_path, txt_path)
      read_txt_file(txt_path)
    end
  end

  private

  # Expands +file_path+ and checks that it exists and, when +required_ext+ is
  # given, that it carries exactly that extension. Returns the expanded path.
  def validated_path(file_path, required_ext = nil)
    expanded_path = File.expand_path(file_path)
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2
    raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
    if required_ext && File.extname(expanded_path) != required_ext
      raise ArgumentError, "Invalid file type"
    end

    expanded_path
  end

  # Runs an external program and returns its standard output. The argument
  # vector is handed over as an array, so no shell is involved: paths that
  # contain spaces or shell metacharacters reach the program unchanged.
  def run_converter(*argv)
    IO.popen(argv) { |io| io.read }
  end

  # Yields a fresh path (a unique Tempfile name plus +suffix+) that a converter
  # can write to, returns the block's value, and always deletes both the
  # yielded path and the placeholder Tempfile afterwards.
  def with_scratch_path(prefix, suffix)
    placeholder = Tempfile.new(prefix)
    scratch_path = placeholder.path + suffix
    placeholder.close # only the unique name is needed; the tool writes scratch_path
    yield scratch_path
  ensure
    File.delete(scratch_path) if scratch_path && File.exist?(scratch_path)
    placeholder.unlink if placeholder
  end
end
@@ -1,9 +1,13 @@
1
1
  require 'test_helper'
2
- require 'tempfile'
3
2
 
4
3
  class DocPdfToTextTest < Test::Unit::TestCase
5
4
  include DocPdfToText
6
5
 
6
+ should "Convert a pdf file" do
7
+ test_file = File.join(File.dirname(__FILE__), "test.pdf")
8
+ assert(file_to_txt(test_file).length > 0)
9
+ end
10
+
7
11
  should "Convert a docx file" do
8
12
  test_file = File.join(File.dirname(__FILE__), "test.docx")
9
13
  assert(file_to_txt(test_file).length > 0)
@@ -14,11 +18,6 @@ class DocPdfToTextTest < Test::Unit::TestCase
14
18
  assert(file_to_txt(test_file).length > 0)
15
19
  end
16
20
 
17
- should "Convert a pdf file" do
18
- test_file = File.join(File.dirname(__FILE__), "test.pdf")
19
- assert(file_to_txt(test_file).length > 0)
20
- end
21
-
22
21
  should "raise invalid file format" do
23
22
  assert_raise ArgumentError do
24
23
  test_file = File.join(File.dirname(__FILE__), "test.pdf")
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docpdftotext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - esilverberg
@@ -22,16 +22,6 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: "0"
24
24
  version:
25
- - !ruby/object:Gem::Dependency
26
- name: pdf-reader
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: "0"
34
- version:
35
25
  description: wrappers for libraries to convert documents into text
36
26
  email: eric@ericsilverberg.com
37
27
  executables: []
@@ -42,8 +32,20 @@ extra_rdoc_files:
42
32
  - LICENSE
43
33
  - README.rdoc
44
34
  files:
35
+ - .document
36
+ - .gitignore
45
37
  - LICENSE
46
38
  - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - docpdftotext.gemspec
42
+ - lib/DocumentConverter.py
43
+ - lib/docpdftotext.rb
44
+ - test/docpdftotext_test.rb
45
+ - test/test.doc
46
+ - test/test.docx
47
+ - test/test.pdf
48
+ - test/test_helper.rb
47
49
  has_rdoc: true
48
50
  homepage: http://github.com/esilverberg/docpdftotext
49
51
  licenses: []
@@ -73,5 +75,5 @@ signing_key:
73
75
  specification_version: 3
74
76
  summary: Convert word to text in ruby
75
77
  test_files:
76
- - test/test_helper.rb
77
78
  - test/docpdftotext_test.rb
79
+ - test/test_helper.rb