docsplit 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +1 -1
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/pdf_extractor.rb +22 -4
- metadata +1 -1
data/docsplit.gemspec
CHANGED
data/lib/docsplit.rb
CHANGED
@@ -4,6 +4,7 @@ module Docsplit
|
|
4
4
|
class PdfExtractor
|
5
5
|
@@executable = nil
|
6
6
|
|
7
|
+
# Provide a set of helper functions to determine the OS.
|
7
8
|
HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
|
8
9
|
def windows?
|
9
10
|
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
@@ -15,18 +16,23 @@ module Docsplit
|
|
15
16
|
!!HOST_OS.match(/linux/i)
|
16
17
|
end
|
17
18
|
|
19
|
+
# The first line of the help output holds the name and version number
|
20
|
+
# of the office software to be used for extraction.
|
18
21
|
def version_string
|
19
22
|
@@help ||= `#{office_executable} -h 2>&1`.split("\n").first
|
20
23
|
end
|
21
|
-
|
22
24
|
def libre_office?
|
23
25
|
!!version_string.match(/^LibreOffice/)
|
24
26
|
end
|
25
|
-
|
26
27
|
def open_office?
|
27
28
|
!!version_string.match(/^OpenOffice.org/)
|
28
29
|
end
|
29
30
|
|
31
|
+
# A set of default locations to search for office software.
|
32
|
+
# These have been extracted from JODConverter. Each listed
|
33
|
+
# path should contain a directory "program" which in turn
|
34
|
+
# contains the "soffice" executable.
|
35
|
+
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
|
30
36
|
def office_search_paths
|
31
37
|
if windows?
|
32
38
|
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
@@ -48,14 +54,19 @@ module Docsplit
|
|
48
54
|
search_paths
|
49
55
|
end
|
50
56
|
|
57
|
+
# Identify the path to a working office executable.
|
51
58
|
def office_executable
|
52
59
|
paths = office_search_paths
|
53
60
|
|
61
|
+
# If an OFFICE_PATH has been specified on the command line
|
62
|
+
# raise an error if that path isn't valid, otherwise, add
|
63
|
+
# it to the front of our search paths.
|
54
64
|
if ENV['OFFICE_PATH']
|
55
65
|
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
|
56
66
|
paths.unshift(ENV['OFFICE_PATH'])
|
57
67
|
end
|
58
68
|
|
69
|
+
# The location of the office executable is OS dependent
|
59
70
|
path_pieces = ["soffice"]
|
60
71
|
if windows?
|
61
72
|
path_pieces += [["program", "soffice.bin"]]
|
@@ -65,6 +76,8 @@ module Docsplit
|
|
65
76
|
path_pieces += [["program", "soffice"]]
|
66
77
|
end
|
67
78
|
|
79
|
+
# Search for the first suitable office executable
|
80
|
+
# and short circuit once an executable is found.
|
68
81
|
paths.each do |path|
|
69
82
|
if File.exists? path
|
70
83
|
@@executable ||= path unless File.directory? path
|
@@ -79,10 +92,12 @@ module Docsplit
|
|
79
92
|
@@executable
|
80
93
|
end
|
81
94
|
|
95
|
+
# Used to specify the office location for JODConverter
|
82
96
|
def office_path
|
83
97
|
File.dirname(File.dirname(office_executable))
|
84
98
|
end
|
85
99
|
|
100
|
+
# Convert documents to PDF.
|
86
101
|
def extract(docs, opts)
|
87
102
|
out = opts[:output] || '.'
|
88
103
|
FileUtils.mkdir_p out unless File.exists?(out)
|
@@ -95,12 +110,15 @@ module Docsplit
|
|
95
110
|
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
96
111
|
else
|
97
112
|
if libre_office?
|
98
|
-
|
113
|
+
# Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
|
114
|
+
ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
|
115
|
+
|
116
|
+
options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
|
99
117
|
cmd = "#{office_executable} #{options} 2>&1"
|
100
118
|
result = `#{cmd}`.chomp
|
101
119
|
raise ExtractionFailed, result if $? != 0
|
102
120
|
true
|
103
|
-
else # open office presumably
|
121
|
+
else # open office presumably, rely on JODConverter to figure it out.
|
104
122
|
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
105
123
|
run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
106
124
|
end
|