docsplit 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +1 -1
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/pdf_extractor.rb +22 -4
- metadata +1 -1
data/docsplit.gemspec
CHANGED
data/lib/docsplit.rb
CHANGED
@@ -4,6 +4,7 @@ module Docsplit
|
|
4
4
|
class PdfExtractor
|
5
5
|
@@executable = nil
|
6
6
|
|
7
|
+
# Provide a set of helper functions to determine the OS.
|
7
8
|
# Host OS name from the build configuration; prefer RbConfig and fall back to the legacy Config constant only when RbConfig is not defined.
HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
|
8
9
|
def windows?
|
9
10
|
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
@@ -15,18 +16,23 @@ module Docsplit
|
|
15
16
|
!!HOST_OS.match(/linux/i)
|
16
17
|
end
|
17
18
|
|
19
|
+
# The first line of the help output holds the name and version number
|
20
|
+
# of the office software to be used for extraction.
|
18
21
|
def version_string
|
19
22
|
@@help ||= `#{office_executable} -h 2>&1`.split("\n").first
|
20
23
|
end
|
21
|
-
|
22
24
|
def libre_office?
|
23
25
|
!!version_string.match(/^LibreOffice/)
|
24
26
|
end
|
25
|
-
|
26
27
|
def open_office?
|
27
28
|
!!version_string.match(/^OpenOffice.org/)
|
28
29
|
end
|
29
30
|
|
31
|
+
# A set of default locations to search for office software
|
32
|
+
# These have been extracted from JODConverter. Each listed
|
33
|
+
# path should contain a directory "program" which in turn
|
34
|
+
# contains the "soffice" executable.
|
35
|
+
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
|
30
36
|
def office_search_paths
|
31
37
|
if windows?
|
32
38
|
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
@@ -48,14 +54,19 @@ module Docsplit
|
|
48
54
|
search_paths
|
49
55
|
end
|
50
56
|
|
57
|
+
# Identify the path to a working office executable.
|
51
58
|
def office_executable
|
52
59
|
paths = office_search_paths
|
53
60
|
|
61
|
+
# If an OFFICE_PATH has been specified in the environment,
|
62
|
+
# raise an error if that path isn't valid, otherwise, add
|
63
|
+
# it to the front of our search paths.
|
54
64
|
if ENV['OFFICE_PATH']
|
55
65
|
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
|
56
66
|
paths.unshift(ENV['OFFICE_PATH'])
|
57
67
|
end
|
58
68
|
|
69
|
+
# The location of the office executable is OS dependent
|
59
70
|
path_pieces = ["soffice"]
|
60
71
|
if windows?
|
61
72
|
path_pieces += [["program", "soffice.bin"]]
|
@@ -65,6 +76,8 @@ module Docsplit
|
|
65
76
|
path_pieces += [["program", "soffice"]]
|
66
77
|
end
|
67
78
|
|
79
|
+
# Search for the first suitable office executable
|
80
|
+
# and short-circuit once an executable is found.
|
68
81
|
paths.each do |path|
|
69
82
|
if File.exists? path
|
70
83
|
@@executable ||= path unless File.directory? path
|
@@ -79,10 +92,12 @@ module Docsplit
|
|
79
92
|
@@executable
|
80
93
|
end
|
81
94
|
|
95
|
+
# Used to specify the office location for JODConverter
|
82
96
|
def office_path
|
83
97
|
File.dirname(File.dirname(office_executable))
|
84
98
|
end
|
85
99
|
|
100
|
+
# Convert documents to PDF.
|
86
101
|
def extract(docs, opts)
|
87
102
|
out = opts[:output] || '.'
|
88
103
|
FileUtils.mkdir_p out unless File.exists?(out)
|
@@ -95,12 +110,15 @@ module Docsplit
|
|
95
110
|
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
96
111
|
else
|
97
112
|
if libre_office?
|
98
|
-
|
113
|
+
# Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
|
114
|
+
ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
|
115
|
+
|
116
|
+
options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
|
99
117
|
cmd = "#{office_executable} #{options} 2>&1"
|
100
118
|
result = `#{cmd}`.chomp
|
101
119
|
raise ExtractionFailed, result if $? != 0
|
102
120
|
true
|
103
|
-
else # open office presumably
|
121
|
+
else # open office presumably, rely on JODConverter to figure it out.
|
104
122
|
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
105
123
|
run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
106
124
|
end
|