docsplit 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.1' # Keep version in sync with docsplit.rb
3
+ s.version = '0.7.2' # Keep version in sync with docsplit.rb
4
4
  s.date = '2013-02-21'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
data/lib/docsplit.rb CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.1' # Keep in sync with gemspec.
8
+ VERSION = '0.7.2' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -4,6 +4,7 @@ module Docsplit
4
4
  class PdfExtractor
5
5
  @@executable = nil
6
6
 
7
+ # Provide a set of helper functions to determine the OS.
7
8
  HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
8
9
  def windows?
9
10
  !!HOST_OS.match(/mswin|windows|cygwin/i)
@@ -15,18 +16,23 @@ module Docsplit
15
16
  !!HOST_OS.match(/linux/i)
16
17
  end
17
18
 
19
+ # The first line of the help output holds the name and version number
20
+ # of the office software to be used for extraction.
18
21
  def version_string
19
22
  @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
20
23
  end
21
-
22
24
  def libre_office?
23
25
  !!version_string.match(/^LibreOffice/)
24
26
  end
25
-
26
27
  def open_office?
27
28
  !!version_string.match(/^OpenOffice.org/)
28
29
  end
29
30
 
31
+ # A set of default locations to search for office software.
32
+ # These have been extracted from JODConverter. Each listed
33
+ # path should contain a directory "program" which in turn
34
+ # contains the "soffice" executable.
35
+ # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
30
36
  def office_search_paths
31
37
  if windows?
32
38
  office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
@@ -48,14 +54,19 @@ module Docsplit
48
54
  search_paths
49
55
  end
50
56
 
57
+ # Identify the path to a working office executable.
51
58
  def office_executable
52
59
  paths = office_search_paths
53
60
 
61
+ # If an OFFICE_PATH has been specified in the environment,
62
+ # raise an error if that path isn't valid, otherwise, add
63
+ # it to the front of our search paths.
54
64
  if ENV['OFFICE_PATH']
55
65
  raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
56
66
  paths.unshift(ENV['OFFICE_PATH'])
57
67
  end
58
68
 
69
+ # The location of the office executable is OS dependent
59
70
  path_pieces = ["soffice"]
60
71
  if windows?
61
72
  path_pieces += [["program", "soffice.bin"]]
@@ -65,6 +76,8 @@ module Docsplit
65
76
  path_pieces += [["program", "soffice"]]
66
77
  end
67
78
 
79
+ # Search for the first suitable office executable
80
+ # and short circuit once an executable is found.
68
81
  paths.each do |path|
69
82
  if File.exists? path
70
83
  @@executable ||= path unless File.directory? path
@@ -79,10 +92,12 @@ module Docsplit
79
92
  @@executable
80
93
  end
81
94
 
95
+ # Used to specify the office location for JODConverter
82
96
  def office_path
83
97
  File.dirname(File.dirname(office_executable))
84
98
  end
85
99
 
100
+ # Convert documents to PDF.
86
101
  def extract(docs, opts)
87
102
  out = opts[:output] || '.'
88
103
  FileUtils.mkdir_p out unless File.exists?(out)
@@ -95,12 +110,15 @@ module Docsplit
95
110
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
96
111
  else
97
112
  if libre_office?
98
- options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
113
+ # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
114
+ ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
115
+
116
+ options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
99
117
  cmd = "#{office_executable} #{options} 2>&1"
100
118
  result = `#{cmd}`.chomp
101
119
  raise ExtractionFailed, result if $? != 0
102
120
  true
103
- else # open office presumably
121
+ else # open office presumably, rely on JODConverter to figure it out.
104
122
  options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
105
123
  run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
106
124
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: