docsplit 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.1' # Keep version in sync with docsplit.rb
3
+ s.version = '0.7.2' # Keep version in sync with docsplit.rb
4
4
  s.date = '2013-02-21'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
data/lib/docsplit.rb CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.1' # Keep in sync with gemspec.
8
+ VERSION = '0.7.2' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -4,6 +4,7 @@ module Docsplit
4
4
  class PdfExtractor
5
5
  @@executable = nil
6
6
 
7
+ # Provide a set of helper functions to determine the OS.
7
8
  HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
8
9
  def windows?
9
10
  !!HOST_OS.match(/mswin|windows|cygwin/i)
@@ -15,18 +16,23 @@ module Docsplit
15
16
  !!HOST_OS.match(/linux/i)
16
17
  end
17
18
 
19
+ # The first line of the help output holds the name and version number
20
+ # of the office software to be used for extraction.
18
21
  def version_string
19
22
  @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
20
23
  end
21
-
22
24
  def libre_office?
23
25
  !!version_string.match(/^LibreOffice/)
24
26
  end
25
-
26
27
  def open_office?
27
28
  !!version_string.match(/^OpenOffice.org/)
28
29
  end
29
30
 
31
+ # A set of default locations to search for office software.
32
+ # These have been extracted from JODConverter. Each listed
33
+ # path should contain a directory "program" which in turn
34
+ # contains the "soffice" executable.
35
+ # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
30
36
  def office_search_paths
31
37
  if windows?
32
38
  office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
@@ -48,14 +54,19 @@ module Docsplit
48
54
  search_paths
49
55
  end
50
56
 
57
+ # Identify the path to a working office executable.
51
58
  def office_executable
52
59
  paths = office_search_paths
53
60
 
61
+ # If an OFFICE_PATH environment variable has been set,
62
+ # raise an error if that path isn't valid; otherwise, add
63
+ # it to the front of our search paths.
54
64
  if ENV['OFFICE_PATH']
55
65
  raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
56
66
  paths.unshift(ENV['OFFICE_PATH'])
57
67
  end
58
68
 
69
+ # The location of the office executable is OS dependent
59
70
  path_pieces = ["soffice"]
60
71
  if windows?
61
72
  path_pieces += [["program", "soffice.bin"]]
@@ -65,6 +76,8 @@ module Docsplit
65
76
  path_pieces += [["program", "soffice"]]
66
77
  end
67
78
 
79
+ # Search for the first suitable office executable
80
+ # and short circuit once an executable is found.
68
81
  paths.each do |path|
69
82
  if File.exists? path
70
83
  @@executable ||= path unless File.directory? path
@@ -79,10 +92,12 @@ module Docsplit
79
92
  @@executable
80
93
  end
81
94
 
95
+ # Used to specify the office location for JODConverter
82
96
  def office_path
83
97
  File.dirname(File.dirname(office_executable))
84
98
  end
85
99
 
100
+ # Convert documents to PDF.
86
101
  def extract(docs, opts)
87
102
  out = opts[:output] || '.'
88
103
  FileUtils.mkdir_p out unless File.exists?(out)
@@ -95,12 +110,15 @@ module Docsplit
95
110
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
96
111
  else
97
112
  if libre_office?
98
- options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
113
+ # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
114
+ ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
115
+
116
+ options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
99
117
  cmd = "#{office_executable} #{options} 2>&1"
100
118
  result = `#{cmd}`.chomp
101
119
  raise ExtractionFailed, result if $? != 0
102
120
  true
103
- else # open office presumably
121
+ else # open office presumably, rely on JODConverter to figure it out.
104
122
  options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
105
123
  run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
106
124
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: