docsplit 0.6.4 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.4' # Keep version in sync with docsplit.rb
4
- s.date = '2012-11-12'
3
+ s.version = '0.7.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2013-02-21'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
13
13
  EOS
14
14
 
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
- s.email = 'jeremy@documentcloud.org'
16
+ s.email = 'opensource@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
18
 
19
19
  s.require_paths = ['lib']
data/lib/docsplit.rb CHANGED
@@ -5,24 +5,13 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.6.4' # Keep in sync with gemspec.
8
+ VERSION = '0.7.0' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
12
12
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
13
  ESCAPED_ROOT = ESCAPE[ROOT]
14
14
 
15
- CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
16
-
17
- LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
18
-
19
- HEADLESS = "-Djava.awt.headless=true"
20
-
21
- office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
22
- office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
23
-
24
- OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
25
-
26
15
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
27
16
 
28
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
@@ -66,20 +55,7 @@ module Docsplit
66
55
  # Use JODConverter to extract the documents as PDFs.
67
56
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
68
57
  def self.extract_pdf(docs, opts={})
69
- out = opts[:output] || '.'
70
- FileUtils.mkdir_p out unless File.exists?(out)
71
- [docs].flatten.each do |doc|
72
- ext = File.extname(doc)
73
- basename = File.basename(doc, ext)
74
- escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
75
-
76
- if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
77
- `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
78
- else
79
- options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
80
- run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
81
- end
82
- end
58
+ PdfExtractor.new.extract(docs, opts)
83
59
  end
84
60
 
85
61
  # Define custom methods for each of the metadata keys that we support.
@@ -92,30 +68,25 @@ module Docsplit
92
68
  end
93
69
  EOS
94
70
  end
71
+
72
+ def self.extract_info(pdfs, opts={})
73
+ pdfs = ensure_pdfs(pdfs)
74
+ InfoExtractor.new.extract_all(pdfs, opts)
75
+ end
95
76
 
96
77
  # Utility method to clean OCR'd text with garbage characters.
97
78
  def self.clean_text(text)
98
79
  TextCleaner.new.clean(text)
99
80
  end
100
81
 
101
-
102
82
  private
103
83
 
104
- # Runs a Java command, with quieted logging, and the classpath set properly.
105
- def self.run(command, pdfs, opts, return_output=false)
106
- pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
107
- cmd = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
108
- result = `#{cmd}`.chomp
109
- raise ExtractionFailed, result if $? != 0
110
- return return_output ? (result.empty? ? nil : result) : true
111
- end
112
-
113
84
  # Normalize a value in an options hash for the command line.
114
85
  # Ranges look like: 1-10, Arrays like: 1,2,3.
115
86
  def self.normalize_value(value)
116
87
  case value
117
- when Range then normalize_range(value)
118
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
88
+ when Range then value.to_a.join(',')
89
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
119
90
  else value.to_s
120
91
  end
121
92
  end
@@ -126,5 +97,6 @@ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
126
97
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
127
98
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
128
99
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
100
+ require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
129
101
  require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
130
102
  require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -17,16 +17,34 @@ module Docsplit
17
17
 
18
18
  # Pull out a single datum from a pdf.
19
19
  def extract(key, pdfs, opts)
20
+ extract_all(pdfs, opts)[key]
21
+ end
22
+
23
+ def extract_all(pdfs, opts)
20
24
  pdf = [pdfs].flatten.first
21
25
  cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
22
26
  result = `#{cmd}`.chomp
23
27
  raise ExtractionFailed, result if $? != 0
24
- match = result.match(MATCHERS[key])
25
- answer = match && match[1]
26
- answer = answer.to_i if answer && key == :length
27
- answer
28
+ # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
+ if String.method_defined?(:encode)
30
+ result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
31
+ else
32
+ require 'iconv' unless defined?(Iconv)
33
+ ic = Iconv.new('UTF-8//IGNORE','UTF-8')
34
+ result = ic.iconv(result)
35
+ end
36
+ info = {}
37
+ MATCHERS.each do |key, matcher|
38
+ match = result.match(matcher)
39
+ answer = match && match[1]
40
+ if answer
41
+ answer = answer.to_i if key == :length
42
+ info[key] = answer
43
+ end
44
+ end
45
+ info
28
46
  end
29
47
 
30
48
  end
31
49
 
32
- end
50
+ end
@@ -0,0 +1,132 @@
1
+ require 'rbconfig'
2
+
3
module Docsplit
  # Converts documents to PDF. Image formats listed in GM_FORMATS are handed to
  # GraphicsMagick; everything else goes to LibreOffice directly, or to
  # JODConverter (run through java) when the detected office suite is not
  # LibreOffice.
  class PdfExtractor

    # Raised when no office executable can be found in any search path.
    class OfficeNotFound < StandardError; end

    # Cached path of the office executable, shared by all instances.
    @@executable = nil
    # Cached first line of the office executable's help output.
    @@help = nil

    # Host operating system as reported by rbconfig. `defined?` must be given
    # the bare constant: a quoted name is a string literal, which is always
    # "defined", and would make the Config fallback unreachable.
    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']

    CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

    LOGGING   = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

    HEADLESS  = "-Djava.awt.headless=true"

    # True when the host OS looks like Windows (mingw covers RubyInstaller builds).
    def windows?
      !!HOST_OS.match(/mswin|mingw|windows|cygwin/i)
    end

    # True when the host OS is Mac OS X.
    def osx?
      !!HOST_OS.match(/darwin/i)
    end

    # True when the host OS is Linux.
    def linux?
      !!HOST_OS.match(/linux/i)
    end

    # First line printed by `<office executable> -h`, cached after the first
    # call. Returns an empty string when the command prints nothing, so the
    # predicates below never call match on nil.
    def version_string
      # NOTE(review): the executable path is interpolated unquoted, so a path
      # containing spaces will not run -- confirm how this should be quoted
      # on Windows before changing it.
      @@help ||= `#{office_executable} -h 2>&1`.split("\n").first.to_s
    end

    # True when the help output identifies the executable as LibreOffice.
    def libre_office?
      !!version_string.match(/^LibreOffice/)
    end

    # True when the help output identifies the executable as OpenOffice.org.
    def open_office?
      !!version_string.match(/^OpenOffice.org/)
    end

    # Candidate installation directories for the current platform, in the
    # order they should be tried.
    def office_search_paths
      if windows?
        office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
        # CommonProgramFiles is kept first to preserve the existing precedence;
        # the ProgramFiles roots are added because that is where the office
        # installers put the suite by default. Unset variables are skipped so
        # File.join is never handed nil.
        roots = ["CommonProgramFiles", "ProgramFiles", "ProgramFiles(x86)"].map{ |var| ENV[var] }.compact.uniq
        search_paths = roots.map{ |root| office_names.map{ |program| File.join(root, program) } }.flatten
      elsif osx?
        search_paths = %w(
          /Applications/LibreOffice.app/Contents
          /Applications/OpenOffice.org.app/Contents
        )
      else # probably linux/unix
        search_paths = %w(
          /usr/lib/libreoffice
          /opt/libreoffice
          /usr/lib/openoffice
          /opt/openoffice.org3
        )
      end
      search_paths
    end

    # Locates the office executable and caches it for later calls.
    # ENV['OFFICE_PATH'], when set, is tried before the platform defaults and
    # may name either the executable itself or an installation directory.
    # Raises ArgumentError when OFFICE_PATH does not exist, and OfficeNotFound
    # when nothing usable is found.
    def office_executable
      paths = office_search_paths

      if ENV['OFFICE_PATH']
        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
        paths.unshift(ENV['OFFICE_PATH'])
      end
      return @@executable if @@executable

      # Locations of the executable relative to an installation directory.
      path_pieces = ["soffice"]
      if windows?
        path_pieces += [["program", "soffice.bin"]]
      elsif osx?
        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
      else
        path_pieces += [["program", "soffice"]]
      end

      paths.each do |path|
        next unless File.exist? path
        # A search path that is not a directory is taken to be the executable.
        @@executable ||= path unless File.directory? path
        path_pieces.each do |pieces|
          check_path = File.join(path, pieces)
          @@executable ||= check_path if File.exist? check_path
        end
        break if @@executable
      end
      raise OfficeNotFound, "No office software found" unless @@executable
      @@executable
    end

    # Directory two levels above the office executable.
    def office_path
      File.dirname(File.dirname(office_executable))
    end

    # Converts each of +docs+ (a single path or an array of paths) to a PDF
    # written into opts[:output] (default: the current directory), creating
    # that directory if needed.
    # Raises ExtractionFailed with the command output when the LibreOffice or
    # JODConverter command exits unsuccessfully. The exit status of
    # `gm convert` is not checked.
    def extract(docs, opts)
      out = opts[:output] || '.'
      FileUtils.mkdir_p out unless File.exist?(out)
      [docs].flatten.each do |doc|
        ext = File.extname(doc)
        basename = File.basename(doc, ext)
        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

        mime_type = `file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0]
        if GM_FORMATS.include?(mime_type)
          `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
        elsif libre_office?
          options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
          cmd     = "#{office_executable} #{options} 2>&1"
          result  = `#{cmd}`.chomp
          raise ExtractionFailed, result unless $?.success?
        else # open office presumably
          options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
          run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
        end
      end
    end

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    # +pdfs+ are appended to the command, each wrapped in double quotes; +opts+
    # is accepted but not used. Returns true, or -- when +return_output+ is
    # set -- the command output (nil if it was empty).
    # Raises ExtractionFailed with the output when java exits unsuccessfully.
    def run_jod(command, pdfs, opts, return_output=false)
      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
      # The office location is always passed as the office.home system
      # property. A bare path in this position would be read by java as the
      # name of the main class.
      office = "-Doffice.home=#{office_path}"
      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      return return_output ? (result.empty? ? nil : result) : true
    end

  end
end
@@ -12,7 +12,7 @@ module Docsplit
12
12
  if ext.downcase == '.pdf'
13
13
  doc
14
14
  else
15
- tempdir = File.join(Dir.tmpdir, 'docsplit')
15
+ tempdir = Dir.mktmpdir
16
16
  extract_pdf([doc], {:output => tempdir})
17
17
  File.join(tempdir, File.basename(doc, ext) + '.pdf')
18
18
  end
metadata CHANGED
@@ -1,38 +1,33 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
- version: !ruby/object:Gem::Version
4
- hash: 15
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 6
9
- - 4
10
- version: 0.6.4
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jeremy Ashkenas
14
9
  - Samuel Clay
15
10
  - Ted Han
16
11
  autorequire:
17
12
  bindir: bin
18
13
  cert_chain: []
19
-
20
- date: 2012-11-12 00:00:00 Z
14
+ date: 2013-02-21 00:00:00.000000000 Z
21
15
  dependencies: []
22
-
23
- description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
24
- email: jeremy@documentcloud.org
25
- executables:
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: opensource@documentcloud.org
21
+ executables:
26
22
  - docsplit
27
23
  extensions: []
28
-
29
24
  extra_rdoc_files: []
30
-
31
- files:
25
+ files:
32
26
  - lib/docsplit/command_line.rb
33
27
  - lib/docsplit/image_extractor.rb
34
28
  - lib/docsplit/info_extractor.rb
35
29
  - lib/docsplit/page_extractor.rb
30
+ - lib/docsplit/pdf_extractor.rb
36
31
  - lib/docsplit/text_cleaner.rb
37
32
  - lib/docsplit/text_extractor.rb
38
33
  - lib/docsplit/transparent_pdfs.rb
@@ -53,36 +48,27 @@ files:
53
48
  - README
54
49
  homepage: http://documentcloud.github.com/docsplit/
55
50
  licenses: []
56
-
57
51
  post_install_message:
58
52
  rdoc_options: []
59
-
60
- require_paths:
53
+ require_paths:
61
54
  - lib
62
- required_ruby_version: !ruby/object:Gem::Requirement
55
+ required_ruby_version: !ruby/object:Gem::Requirement
63
56
  none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- hash: 3
68
- segments:
69
- - 0
70
- version: "0"
71
- required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
62
  none: false
73
- requirements:
74
- - - ">="
75
- - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
- version: "0"
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
80
67
  requirements: []
81
-
82
68
  rubyforge_project: docsplit
83
69
  rubygems_version: 1.8.24
84
70
  signing_key:
85
71
  specification_version: 3
86
72
  summary: Break Apart Documents into Images, Text, Pages and PDFs
87
73
  test_files: []
88
-
74
+ has_rdoc: