docsplit 0.6.4 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.4' # Keep version in sync with docsplit.rb
4
- s.date = '2012-11-12'
3
+ s.version = '0.7.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2013-02-21'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
13
13
  EOS
14
14
 
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
- s.email = 'jeremy@documentcloud.org'
16
+ s.email = 'opensource@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
18
 
19
19
  s.require_paths = ['lib']
data/lib/docsplit.rb CHANGED
@@ -5,24 +5,13 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.6.4' # Keep in sync with gemspec.
8
+ VERSION = '0.7.0' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
12
12
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
13
  ESCAPED_ROOT = ESCAPE[ROOT]
14
14
 
15
- CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
16
-
17
- LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
18
-
19
- HEADLESS = "-Djava.awt.headless=true"
20
-
21
- office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
22
- office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
23
-
24
- OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
25
-
26
15
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
27
16
 
28
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
@@ -66,20 +55,7 @@ module Docsplit
66
55
  # Use JODConverter to extract the documents as PDFs.
67
56
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
68
57
  def self.extract_pdf(docs, opts={})
69
- out = opts[:output] || '.'
70
- FileUtils.mkdir_p out unless File.exists?(out)
71
- [docs].flatten.each do |doc|
72
- ext = File.extname(doc)
73
- basename = File.basename(doc, ext)
74
- escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
75
-
76
- if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
77
- `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
78
- else
79
- options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
80
- run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
81
- end
82
- end
58
+ PdfExtractor.new.extract(docs, opts)
83
59
  end
84
60
 
85
61
  # Define custom methods for each of the metadata keys that we support.
@@ -92,30 +68,25 @@ module Docsplit
92
68
  end
93
69
  EOS
94
70
  end
71
+
72
+ def self.extract_info(pdfs, opts={})
73
+ pdfs = ensure_pdfs(pdfs)
74
+ InfoExtractor.new.extract_all(pdfs, opts)
75
+ end
95
76
 
96
77
  # Utility method to clean OCR'd text with garbage characters.
97
78
  def self.clean_text(text)
98
79
  TextCleaner.new.clean(text)
99
80
  end
100
81
 
101
-
102
82
  private
103
83
 
104
- # Runs a Java command, with quieted logging, and the classpath set properly.
105
- def self.run(command, pdfs, opts, return_output=false)
106
- pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
107
- cmd = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
108
- result = `#{cmd}`.chomp
109
- raise ExtractionFailed, result if $? != 0
110
- return return_output ? (result.empty? ? nil : result) : true
111
- end
112
-
113
84
  # Normalize a value in an options hash for the command line.
114
85
  # Ranges are expanded to every member (1..3 becomes 1,2,3); Arrays look like: 1,2,3.
115
86
  def self.normalize_value(value)
116
87
  case value
117
- when Range then normalize_range(value)
118
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
88
+ when Range then value.to_a.join(',')
89
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
119
90
  else value.to_s
120
91
  end
121
92
  end
@@ -126,5 +97,6 @@ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
126
97
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
127
98
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
128
99
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
100
+ require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
129
101
  require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
130
102
  require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -17,16 +17,34 @@ module Docsplit
17
17
 
18
18
  # Pull out a single datum from a pdf.
19
19
  def extract(key, pdfs, opts)
20
+ extract_all(pdfs, opts)[key]
21
+ end
22
+
23
+ def extract_all(pdfs, opts)
20
24
  pdf = [pdfs].flatten.first
21
25
  cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
22
26
  result = `#{cmd}`.chomp
23
27
  raise ExtractionFailed, result if $? != 0
24
- match = result.match(MATCHERS[key])
25
- answer = match && match[1]
26
- answer = answer.to_i if answer && key == :length
27
- answer
28
+ # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
+ if String.method_defined?(:encode)
30
+ result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
31
+ else
32
+ require 'iconv' unless defined?(Iconv)
33
+ ic = Iconv.new('UTF-8//IGNORE','UTF-8')
34
+ result = ic.iconv(result)
35
+ end
36
+ info = {}
37
+ MATCHERS.each do |key, matcher|
38
+ match = result.match(matcher)
39
+ answer = match && match[1]
40
+ if answer
41
+ answer = answer.to_i if key == :length
42
+ info[key] = answer
43
+ end
44
+ end
45
+ info
28
46
  end
29
47
 
30
48
  end
31
49
 
32
- end
50
+ end
@@ -0,0 +1,132 @@
1
+ require 'rbconfig'
2
+
3
+ module Docsplit
4
+ class PdfExtractor
5
+ @@executable = nil
6
+
7
+ HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
8
+ def windows?
9
+ !!HOST_OS.match(/mswin|windows|cygwin/i)
10
+ end
11
+ def osx?
12
+ !!HOST_OS.match(/darwin/i)
13
+ end
14
+ def linux?
15
+ !!HOST_OS.match(/linux/i)
16
+ end
17
+
18
+ def version_string
19
+ @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
20
+ end
21
+
22
+ def libre_office?
23
+ !!version_string.match(/^LibreOffice/)
24
+ end
25
+
26
+ def open_office?
27
+ !!version_string.match(/^OpenOffice.org/)
28
+ end
29
+
30
+ def office_search_paths
31
+ if windows?
32
+ office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
33
+ program_files_path = ENV["CommonProgramFiles"]
34
+ search_paths = office_name.map{ |program| File.join(program_files_path, program) }
35
+ elsif osx?
36
+ search_paths = %w(
37
+ /Applications/LibreOffice.app/Contents
38
+ /Applications/OpenOffice.org.app/Contents
39
+ )
40
+ else # probably linux/unix
41
+ search_paths = %w(
42
+ /usr/lib/libreoffice
43
+ /opt/libreoffice
44
+ /usr/lib/openoffice
45
+ /opt/openoffice.org3
46
+ )
47
+ end
48
+ search_paths
49
+ end
50
+
51
+ def office_executable
52
+ paths = office_search_paths
53
+
54
+ if ENV['OFFICE_PATH']
55
+ raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
56
+ paths.unshift(ENV['OFFICE_PATH'])
57
+ end
58
+
59
+ path_pieces = ["soffice"]
60
+ if windows?
61
+ path_pieces += [["program", "soffice.bin"]]
62
+ elsif osx?
63
+ path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
64
+ else
65
+ path_pieces += [["program", "soffice"]]
66
+ end
67
+
68
+ paths.each do |path|
69
+ if File.exists? path
70
+ @@executable ||= path unless File.directory? path
71
+ path_pieces.each do |pieces|
72
+ check_path = File.join(path, pieces)
73
+ @@executable ||= check_path if File.exists? check_path
74
+ end
75
+ end
76
+ break if @@executable
77
+ end
78
+ raise OfficeNotFound, "No office software found" unless @@executable
79
+ @@executable
80
+ end
81
+
82
+ def office_path
83
+ File.dirname(File.dirname(office_executable))
84
+ end
85
+
86
+ def extract(docs, opts)
87
+ out = opts[:output] || '.'
88
+ FileUtils.mkdir_p out unless File.exists?(out)
89
+ [docs].flatten.each do |doc|
90
+ ext = File.extname(doc)
91
+ basename = File.basename(doc, ext)
92
+ escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
93
+
94
+ if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
95
+ `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
96
+ else
97
+ if libre_office?
98
+ options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
99
+ cmd = "#{office_executable} #{options} 2>&1"
100
+ result = `#{cmd}`.chomp
101
+ raise ExtractionFailed, result if $? != 0
102
+ true
103
+ else # open office presumably
104
+ options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
105
+ run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
106
+ end
107
+ end
108
+ end
109
+ end
110
+
111
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
112
+
113
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
114
+
115
+ HEADLESS = "-Djava.awt.headless=true"
116
+
117
+ private
118
+
119
+ # Runs a Java command, with quieted logging, and the classpath set properly.
120
+ def run_jod(command, pdfs, opts, return_output=false)
121
+
122
+ pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
123
+ office = osx? ? "-Doffice.home=#{office_path}" : office_path
124
+ cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
125
+ result = `#{cmd}`.chomp
126
+ raise ExtractionFailed, result if $? != 0
127
+ return return_output ? (result.empty? ? nil : result) : true
128
+ end
129
+
130
+ class OfficeNotFound < StandardError; end
131
+ end
132
+ end
@@ -12,7 +12,7 @@ module Docsplit
12
12
  if ext.downcase == '.pdf'
13
13
  doc
14
14
  else
15
- tempdir = File.join(Dir.tmpdir, 'docsplit')
15
+ tempdir = Dir.mktmpdir
16
16
  extract_pdf([doc], {:output => tempdir})
17
17
  File.join(tempdir, File.basename(doc, ext) + '.pdf')
18
18
  end
metadata CHANGED
@@ -1,38 +1,33 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
- version: !ruby/object:Gem::Version
4
- hash: 15
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 6
9
- - 4
10
- version: 0.6.4
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jeremy Ashkenas
14
9
  - Samuel Clay
15
10
  - Ted Han
16
11
  autorequire:
17
12
  bindir: bin
18
13
  cert_chain: []
19
-
20
- date: 2012-11-12 00:00:00 Z
14
+ date: 2013-02-21 00:00:00.000000000 Z
21
15
  dependencies: []
22
-
23
- description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
24
- email: jeremy@documentcloud.org
25
- executables:
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: opensource@documentcloud.org
21
+ executables:
26
22
  - docsplit
27
23
  extensions: []
28
-
29
24
  extra_rdoc_files: []
30
-
31
- files:
25
+ files:
32
26
  - lib/docsplit/command_line.rb
33
27
  - lib/docsplit/image_extractor.rb
34
28
  - lib/docsplit/info_extractor.rb
35
29
  - lib/docsplit/page_extractor.rb
30
+ - lib/docsplit/pdf_extractor.rb
36
31
  - lib/docsplit/text_cleaner.rb
37
32
  - lib/docsplit/text_extractor.rb
38
33
  - lib/docsplit/transparent_pdfs.rb
@@ -53,36 +48,27 @@ files:
53
48
  - README
54
49
  homepage: http://documentcloud.github.com/docsplit/
55
50
  licenses: []
56
-
57
51
  post_install_message:
58
52
  rdoc_options: []
59
-
60
- require_paths:
53
+ require_paths:
61
54
  - lib
62
- required_ruby_version: !ruby/object:Gem::Requirement
55
+ required_ruby_version: !ruby/object:Gem::Requirement
63
56
  none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- hash: 3
68
- segments:
69
- - 0
70
- version: "0"
71
- required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
62
  none: false
73
- requirements:
74
- - - ">="
75
- - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
- version: "0"
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
80
67
  requirements: []
81
-
82
68
  rubyforge_project: docsplit
83
69
  rubygems_version: 1.8.24
84
70
  signing_key:
85
71
  specification_version: 3
86
72
  summary: Break Apart Documents into Images, Text, Pages and PDFs
87
73
  test_files: []
88
-
74
+ has_rdoc: