burisu-docsplit 0.7.8 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
4
- data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
3
+ metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
4
+ data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
5
5
  SHA512:
6
- metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
7
- data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709
6
+ metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
7
+ data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644
@@ -2,4 +2,4 @@
2
2
 
3
3
  require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
4
4
 
5
- Docsplit::CommandLine.new
5
+ Docsplit::CommandLine.new
@@ -1,8 +1,12 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'docsplit/version'
4
+
1
5
  Gem::Specification.new do |s|
2
6
  s.name = 'burisu-docsplit'
3
- s.version = '0.7.8' # Keep version in sync with docsplit.rb
4
- s.homepage = "http://documentcloud.github.com/docsplit/"
5
- s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
7
+ s.version = Docsplit::VERSION # Keep version in sync with docsplit.rb
8
+ s.homepage = 'http://documentcloud.github.com/docsplit/'
9
+ s.summary = 'Break Apart Documents into Images, Text, Pages and PDFs'
6
10
  s.description = <<-EOS
7
11
  Docsplit is a command-line utility and Ruby library for splitting apart
8
12
  documents into their component parts: searchable UTF-8 plain text, page
@@ -1,22 +1,20 @@
1
1
  require 'tmpdir'
2
2
  require 'fileutils'
3
3
  require 'shellwords'
4
+ require 'docsplit/version'
4
5
 
5
6
  # The Docsplit module delegates to the Java PDF extractors.
6
7
  module Docsplit
7
-
8
- VERSION = '0.7.6' # Keep in sync with gemspec.
9
-
10
- ESCAPE = lambda {|x| Shellwords.shellescape(x) }
8
+ ESCAPE = ->(x) { Shellwords.shellescape(x) }
11
9
 
12
10
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
11
  ESCAPED_ROOT = ESCAPE[ROOT]
14
12
 
15
- METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
16
-
17
- GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
13
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
14
+
15
+ GM_FORMATS = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
18
16
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
17
+ DEPENDENCIES = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
20
18
 
21
19
  # Check for all dependencies, and note their absence.
22
20
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -32,28 +30,28 @@ module Docsplit
32
30
  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
33
31
  if DEPENDENCIES[:tesseract]
34
32
  # osd will be listed in the output of `tesseract --list-langs`
35
- val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
33
+ val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
36
34
  DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
35
  end
38
36
 
39
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
37
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
40
38
  # broken.
41
39
  class ExtractionFailed < StandardError; end
42
40
 
43
41
  # Use the ExtractPages Java class to burst a PDF into single pages.
44
- def self.extract_pages(pdfs, opts={})
42
+ def self.extract_pages(pdfs, opts = {})
45
43
  pdfs = ensure_pdfs(pdfs)
46
44
  PageExtractor.new.extract(pdfs, opts)
47
45
  end
48
46
 
49
47
  # Use the ExtractText Java class to write out all embedded text.
50
- def self.extract_text(pdfs, opts={})
48
+ def self.extract_text(pdfs, opts = {})
51
49
  pdfs = ensure_pdfs(pdfs)
52
50
  TextExtractor.new.extract(pdfs, opts)
53
51
  end
54
52
 
55
53
  # Use the ExtractImages Java class to rasterize a PDF into each page's image.
56
- def self.extract_images(pdfs, opts={})
54
+ def self.extract_images(pdfs, opts = {})
57
55
  pdfs = ensure_pdfs(pdfs)
58
56
  opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
59
57
  ImageExtractor.new.extract(pdfs, opts)
@@ -61,7 +59,7 @@ module Docsplit
61
59
 
62
60
  # Use JODConverter to extract the documents as PDFs.
63
61
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
64
- def self.extract_pdf(docs, opts={})
62
+ def self.extract_pdf(docs, opts = {})
65
63
  PdfExtractor.new.extract(docs, opts)
66
64
  end
67
65
 
@@ -75,8 +73,8 @@ module Docsplit
75
73
  end
76
74
  EOS
77
75
  end
78
-
79
- def self.extract_info(pdfs, opts={})
76
+
77
+ def self.extract_info(pdfs, opts = {})
80
78
  pdfs = ensure_pdfs(pdfs)
81
79
  InfoExtractor.new.extract_all(pdfs, opts)
82
80
  end
@@ -93,11 +91,10 @@ module Docsplit
93
91
  def self.normalize_value(value)
94
92
  case value
95
93
  when Range then value.to_a.join(',')
96
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
94
+ when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
97
95
  else value.to_s
98
96
  end
99
97
  end
100
-
101
98
  end
102
99
 
103
100
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
@@ -2,11 +2,9 @@ require 'optparse'
2
2
  require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
3
 
4
4
  module Docsplit
5
-
6
5
  # A single command-line utility to separate a PDF into all its component parts.
7
6
  class CommandLine
8
-
9
- BANNER = <<-EOS
7
+ BANNER = <<-EOS.freeze
10
8
  docsplit breaks apart documents into images, text, or individual pages.
11
9
  It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
10
 
@@ -39,24 +37,22 @@ Options:
39
37
 
40
38
  # Delegate to the Docsplit Ruby API to perform all extractions.
41
39
  def run
42
- begin
43
- case @command
44
- when :images then Docsplit.extract_images(ARGV, @options)
45
- when :pages then Docsplit.extract_pages(ARGV, @options)
46
- when :text then Docsplit.extract_text(ARGV, @options)
47
- when :pdf then Docsplit.extract_pdf(ARGV, @options)
40
+ case @command
41
+ when :images then Docsplit.extract_images(ARGV, @options)
42
+ when :pages then Docsplit.extract_pages(ARGV, @options)
43
+ when :text then Docsplit.extract_text(ARGV, @options)
44
+ when :pdf then Docsplit.extract_pdf(ARGV, @options)
45
+ else
46
+ if METADATA_KEYS.include?(@command)
47
+ value = Docsplit.send("extract_#{@command}", ARGV, @options)
48
+ puts value unless value.nil?
48
49
  else
49
- if METADATA_KEYS.include?(@command)
50
- value = Docsplit.send("extract_#{@command}", ARGV, @options)
51
- puts value unless value.nil?
52
- else
53
- usage
54
- end
50
+ usage
55
51
  end
56
- rescue ExtractionFailed => e
57
- puts e.message.chomp
58
- exit(1)
59
52
  end
53
+ rescue ExtractionFailed => e
54
+ puts e.message.chomp
55
+ exit(1)
60
56
  end
61
57
 
62
58
  # Print out the usage help message.
@@ -65,18 +61,17 @@ Options:
65
61
  exit
66
62
  end
67
63
 
68
-
69
64
  private
70
65
 
71
66
  # Use the OptionParser library to parse out all supported options. Return
72
67
  # options formatted for the Ruby API.
73
68
  def parse_options
74
- @options = {:ocr => :default, :clean => true}
69
+ @options = { ocr: :default, clean: true }
75
70
  @option_parser = OptionParser.new do |opts|
76
71
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
72
  @options[:output] = d
78
73
  end
79
- opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
74
+ opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
80
75
  @options[:pages] = p
81
76
  end
82
77
  opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
@@ -91,16 +86,16 @@ Options:
91
86
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
92
87
  @options[:ocr] = o
93
88
  end
94
- opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
89
+ opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
95
90
  @options[:clean] = false
96
91
  end
97
92
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
93
  @options[:language] = l
99
94
  end
100
- opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
95
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
101
96
  @options[:detect_orientation] = false
102
97
  end
103
- opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
98
+ opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
104
99
  @options[:rolling] = true
105
100
  end
106
101
  opts.on_tail('-v', '--version', 'display docsplit version') do
@@ -119,7 +114,5 @@ Options:
119
114
  exit(1)
120
115
  end
121
116
  end
122
-
123
117
  end
124
-
125
- end
118
+ end
@@ -1,12 +1,10 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to GraphicsMagick in order to convert PDF documents into
4
3
  # nicely sized images.
5
4
  class ImageExtractor
6
-
7
- MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
5
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
8
6
  DEFAULT_FORMAT = :png
9
- DEFAULT_DENSITY = '150'
7
+ DEFAULT_DENSITY = '150'.freeze
10
8
 
11
9
  # Extract a list of PDFs as rasterized page images, according to the
12
10
  # configuration in options.
@@ -15,8 +13,8 @@ module Docsplit
15
13
  extract_options(options)
16
14
  @pdfs.each do |pdf|
17
15
  previous = nil
18
- @sizes.each_with_index do |size, i|
19
- @formats.each {|format| convert(pdf, size, format, previous) }
16
+ @sizes.each_with_index do |size, _i|
17
+ @formats.each { |format| convert(pdf, size, format, previous) }
20
18
  previous = size if @rolling
21
19
  end
22
20
  end
@@ -27,36 +25,35 @@ module Docsplit
27
25
  # we simply downsample that image, instead of re-rendering the entire PDF.
28
26
  # Now we generate one page at a time, a counterintuitive optimization
29
27
  # suggested by the GraphicsMagick list, that seems to work quite well.
30
- def convert(pdf, size, format, previous=nil)
28
+ def convert(pdf, size, format, previous = nil)
31
29
  tempdir = Dir.mktmpdir
32
30
  basename = File.basename(pdf, File.extname(pdf))
33
31
  directory = directory_for(size)
34
32
  pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35
33
  escaped_pdf = ESCAPE[pdf]
36
- FileUtils.mkdir_p(directory) unless File.exists?(directory)
37
- common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
34
+ FileUtils.mkdir_p(directory) unless File.exist?(directory)
35
+ common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
38
36
  if previous
39
37
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
40
38
  result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
41
- raise ExtractionFailed, result if $? != 0
39
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
42
40
  else
43
41
  page_list(pages).each do |page|
44
- out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
42
+ out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
43
  cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
46
44
  result = `#{cmd}`.chomp
47
- raise ExtractionFailed, result if $? != 0
45
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
48
46
  end
49
47
  end
50
48
  ensure
51
- FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
49
+ FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
52
50
  end
53
51
 
54
-
55
52
  private
56
53
 
57
54
  # Extract the relevant GraphicsMagick options from the options hash.
58
55
  def extract_options(options)
59
- @output = options[:output] || '.'
56
+ @output = options[:output] || '.'
60
57
  @pages = options[:pages]
61
58
  @density = options[:density] || DEFAULT_DENSITY
62
59
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
@@ -80,24 +77,22 @@ module Docsplit
80
77
  # Generate the appropriate quality argument for the image format.
81
78
  def quality_arg(format)
82
79
  case format.to_s
83
- when /jpe?g/ then "-quality 85"
84
- when /png/ then "-quality 100"
85
- else ""
80
+ when /jpe?g/ then '-quality 85'
81
+ when /png/ then '-quality 100'
82
+ else ''
86
83
  end
87
84
  end
88
85
 
89
86
  # Generate the expanded list of requested page numbers.
90
87
  def page_list(pages)
91
- pages.split(',').map { |range|
88
+ pages.split(',').map do |range|
92
89
  if range.include?('-')
93
90
  range = range.split('-')
94
- Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
91
+ Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
95
92
  else
96
93
  range.to_i
97
94
  end
98
- }.flatten.uniq.sort
95
+ end.flatten.uniq.sort
99
96
  end
100
-
101
97
  end
102
-
103
98
  end
@@ -1,36 +1,34 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdfinfo** in order to extract information about a PDF file.
4
3
  class InfoExtractor
5
-
6
4
  # Regex matchers for different bits of information.
7
5
  MATCHERS = {
8
- :author => /^Author:\s+([^\n]+)/,
9
- :date => /^CreationDate:\s+([^\n]+)/,
10
- :creator => /^Creator:\s+([^\n]+)/,
11
- :keywords => /^Keywords:\s+([^\n]+)/,
12
- :producer => /^Producer:\s+([^\n]+)/,
13
- :subject => /^Subject:\s+([^\n]+)/,
14
- :title => /^Title:\s+([^\n]+)/,
15
- :length => /^Pages:\s+([^\n]+)/,
16
- }
6
+ author: /^Author:\s+([^\n]+)/,
7
+ date: /^CreationDate:\s+([^\n]+)/,
8
+ creator: /^Creator:\s+([^\n]+)/,
9
+ keywords: /^Keywords:\s+([^\n]+)/,
10
+ producer: /^Producer:\s+([^\n]+)/,
11
+ subject: /^Subject:\s+([^\n]+)/,
12
+ title: /^Title:\s+([^\n]+)/,
13
+ length: /^Pages:\s+([^\n]+)/
14
+ }.freeze
17
15
 
18
16
  # Pull out a single datum from a pdf.
19
17
  def extract(key, pdfs, opts)
20
18
  extract_all(pdfs, opts)[key]
21
19
  end
22
-
23
- def extract_all(pdfs, opts)
20
+
21
+ def extract_all(pdfs, _opts)
24
22
  pdf = [pdfs].flatten.first
25
23
  cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
26
24
  result = `#{cmd}`.chomp
27
- raise ExtractionFailed, result if $? != 0
25
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
28
26
  # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
27
  if String.method_defined?(:encode)
30
- result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
28
+ result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
31
29
  else
32
30
  require 'iconv' unless defined?(Iconv)
33
- ic = Iconv.new('UTF-8//IGNORE','UTF-8')
31
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
34
32
  result = ic.iconv(result)
35
33
  end
36
34
  info = {}
@@ -44,7 +42,5 @@ module Docsplit
44
42
  end
45
43
  info
46
44
  end
47
-
48
45
  end
49
-
50
46
  end
@@ -1,36 +1,31 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdftk** in order to create bursted single pages from
4
3
  # a PDF document.
5
4
  class PageExtractor
6
-
7
5
  # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
8
6
  def extract(pdfs, opts)
9
7
  extract_options opts
10
8
  [pdfs].flatten.each do |pdf|
11
9
  pdf_name = File.basename(pdf, File.extname(pdf))
12
- page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13
- FileUtils.mkdir_p @output unless File.exists?(@output)
14
-
10
+ page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
11
+ FileUtils.mkdir_p @output unless File.exist?(@output)
12
+
15
13
  cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
16
- "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17
- else
18
- "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
14
+ "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
15
+ else
16
+ "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19
17
  end
20
18
  result = `#{cmd}`.chomp
21
- FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
22
- raise ExtractionFailed, result if $? != 0
19
+ FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
20
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
23
21
  result
24
22
  end
25
23
  end
26
24
 
27
-
28
25
  private
29
26
 
30
27
  def extract_options(options)
31
28
  @output = options[:output] || '.'
32
29
  end
33
-
34
30
  end
35
-
36
31
  end
@@ -6,22 +6,24 @@ module Docsplit
6
6
  @@version_string = nil
7
7
 
8
8
  # Provide a set of helper functions to determine the OS.
9
- HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
9
+ HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
10
10
  def windows?
11
11
  !!HOST_OS.match(/mswin|windows|cygwin/i)
12
12
  end
13
+
13
14
  def osx?
14
15
  !!HOST_OS.match(/darwin/i)
15
16
  end
17
+
16
18
  def linux?
17
19
  !!HOST_OS.match(/linux/i)
18
20
  end
19
-
21
+
20
22
  # The first line of the help output holds the name and version number
21
23
  # of the office software to be used for extraction.
22
24
  def version_string
23
25
  unless @@version_string
24
- null = windows? ? "NUL" : "/dev/null"
26
+ null = windows? ? 'NUL' : '/dev/null'
25
27
  @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
26
28
  if !!@@version_string.to_s.match(/[0-9]*/)
27
29
  @@version_string = `#{office_executable} --version`.split("\n").first
@@ -29,23 +31,25 @@ module Docsplit
29
31
  end
30
32
  @@version_string
31
33
  end
34
+
32
35
  def libre_office?
33
36
  !!version_string.match(/^LibreOffice/)
34
37
  end
38
+
35
39
  def open_office?
36
40
  !!version_string.match(/^OpenOffice.org/)
37
41
  end
38
-
42
+
39
43
  # A set of default locations to search for office software
40
44
  # These have been extracted from JODConverter. Each listed
41
- # path should contain a directory "program" which in turn
45
+ # path should contain a directory "program" which in turn
42
46
  # contains the "soffice" executable.
43
47
  # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
44
48
  def office_search_paths
45
49
  if windows?
46
- office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
47
- program_files_path = ENV["CommonProgramFiles"]
48
- search_paths = office_names.map{ |program| File.join(program_files_path, program) }
50
+ office_names = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
51
+ program_files_path = ENV['CommonProgramFiles']
52
+ search_paths = office_names.map { |program| File.join(program_files_path, program) }
49
53
  elsif osx?
50
54
  search_paths = %w(
51
55
  /Applications/LibreOffice.app/Contents
@@ -69,7 +73,7 @@ module Docsplit
69
73
  end
70
74
  search_paths
71
75
  end
72
-
76
+
73
77
  # Identify the path to a working office executable.
74
78
  def office_executable
75
79
  paths = office_search_paths
@@ -78,45 +82,45 @@ module Docsplit
78
82
  # raise an error if that path isn't valid, otherwise, add
79
83
  # it to the front of our search paths.
80
84
  if ENV['OFFICE_PATH']
81
- raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
85
+ raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
82
86
  paths.unshift(ENV['OFFICE_PATH'])
83
87
  end
84
-
88
+
85
89
  # The location of the office executable is OS dependent
86
- path_pieces = ["soffice"]
90
+ path_pieces = ['soffice']
87
91
  if windows?
88
- path_pieces += [["program", "soffice.bin"]]
92
+ path_pieces += [['program', 'soffice.bin']]
89
93
  elsif osx?
90
- path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
94
+ path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
91
95
  else
92
- path_pieces += [["program", "soffice"]]
96
+ path_pieces += [%w(program soffice)]
93
97
  end
94
-
98
+
95
99
  # Search for the first suitable office executable
96
100
  # and short-circuit once an executable is found.
97
101
  paths.each do |path|
98
- if File.exists? path
102
+ if File.exist? path
99
103
  @@executable ||= path unless File.directory? path
100
104
  path_pieces.each do |pieces|
101
105
  check_path = File.join(path, pieces)
102
- @@executable ||= check_path if File.exists? check_path
106
+ @@executable ||= check_path if File.exist? check_path
103
107
  end
104
108
  end
105
109
  break if @@executable
106
110
  end
107
- raise OfficeNotFound, "No office software found" unless @@executable
111
+ raise OfficeNotFound, 'No office software found' unless @@executable
108
112
  @@executable
109
113
  end
110
-
114
+
111
115
  # Used to specify the office location for JODConverter
112
116
  def office_path
113
117
  File.dirname(File.dirname(office_executable))
114
118
  end
115
-
119
+
116
120
  # Convert documents to PDF.
117
121
  def extract(docs, opts)
118
122
  out = opts[:output] || '.'
119
- FileUtils.mkdir_p out unless File.exists?(out)
123
+ FileUtils.mkdir_p out unless File.exist?(out)
120
124
  [docs].flatten.each do |doc|
121
125
  ext = File.extname(doc)
122
126
  basename = File.basename(doc, ext)
@@ -127,12 +131,12 @@ module Docsplit
127
131
  else
128
132
  if libre_office?
129
133
  # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
130
- ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
131
-
134
+ ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
135
+
132
136
  options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
133
137
  cmd = "#{office_executable} #{options} 2>&1"
134
138
  result = `#{cmd}`.chomp
135
- raise ExtractionFailed, result if $? != 0
139
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
136
140
  true
137
141
  else # open office presumably, rely on JODConverter to figure it out.
138
142
  options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
@@ -142,23 +146,22 @@ module Docsplit
142
146
  end
143
147
  end
144
148
 
145
- CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
149
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
146
150
 
147
- LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
151
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
152
+
153
+ HEADLESS = '-Djava.awt.headless=true'.freeze
148
154
 
149
- HEADLESS = "-Djava.awt.headless=true"
150
-
151
155
  private
152
-
153
- # Runs a Java command, with quieted logging, and the classpath set properly.
154
- def run_jod(command, pdfs, opts, return_output=false)
155
156
 
156
- pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
157
+ # Runs a Java command, with quieted logging, and the classpath set properly.
158
+ def run_jod(command, pdfs, _opts, return_output = false)
159
+ pdfs = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
157
160
  office = osx? ? "-Doffice.home=#{office_path}" : office_path
158
161
  cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
159
162
  result = `#{cmd}`.chomp
160
- raise ExtractionFailed, result if $? != 0
161
- return return_output ? (result.empty? ? nil : result) : true
163
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
164
+ return_output ? (result.empty? ? nil : result) : true
162
165
  end
163
166
 
164
167
  class OfficeNotFound < StandardError; end
@@ -1,7 +1,6 @@
1
1
  require 'strscan'
2
2
 
3
3
  module Docsplit
4
-
5
4
  # Cleans up OCR'd text by using a series of heuristics to remove garbage
6
5
  # words. Algorithms taken from:
7
6
  #
@@ -13,7 +12,6 @@ module Docsplit
13
12
  # -- Kulp
14
13
  #
15
14
  class TextCleaner
16
-
17
15
  # Cached regexes we plan on using.
18
16
  WORD = /\S+/
19
17
  SPACE = /\s+/
@@ -36,7 +34,7 @@ module Docsplit
36
34
  # multibyte-aware version, coercing to ASCII first.
37
35
  def clean(text)
38
36
  if String.method_defined?(:encode)
39
- text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
37
+ text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
40
38
  else
41
39
  require 'iconv' unless defined?(Iconv)
42
40
  text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
@@ -67,33 +65,31 @@ module Docsplit
67
65
  # More than 30 bytes in length.
68
66
  (w.length > 30) ||
69
67
 
70
- # If there are three or more identical characters in a row in the string.
71
- (w =~ REPEAT) ||
68
+ # If there are three or more identical characters in a row in the string.
69
+ (w =~ REPEAT) ||
72
70
 
73
- # More punctuation than alpha numerics.
74
- (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
71
+ # More punctuation than alpha numerics.
72
+ (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
75
73
 
76
- # Ignoring the first and last characters in the string, if there are three or
77
- # more different punctuation characters in the string.
78
- (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
+ # Ignoring the first and last characters in the string, if there are three or
75
+ # more different punctuation characters in the string.
76
+ (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
79
77
 
80
- # Four or more consecutive vowels, or five or more consecutive consonants.
81
- ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
78
+ # Four or more consecutive vowels, or five or more consecutive consonants.
79
+ ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
82
80
 
83
- # Number of uppercase letters greater than lowercase letters, but the word is
84
- # not all uppercase + punctuation.
85
- (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
81
+ # Number of uppercase letters greater than lowercase letters, but the word is
82
+ # not all uppercase + punctuation.
83
+ (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
86
84
 
87
- # Single letters that are not A or I.
88
- (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
85
+ # Single letters that are not A or I.
86
+ (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
89
87
 
90
- # All characters are alphabetic and there are 8 times more vowels than
91
- # consonants, or 8 times more consonants than vowels.
92
- (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
93
- (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
94
- (cons > vows * 8)))
88
+ # All characters are alphabetic and there are 8 times more vowels than
89
+ # consonants, or 8 times more consonants than vowels.
90
+ (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
91
+ (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
92
+ (cons > vows * 8)))
95
93
  end
96
-
97
94
  end
98
-
99
95
  end
@@ -1,5 +1,4 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdftotext** and **tesseract** in order to extract text from
4
3
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
5
4
  # forbid OCR extraction, but by default the heuristic works like this:
@@ -13,11 +12,10 @@ module Docsplit
13
12
  # * Re-OCR each page in the `@pages_to_ocr` list at the end.
14
13
  #
15
14
  class TextExtractor
16
-
17
15
  NO_TEXT_DETECTED = /---------\n\Z/
18
16
 
19
- OCR_FLAGS = '-density 400x400 -colorspace GRAY'
20
- MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
17
+ OCR_FLAGS = '-density 400x400 -colorspace GRAY'.freeze
18
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
21
19
 
22
20
  MIN_TEXT_PER_PAGE = 100 # in bytes
23
21
 
@@ -28,10 +26,10 @@ module Docsplit
28
26
  # Extract text from a list of PDFs.
29
27
  def extract(pdfs, opts)
30
28
  extract_options opts
31
- FileUtils.mkdir_p @output unless File.exists?(@output)
29
+ FileUtils.mkdir_p @output unless File.exist?(@output)
32
30
  [pdfs].flatten.each do |pdf|
33
31
  @pdf_name = File.basename(pdf, File.extname(pdf))
34
- pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
32
+ pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
35
33
  if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
36
34
  extract_from_ocr(pdf, pages)
37
35
  else
@@ -52,7 +50,7 @@ module Docsplit
52
50
  # Extract a page range worth of text from a PDF, directly.
53
51
  def extract_from_pdf(pdf, pages)
54
52
  return extract_full(pdf) unless pages
55
- pages.each {|page| extract_page(pdf, page) }
53
+ pages.each { |page| extract_page(pdf, page) }
56
54
  end
57
55
 
58
56
  # Extract a page range worth of text from a PDF via OCR.
@@ -60,7 +58,7 @@ module Docsplit
60
58
  tempdir = Dir.mktmpdir
61
59
  base_path = File.join(@output, @pdf_name)
62
60
  escaped_pdf = ESCAPE[pdf]
63
- psm = @detect_orientation ? "-psm 1" : ""
61
+ psm = @detect_orientation ? '-psm 1' : ''
64
62
  if pages
65
63
  pages.each do |page|
66
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
@@ -75,15 +73,14 @@ module Docsplit
75
73
  tiff = "#{tempdir}/#{@pdf_name}.tif"
76
74
  escaped_tiff = ESCAPE[tiff]
77
75
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
78
- #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
76
+ # if the user says don't do orientation detection or the plugin is not installed, set psm to 0
79
77
  run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
80
78
  clean_text(base_path + '.txt') if @clean_ocr
81
79
  end
82
80
  ensure
83
- FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
81
+ FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
84
82
  end
85
83
 
86
-
87
84
  private
88
85
 
89
86
  def clean_text(file)
@@ -98,7 +95,7 @@ module Docsplit
98
95
  # Run an external process and raise an exception if it fails.
99
96
  def run(command)
100
97
  result = `#{command}`
101
- raise ExtractionFailed, result if $? != 0
98
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
102
99
  result
103
100
  end
104
101
 
@@ -124,10 +121,8 @@ module Docsplit
124
121
  @force_ocr = options[:ocr] == true
125
122
  @forbid_ocr = options[:ocr] == false
126
123
  @language = options[:language] || 'eng'
127
- @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
- @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
124
+ @clean_ocr = (!(options[:clean] == false) && @language == 'eng')
125
+ @detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
129
126
  end
130
-
131
127
  end
132
-
133
128
  end
@@ -1,9 +1,7 @@
1
1
  module Docsplit
2
-
3
2
  # Include a method to transparently convert non-PDF arguments to temporary
4
3
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
5
4
  module TransparentPDFs
6
-
7
5
  # Temporarily convert any non-PDF documents to PDFs before running them
8
6
  # through further extraction.
9
7
  def ensure_pdfs(docs)
@@ -12,18 +10,16 @@ module Docsplit
12
10
  doc
13
11
  else
14
12
  tempdir = File.join(Dir.tmpdir, 'docsplit')
15
- extract_pdf([doc], {:output => tempdir})
13
+ extract_pdf([doc], output: tempdir)
16
14
  File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
17
15
  end
18
16
  end
19
17
  end
20
18
 
21
19
  def is_pdf?(doc)
22
- File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
20
+ File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
23
21
  end
24
-
25
22
  end
26
23
 
27
24
  extend TransparentPDFs
28
-
29
25
  end
@@ -0,0 +1,3 @@
1
+ module Docsplit
2
+ VERSION = '0.7.9'.freeze
3
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: burisu-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.7.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2015-06-26 00:00:00.000000000 Z
13
+ date: 2016-09-06 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart
@@ -36,6 +36,7 @@ files:
36
36
  - lib/docsplit/text_cleaner.rb
37
37
  - lib/docsplit/text_extractor.rb
38
38
  - lib/docsplit/transparent_pdfs.rb
39
+ - lib/docsplit/version.rb
39
40
  - vendor/conf/document-formats.js
40
41
  - vendor/jodconverter/commons-cli-1.1.jar
41
42
  - vendor/jodconverter/commons-io-1.4.jar
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
66
67
  version: '0'
67
68
  requirements: []
68
69
  rubyforge_project:
69
- rubygems_version: 2.4.5
70
+ rubygems_version: 2.4.5.1
70
71
  signing_key:
71
72
  specification_version: 4
72
73
  summary: Break Apart Documents into Images, Text, Pages and PDFs
73
74
  test_files: []
74
- has_rdoc: