burisu-docsplit 0.7.8 → 0.7.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
4
- data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
3
+ metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
4
+ data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
5
5
  SHA512:
6
- metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
7
- data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709
6
+ metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
7
+ data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644
@@ -2,4 +2,4 @@
2
2
 
3
3
  require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
4
4
 
5
- Docsplit::CommandLine.new
5
+ Docsplit::CommandLine.new
@@ -1,8 +1,12 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'docsplit/version'
4
+
1
5
  Gem::Specification.new do |s|
2
6
  s.name = 'burisu-docsplit'
3
- s.version = '0.7.8' # Keep version in sync with docsplit.rb
4
- s.homepage = "http://documentcloud.github.com/docsplit/"
5
- s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
7
+ s.version = Docsplit::VERSION # Keep version in sync with docsplit.rb
8
+ s.homepage = 'http://documentcloud.github.com/docsplit/'
9
+ s.summary = 'Break Apart Documents into Images, Text, Pages and PDFs'
6
10
  s.description = <<-EOS
7
11
  Docsplit is a command-line utility and Ruby library for splitting apart
8
12
  documents into their component parts: searchable UTF-8 plain text, page
@@ -1,22 +1,20 @@
1
1
  require 'tmpdir'
2
2
  require 'fileutils'
3
3
  require 'shellwords'
4
+ require 'docsplit/version'
4
5
 
5
6
  # The Docsplit module delegates to the Java PDF extractors.
6
7
  module Docsplit
7
-
8
- VERSION = '0.7.6' # Keep in sync with gemspec.
9
-
10
- ESCAPE = lambda {|x| Shellwords.shellescape(x) }
8
+ ESCAPE = ->(x) { Shellwords.shellescape(x) }
11
9
 
12
10
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
11
  ESCAPED_ROOT = ESCAPE[ROOT]
14
12
 
15
- METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
16
-
17
- GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
13
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
14
+
15
+ GM_FORMATS = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
18
16
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
17
+ DEPENDENCIES = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
20
18
 
21
19
  # Check for all dependencies, and note their absence.
22
20
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -32,28 +30,28 @@ module Docsplit
32
30
  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
33
31
  if DEPENDENCIES[:tesseract]
34
32
  # osd will be listed in tesseract --listlangs
35
- val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
33
+ val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
36
34
  DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
35
  end
38
36
 
39
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
37
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
40
38
  # broke.
41
39
  class ExtractionFailed < StandardError; end
42
40
 
43
41
  # Use the ExtractPages Java class to burst a PDF into single pages.
44
- def self.extract_pages(pdfs, opts={})
42
+ def self.extract_pages(pdfs, opts = {})
45
43
  pdfs = ensure_pdfs(pdfs)
46
44
  PageExtractor.new.extract(pdfs, opts)
47
45
  end
48
46
 
49
47
  # Use the ExtractText Java class to write out all embedded text.
50
- def self.extract_text(pdfs, opts={})
48
+ def self.extract_text(pdfs, opts = {})
51
49
  pdfs = ensure_pdfs(pdfs)
52
50
  TextExtractor.new.extract(pdfs, opts)
53
51
  end
54
52
 
55
53
  # Use the ExtractImages Java class to rasterize a PDF into each page's image.
56
- def self.extract_images(pdfs, opts={})
54
+ def self.extract_images(pdfs, opts = {})
57
55
  pdfs = ensure_pdfs(pdfs)
58
56
  opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
59
57
  ImageExtractor.new.extract(pdfs, opts)
@@ -61,7 +59,7 @@ module Docsplit
61
59
 
62
60
  # Use JODCConverter to extract the documents as PDFs.
63
61
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
64
- def self.extract_pdf(docs, opts={})
62
+ def self.extract_pdf(docs, opts = {})
65
63
  PdfExtractor.new.extract(docs, opts)
66
64
  end
67
65
 
@@ -75,8 +73,8 @@ module Docsplit
75
73
  end
76
74
  EOS
77
75
  end
78
-
79
- def self.extract_info(pdfs, opts={})
76
+
77
+ def self.extract_info(pdfs, opts = {})
80
78
  pdfs = ensure_pdfs(pdfs)
81
79
  InfoExtractor.new.extract_all(pdfs, opts)
82
80
  end
@@ -93,11 +91,10 @@ module Docsplit
93
91
  def self.normalize_value(value)
94
92
  case value
95
93
  when Range then value.to_a.join(',')
96
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
94
+ when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
97
95
  else value.to_s
98
96
  end
99
97
  end
100
-
101
98
  end
102
99
 
103
100
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
@@ -2,11 +2,9 @@ require 'optparse'
2
2
  require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
3
 
4
4
  module Docsplit
5
-
6
5
  # A single command-line utility to separate a PDF into all its component parts.
7
6
  class CommandLine
8
-
9
- BANNER = <<-EOS
7
+ BANNER = <<-EOS.freeze
10
8
  docsplit breaks apart documents into images, text, or individual pages.
11
9
  It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
10
 
@@ -39,24 +37,22 @@ Options:
39
37
 
40
38
  # Delegate to the Docsplit Ruby API to perform all extractions.
41
39
  def run
42
- begin
43
- case @command
44
- when :images then Docsplit.extract_images(ARGV, @options)
45
- when :pages then Docsplit.extract_pages(ARGV, @options)
46
- when :text then Docsplit.extract_text(ARGV, @options)
47
- when :pdf then Docsplit.extract_pdf(ARGV, @options)
40
+ case @command
41
+ when :images then Docsplit.extract_images(ARGV, @options)
42
+ when :pages then Docsplit.extract_pages(ARGV, @options)
43
+ when :text then Docsplit.extract_text(ARGV, @options)
44
+ when :pdf then Docsplit.extract_pdf(ARGV, @options)
45
+ else
46
+ if METADATA_KEYS.include?(@command)
47
+ value = Docsplit.send("extract_#{@command}", ARGV, @options)
48
+ puts value unless value.nil?
48
49
  else
49
- if METADATA_KEYS.include?(@command)
50
- value = Docsplit.send("extract_#{@command}", ARGV, @options)
51
- puts value unless value.nil?
52
- else
53
- usage
54
- end
50
+ usage
55
51
  end
56
- rescue ExtractionFailed => e
57
- puts e.message.chomp
58
- exit(1)
59
52
  end
53
+ rescue ExtractionFailed => e
54
+ puts e.message.chomp
55
+ exit(1)
60
56
  end
61
57
 
62
58
  # Print out the usage help message.
@@ -65,18 +61,17 @@ Options:
65
61
  exit
66
62
  end
67
63
 
68
-
69
64
  private
70
65
 
71
66
  # Use the OptionParser library to parse out all supported options. Return
72
67
  # options formatted for the Ruby API.
73
68
  def parse_options
74
- @options = {:ocr => :default, :clean => true}
69
+ @options = { ocr: :default, clean: true }
75
70
  @option_parser = OptionParser.new do |opts|
76
71
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
72
  @options[:output] = d
78
73
  end
79
- opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
74
+ opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
80
75
  @options[:pages] = p
81
76
  end
82
77
  opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
@@ -91,16 +86,16 @@ Options:
91
86
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
92
87
  @options[:ocr] = o
93
88
  end
94
- opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
89
+ opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
95
90
  @options[:clean] = false
96
91
  end
97
92
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
93
  @options[:language] = l
99
94
  end
100
- opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
95
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
101
96
  @options[:detect_orientation] = false
102
97
  end
103
- opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
98
+ opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
104
99
  @options[:rolling] = true
105
100
  end
106
101
  opts.on_tail('-v', '--version', 'display docsplit version') do
@@ -119,7 +114,5 @@ Options:
119
114
  exit(1)
120
115
  end
121
116
  end
122
-
123
117
  end
124
-
125
- end
118
+ end
@@ -1,12 +1,10 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to GraphicsMagick in order to convert PDF documents into
4
3
  # nicely sized images.
5
4
  class ImageExtractor
6
-
7
- MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
5
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
8
6
  DEFAULT_FORMAT = :png
9
- DEFAULT_DENSITY = '150'
7
+ DEFAULT_DENSITY = '150'.freeze
10
8
 
11
9
  # Extract a list of PDFs as rasterized page images, according to the
12
10
  # configuration in options.
@@ -15,8 +13,8 @@ module Docsplit
15
13
  extract_options(options)
16
14
  @pdfs.each do |pdf|
17
15
  previous = nil
18
- @sizes.each_with_index do |size, i|
19
- @formats.each {|format| convert(pdf, size, format, previous) }
16
+ @sizes.each_with_index do |size, _i|
17
+ @formats.each { |format| convert(pdf, size, format, previous) }
20
18
  previous = size if @rolling
21
19
  end
22
20
  end
@@ -27,36 +25,35 @@ module Docsplit
27
25
  # we simply downsample that image, instead of re-rendering the entire PDF.
28
26
  # Now we generate one page at a time, a counterintuitive opimization
29
27
  # suggested by the GraphicsMagick list, that seems to work quite well.
30
- def convert(pdf, size, format, previous=nil)
28
+ def convert(pdf, size, format, previous = nil)
31
29
  tempdir = Dir.mktmpdir
32
30
  basename = File.basename(pdf, File.extname(pdf))
33
31
  directory = directory_for(size)
34
32
  pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35
33
  escaped_pdf = ESCAPE[pdf]
36
- FileUtils.mkdir_p(directory) unless File.exists?(directory)
37
- common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
34
+ FileUtils.mkdir_p(directory) unless File.exist?(directory)
35
+ common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
38
36
  if previous
39
37
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
40
38
  result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
41
- raise ExtractionFailed, result if $? != 0
39
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
42
40
  else
43
41
  page_list(pages).each do |page|
44
- out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
42
+ out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
43
  cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
46
44
  result = `#{cmd}`.chomp
47
- raise ExtractionFailed, result if $? != 0
45
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
48
46
  end
49
47
  end
50
48
  ensure
51
- FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
49
+ FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
52
50
  end
53
51
 
54
-
55
52
  private
56
53
 
57
54
  # Extract the relevant GraphicsMagick options from the options hash.
58
55
  def extract_options(options)
59
- @output = options[:output] || '.'
56
+ @output = options[:output] || '.'
60
57
  @pages = options[:pages]
61
58
  @density = options[:density] || DEFAULT_DENSITY
62
59
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
@@ -80,24 +77,22 @@ module Docsplit
80
77
  # Generate the appropriate quality argument for the image format.
81
78
  def quality_arg(format)
82
79
  case format.to_s
83
- when /jpe?g/ then "-quality 85"
84
- when /png/ then "-quality 100"
85
- else ""
80
+ when /jpe?g/ then '-quality 85'
81
+ when /png/ then '-quality 100'
82
+ else ''
86
83
  end
87
84
  end
88
85
 
89
86
  # Generate the expanded list of requested page numbers.
90
87
  def page_list(pages)
91
- pages.split(',').map { |range|
88
+ pages.split(',').map do |range|
92
89
  if range.include?('-')
93
90
  range = range.split('-')
94
- Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
91
+ Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
95
92
  else
96
93
  range.to_i
97
94
  end
98
- }.flatten.uniq.sort
95
+ end.flatten.uniq.sort
99
96
  end
100
-
101
97
  end
102
-
103
98
  end
@@ -1,36 +1,34 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdfinfo** in order to extract information about a PDF file.
4
3
  class InfoExtractor
5
-
6
4
  # Regex matchers for different bits of information.
7
5
  MATCHERS = {
8
- :author => /^Author:\s+([^\n]+)/,
9
- :date => /^CreationDate:\s+([^\n]+)/,
10
- :creator => /^Creator:\s+([^\n]+)/,
11
- :keywords => /^Keywords:\s+([^\n]+)/,
12
- :producer => /^Producer:\s+([^\n]+)/,
13
- :subject => /^Subject:\s+([^\n]+)/,
14
- :title => /^Title:\s+([^\n]+)/,
15
- :length => /^Pages:\s+([^\n]+)/,
16
- }
6
+ author: /^Author:\s+([^\n]+)/,
7
+ date: /^CreationDate:\s+([^\n]+)/,
8
+ creator: /^Creator:\s+([^\n]+)/,
9
+ keywords: /^Keywords:\s+([^\n]+)/,
10
+ producer: /^Producer:\s+([^\n]+)/,
11
+ subject: /^Subject:\s+([^\n]+)/,
12
+ title: /^Title:\s+([^\n]+)/,
13
+ length: /^Pages:\s+([^\n]+)/
14
+ }.freeze
17
15
 
18
16
  # Pull out a single datum from a pdf.
19
17
  def extract(key, pdfs, opts)
20
18
  extract_all(pdfs, opts)[key]
21
19
  end
22
-
23
- def extract_all(pdfs, opts)
20
+
21
+ def extract_all(pdfs, _opts)
24
22
  pdf = [pdfs].flatten.first
25
23
  cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
26
24
  result = `#{cmd}`.chomp
27
- raise ExtractionFailed, result if $? != 0
25
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
28
26
  # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
27
  if String.method_defined?(:encode)
30
- result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
28
+ result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
31
29
  else
32
30
  require 'iconv' unless defined?(Iconv)
33
- ic = Iconv.new('UTF-8//IGNORE','UTF-8')
31
+ ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
34
32
  result = ic.iconv(result)
35
33
  end
36
34
  info = {}
@@ -44,7 +42,5 @@ module Docsplit
44
42
  end
45
43
  info
46
44
  end
47
-
48
45
  end
49
-
50
46
  end
@@ -1,36 +1,31 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdftk** in order to create bursted single pages from
4
3
  # a PDF document.
5
4
  class PageExtractor
6
-
7
5
  # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
8
6
  def extract(pdfs, opts)
9
7
  extract_options opts
10
8
  [pdfs].flatten.each do |pdf|
11
9
  pdf_name = File.basename(pdf, File.extname(pdf))
12
- page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13
- FileUtils.mkdir_p @output unless File.exists?(@output)
14
-
10
+ page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
11
+ FileUtils.mkdir_p @output unless File.exist?(@output)
12
+
15
13
  cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
16
- "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17
- else
18
- "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
14
+ "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
15
+ else
16
+ "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19
17
  end
20
18
  result = `#{cmd}`.chomp
21
- FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
22
- raise ExtractionFailed, result if $? != 0
19
+ FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
20
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
23
21
  result
24
22
  end
25
23
  end
26
24
 
27
-
28
25
  private
29
26
 
30
27
  def extract_options(options)
31
28
  @output = options[:output] || '.'
32
29
  end
33
-
34
30
  end
35
-
36
31
  end
@@ -6,22 +6,24 @@ module Docsplit
6
6
  @@version_string = nil
7
7
 
8
8
  # Provide a set of helper functions to determine the OS.
9
- HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
9
+ HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
10
10
  def windows?
11
11
  !!HOST_OS.match(/mswin|windows|cygwin/i)
12
12
  end
13
+
13
14
  def osx?
14
15
  !!HOST_OS.match(/darwin/i)
15
16
  end
17
+
16
18
  def linux?
17
19
  !!HOST_OS.match(/linux/i)
18
20
  end
19
-
21
+
20
22
  # The first line of the help output holds the name and version number
21
23
  # of the office software to be used for extraction.
22
24
  def version_string
23
25
  unless @@version_string
24
- null = windows? ? "NUL" : "/dev/null"
26
+ null = windows? ? 'NUL' : '/dev/null'
25
27
  @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
26
28
  if !!@@version_string.to_s.match(/[0-9]*/)
27
29
  @@version_string = `#{office_executable} --version`.split("\n").first
@@ -29,23 +31,25 @@ module Docsplit
29
31
  end
30
32
  @@version_string
31
33
  end
34
+
32
35
  def libre_office?
33
36
  !!version_string.match(/^LibreOffice/)
34
37
  end
38
+
35
39
  def open_office?
36
40
  !!version_string.match(/^OpenOffice.org/)
37
41
  end
38
-
42
+
39
43
  # A set of default locations to search for office software
40
44
  # These have been extracted from JODConverter. Each listed
41
- # path should contain a directory "program" which in turn
45
+ # path should contain a directory "program" which in turn
42
46
  # contains the "soffice" executable.
43
47
  # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
44
48
  def office_search_paths
45
49
  if windows?
46
- office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
47
- program_files_path = ENV["CommonProgramFiles"]
48
- search_paths = office_names.map{ |program| File.join(program_files_path, program) }
50
+ office_names = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
51
+ program_files_path = ENV['CommonProgramFiles']
52
+ search_paths = office_names.map { |program| File.join(program_files_path, program) }
49
53
  elsif osx?
50
54
  search_paths = %w(
51
55
  /Applications/LibreOffice.app/Contents
@@ -69,7 +73,7 @@ module Docsplit
69
73
  end
70
74
  search_paths
71
75
  end
72
-
76
+
73
77
  # Identify the path to a working office executable.
74
78
  def office_executable
75
79
  paths = office_search_paths
@@ -78,45 +82,45 @@ module Docsplit
78
82
  # raise an error if that path isn't valid, otherwise, add
79
83
  # it to the front of our search paths.
80
84
  if ENV['OFFICE_PATH']
81
- raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
85
+ raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
82
86
  paths.unshift(ENV['OFFICE_PATH'])
83
87
  end
84
-
88
+
85
89
  # The location of the office executable is OS dependent
86
- path_pieces = ["soffice"]
90
+ path_pieces = ['soffice']
87
91
  if windows?
88
- path_pieces += [["program", "soffice.bin"]]
92
+ path_pieces += [['program', 'soffice.bin']]
89
93
  elsif osx?
90
- path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
94
+ path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
91
95
  else
92
- path_pieces += [["program", "soffice"]]
96
+ path_pieces += [%w(program soffice)]
93
97
  end
94
-
98
+
95
99
  # Search for the first suitable office executable
96
100
  # and short circuit an executable is found.
97
101
  paths.each do |path|
98
- if File.exists? path
102
+ if File.exist? path
99
103
  @@executable ||= path unless File.directory? path
100
104
  path_pieces.each do |pieces|
101
105
  check_path = File.join(path, pieces)
102
- @@executable ||= check_path if File.exists? check_path
106
+ @@executable ||= check_path if File.exist? check_path
103
107
  end
104
108
  end
105
109
  break if @@executable
106
110
  end
107
- raise OfficeNotFound, "No office software found" unless @@executable
111
+ raise OfficeNotFound, 'No office software found' unless @@executable
108
112
  @@executable
109
113
  end
110
-
114
+
111
115
  # Used to specify the office location for JODConverter
112
116
  def office_path
113
117
  File.dirname(File.dirname(office_executable))
114
118
  end
115
-
119
+
116
120
  # Convert documents to PDF.
117
121
  def extract(docs, opts)
118
122
  out = opts[:output] || '.'
119
- FileUtils.mkdir_p out unless File.exists?(out)
123
+ FileUtils.mkdir_p out unless File.exist?(out)
120
124
  [docs].flatten.each do |doc|
121
125
  ext = File.extname(doc)
122
126
  basename = File.basename(doc, ext)
@@ -127,12 +131,12 @@ module Docsplit
127
131
  else
128
132
  if libre_office?
129
133
  # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
130
- ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
131
-
134
+ ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
135
+
132
136
  options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
133
137
  cmd = "#{office_executable} #{options} 2>&1"
134
138
  result = `#{cmd}`.chomp
135
- raise ExtractionFailed, result if $? != 0
139
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
136
140
  true
137
141
  else # open office presumably, rely on JODConverter to figure it out.
138
142
  options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
@@ -142,23 +146,22 @@ module Docsplit
142
146
  end
143
147
  end
144
148
 
145
- CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
149
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
146
150
 
147
- LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
151
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
152
+
153
+ HEADLESS = '-Djava.awt.headless=true'.freeze
148
154
 
149
- HEADLESS = "-Djava.awt.headless=true"
150
-
151
155
  private
152
-
153
- # Runs a Java command, with quieted logging, and the classpath set properly.
154
- def run_jod(command, pdfs, opts, return_output=false)
155
156
 
156
- pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
157
+ # Runs a Java command, with quieted logging, and the classpath set properly.
158
+ def run_jod(command, pdfs, _opts, return_output = false)
159
+ pdfs = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
157
160
  office = osx? ? "-Doffice.home=#{office_path}" : office_path
158
161
  cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
159
162
  result = `#{cmd}`.chomp
160
- raise ExtractionFailed, result if $? != 0
161
- return return_output ? (result.empty? ? nil : result) : true
163
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
164
+ return_output ? (result.empty? ? nil : result) : true
162
165
  end
163
166
 
164
167
  class OfficeNotFound < StandardError; end
@@ -1,7 +1,6 @@
1
1
  require 'strscan'
2
2
 
3
3
  module Docsplit
4
-
5
4
  # Cleans up OCR'd text by using a series of heuristics to remove garbage
6
5
  # words. Algorithms taken from:
7
6
  #
@@ -13,7 +12,6 @@ module Docsplit
13
12
  # -- Kulp
14
13
  #
15
14
  class TextCleaner
16
-
17
15
  # Cached regexes we plan on using.
18
16
  WORD = /\S+/
19
17
  SPACE = /\s+/
@@ -36,7 +34,7 @@ module Docsplit
36
34
  # multibyte-aware version, coercing to ASCII first.
37
35
  def clean(text)
38
36
  if String.method_defined?(:encode)
39
- text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
37
+ text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
40
38
  else
41
39
  require 'iconv' unless defined?(Iconv)
42
40
  text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
@@ -67,33 +65,31 @@ module Docsplit
67
65
  # More than 30 bytes in length.
68
66
  (w.length > 30) ||
69
67
 
70
- # If there are three or more identical characters in a row in the string.
71
- (w =~ REPEAT) ||
68
+ # If there are three or more identical characters in a row in the string.
69
+ (w =~ REPEAT) ||
72
70
 
73
- # More punctuation than alpha numerics.
74
- (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
71
+ # More punctuation than alpha numerics.
72
+ (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
75
73
 
76
- # Ignoring the first and last characters in the string, if there are three or
77
- # more different punctuation characters in the string.
78
- (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
+ # Ignoring the first and last characters in the string, if there are three or
75
+ # more different punctuation characters in the string.
76
+ (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
79
77
 
80
- # Four or more consecutive vowels, or five or more consecutive consonants.
81
- ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
78
+ # Four or more consecutive vowels, or five or more consecutive consonants.
79
+ ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
82
80
 
83
- # Number of uppercase letters greater than lowercase letters, but the word is
84
- # not all uppercase + punctuation.
85
- (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
81
+ # Number of uppercase letters greater than lowercase letters, but the word is
82
+ # not all uppercase + punctuation.
83
+ (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
86
84
 
87
- # Single letters that are not A or I.
88
- (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
85
+ # Single letters that are not A or I.
86
+ (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
89
87
 
90
- # All characters are alphabetic and there are 8 times more vowels than
91
- # consonants, or 8 times more consonants than vowels.
92
- (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
93
- (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
94
- (cons > vows * 8)))
88
+ # All characters are alphabetic and there are 8 times more vowels than
89
+ # consonants, or 8 times more consonants than vowels.
90
+ (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
91
+ (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
92
+ (cons > vows * 8)))
95
93
  end
96
-
97
94
  end
98
-
99
95
  end
@@ -1,5 +1,4 @@
1
1
  module Docsplit
2
-
3
2
  # Delegates to **pdftotext** and **tesseract** in order to extract text from
4
3
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
5
4
  # forbid OCR extraction, but by default the heuristic works like this:
@@ -13,11 +12,10 @@ module Docsplit
13
12
  # * Re-OCR each page in the `@pages_to_ocr` list at the end.
14
13
  #
15
14
  class TextExtractor
16
-
17
15
  NO_TEXT_DETECTED = /---------\n\Z/
18
16
 
19
- OCR_FLAGS = '-density 400x400 -colorspace GRAY'
20
- MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
17
+ OCR_FLAGS = '-density 400x400 -colorspace GRAY'.freeze
18
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
21
19
 
22
20
  MIN_TEXT_PER_PAGE = 100 # in bytes
23
21
 
@@ -28,10 +26,10 @@ module Docsplit
28
26
  # Extract text from a list of PDFs.
29
27
  def extract(pdfs, opts)
30
28
  extract_options opts
31
- FileUtils.mkdir_p @output unless File.exists?(@output)
29
+ FileUtils.mkdir_p @output unless File.exist?(@output)
32
30
  [pdfs].flatten.each do |pdf|
33
31
  @pdf_name = File.basename(pdf, File.extname(pdf))
34
- pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
32
+ pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
35
33
  if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
36
34
  extract_from_ocr(pdf, pages)
37
35
  else
@@ -52,7 +50,7 @@ module Docsplit
52
50
  # Extract a page range worth of text from a PDF, directly.
53
51
  def extract_from_pdf(pdf, pages)
54
52
  return extract_full(pdf) unless pages
55
- pages.each {|page| extract_page(pdf, page) }
53
+ pages.each { |page| extract_page(pdf, page) }
56
54
  end
57
55
 
58
56
  # Extract a page range worth of text from a PDF via OCR.
@@ -60,7 +58,7 @@ module Docsplit
60
58
  tempdir = Dir.mktmpdir
61
59
  base_path = File.join(@output, @pdf_name)
62
60
  escaped_pdf = ESCAPE[pdf]
63
- psm = @detect_orientation ? "-psm 1" : ""
61
+ psm = @detect_orientation ? '-psm 1' : ''
64
62
  if pages
65
63
  pages.each do |page|
66
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
@@ -75,15 +73,14 @@ module Docsplit
75
73
  tiff = "#{tempdir}/#{@pdf_name}.tif"
76
74
  escaped_tiff = ESCAPE[tiff]
77
75
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
78
- #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
76
+ # if the user says don't do orientation detection or the plugin is not installed, set psm to 0
79
77
  run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
80
78
  clean_text(base_path + '.txt') if @clean_ocr
81
79
  end
82
80
  ensure
83
- FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
81
+ FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
84
82
  end
85
83
 
86
-
87
84
  private
88
85
 
89
86
  def clean_text(file)
@@ -98,7 +95,7 @@ module Docsplit
98
95
  # Run an external process and raise an exception if it fails.
99
96
  def run(command)
100
97
  result = `#{command}`
101
- raise ExtractionFailed, result if $? != 0
98
+ raise ExtractionFailed, result if $?.exitstatus.nonzero?
102
99
  result
103
100
  end
104
101
 
@@ -124,10 +121,8 @@ module Docsplit
124
121
  @force_ocr = options[:ocr] == true
125
122
  @forbid_ocr = options[:ocr] == false
126
123
  @language = options[:language] || 'eng'
127
- @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
- @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
124
+ @clean_ocr = (!(options[:clean] == false) && @language == 'eng')
125
+ @detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
129
126
  end
130
-
131
127
  end
132
-
133
128
  end
@@ -1,9 +1,7 @@
1
1
  module Docsplit
2
-
3
2
  # Include a method to transparently convert non-PDF arguments to temporary
4
3
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
5
4
  module TransparentPDFs
6
-
7
5
  # Temporarily convert any non-PDF documents to PDFs before running them
8
6
  # through further extraction.
9
7
  def ensure_pdfs(docs)
@@ -12,18 +10,16 @@ module Docsplit
12
10
  doc
13
11
  else
14
12
  tempdir = File.join(Dir.tmpdir, 'docsplit')
15
- extract_pdf([doc], {:output => tempdir})
13
+ extract_pdf([doc], output: tempdir)
16
14
  File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
17
15
  end
18
16
  end
19
17
  end
20
18
 
21
19
  def is_pdf?(doc)
22
- File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
20
+ File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
23
21
  end
24
-
25
22
  end
26
23
 
27
24
  extend TransparentPDFs
28
-
29
25
  end
@@ -0,0 +1,3 @@
1
+ module Docsplit
2
+ VERSION = '0.7.9'.freeze
3
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: burisu-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.7.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2015-06-26 00:00:00.000000000 Z
13
+ date: 2016-09-06 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart
@@ -36,6 +36,7 @@ files:
36
36
  - lib/docsplit/text_cleaner.rb
37
37
  - lib/docsplit/text_extractor.rb
38
38
  - lib/docsplit/transparent_pdfs.rb
39
+ - lib/docsplit/version.rb
39
40
  - vendor/conf/document-formats.js
40
41
  - vendor/jodconverter/commons-cli-1.1.jar
41
42
  - vendor/jodconverter/commons-io-1.4.jar
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
66
67
  version: '0'
67
68
  requirements: []
68
69
  rubyforge_project:
69
- rubygems_version: 2.4.5
70
+ rubygems_version: 2.4.5.1
70
71
  signing_key:
71
72
  specification_version: 4
72
73
  summary: Break Apart Documents into Images, Text, Pages and PDFs
73
74
  test_files: []
74
- has_rdoc: