burisu-docsplit 0.7.8 → 0.7.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/docsplit +1 -1
- data/docsplit.gemspec +7 -3
- data/lib/docsplit.rb +15 -18
- data/lib/docsplit/command_line.rb +20 -27
- data/lib/docsplit/image_extractor.rb +18 -23
- data/lib/docsplit/info_extractor.rb +14 -18
- data/lib/docsplit/page_extractor.rb +8 -13
- data/lib/docsplit/pdf_extractor.rb +38 -35
- data/lib/docsplit/text_cleaner.rb +20 -24
- data/lib/docsplit/text_extractor.rb +11 -16
- data/lib/docsplit/transparent_pdfs.rb +2 -6
- data/lib/docsplit/version.rb +3 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
|
4
|
+
data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
|
7
|
+
data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644
|
data/bin/docsplit
CHANGED
data/docsplit.gemspec
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
require 'docsplit/version'
|
4
|
+
|
1
5
|
Gem::Specification.new do |s|
|
2
6
|
s.name = 'burisu-docsplit'
|
3
|
-
s.version =
|
4
|
-
s.homepage =
|
5
|
-
s.summary =
|
7
|
+
s.version = Docsplit::VERSION # Keep version in sync with docsplit.rb
|
8
|
+
s.homepage = 'http://documentcloud.github.com/docsplit/'
|
9
|
+
s.summary = 'Break Apart Documents into Images, Text, Pages and PDFs'
|
6
10
|
s.description = <<-EOS
|
7
11
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
8
12
|
documents into their component parts: searchable UTF-8 plain text, page
|
data/lib/docsplit.rb
CHANGED
@@ -1,22 +1,20 @@
|
|
1
1
|
require 'tmpdir'
|
2
2
|
require 'fileutils'
|
3
3
|
require 'shellwords'
|
4
|
+
require 'docsplit/version'
|
4
5
|
|
5
6
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
7
|
module Docsplit
|
7
|
-
|
8
|
-
VERSION = '0.7.6' # Keep in sync with gemspec.
|
9
|
-
|
10
|
-
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
8
|
+
ESCAPE = ->(x) { Shellwords.shellescape(x) }
|
11
9
|
|
12
10
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
11
|
ESCAPED_ROOT = ESCAPE[ROOT]
|
14
12
|
|
15
|
-
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
16
|
-
|
17
|
-
GM_FORMATS = [
|
13
|
+
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
|
14
|
+
|
15
|
+
GM_FORMATS = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
|
18
16
|
|
19
|
-
DEPENDENCIES = {:
|
17
|
+
DEPENDENCIES = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
|
20
18
|
|
21
19
|
# Check for all dependencies, and note their absence.
|
22
20
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -32,28 +30,28 @@ module Docsplit
|
|
32
30
|
# if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
|
33
31
|
if DEPENDENCIES[:tesseract]
|
34
32
|
# osd will be listed in the output of `tesseract --list-langs`
|
35
|
-
val =
|
33
|
+
val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
|
36
34
|
DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
|
37
35
|
end
|
38
36
|
|
39
|
-
|
37
|
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
40
38
|
# broken.
|
41
39
|
class ExtractionFailed < StandardError; end
|
42
40
|
|
43
41
|
# Use the ExtractPages Java class to burst a PDF into single pages.
|
44
|
-
def self.extract_pages(pdfs, opts={})
|
42
|
+
def self.extract_pages(pdfs, opts = {})
|
45
43
|
pdfs = ensure_pdfs(pdfs)
|
46
44
|
PageExtractor.new.extract(pdfs, opts)
|
47
45
|
end
|
48
46
|
|
49
47
|
# Use the ExtractText Java class to write out all embedded text.
|
50
|
-
def self.extract_text(pdfs, opts={})
|
48
|
+
def self.extract_text(pdfs, opts = {})
|
51
49
|
pdfs = ensure_pdfs(pdfs)
|
52
50
|
TextExtractor.new.extract(pdfs, opts)
|
53
51
|
end
|
54
52
|
|
55
53
|
# Use the ExtractImages Java class to rasterize a PDF into each page's image.
|
56
|
-
def self.extract_images(pdfs, opts={})
|
54
|
+
def self.extract_images(pdfs, opts = {})
|
57
55
|
pdfs = ensure_pdfs(pdfs)
|
58
56
|
opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
|
59
57
|
ImageExtractor.new.extract(pdfs, opts)
|
@@ -61,7 +59,7 @@ module Docsplit
|
|
61
59
|
|
62
60
|
# Use JODConverter to extract the documents as PDFs.
|
63
61
|
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
64
|
-
def self.extract_pdf(docs, opts={})
|
62
|
+
def self.extract_pdf(docs, opts = {})
|
65
63
|
PdfExtractor.new.extract(docs, opts)
|
66
64
|
end
|
67
65
|
|
@@ -75,8 +73,8 @@ module Docsplit
|
|
75
73
|
end
|
76
74
|
EOS
|
77
75
|
end
|
78
|
-
|
79
|
-
def self.extract_info(pdfs, opts={})
|
76
|
+
|
77
|
+
def self.extract_info(pdfs, opts = {})
|
80
78
|
pdfs = ensure_pdfs(pdfs)
|
81
79
|
InfoExtractor.new.extract_all(pdfs, opts)
|
82
80
|
end
|
@@ -93,11 +91,10 @@ module Docsplit
|
|
93
91
|
def self.normalize_value(value)
|
94
92
|
case value
|
95
93
|
when Range then value.to_a.join(',')
|
96
|
-
when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
|
94
|
+
when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
|
97
95
|
else value.to_s
|
98
96
|
end
|
99
97
|
end
|
100
|
-
|
101
98
|
end
|
102
99
|
|
103
100
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
@@ -2,11 +2,9 @@ require 'optparse'
|
|
2
2
|
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
3
|
|
4
4
|
module Docsplit
|
5
|
-
|
6
5
|
# A single command-line utility to separate a PDF into all its component parts.
|
7
6
|
class CommandLine
|
8
|
-
|
9
|
-
BANNER = <<-EOS
|
7
|
+
BANNER = <<-EOS.freeze
|
10
8
|
docsplit breaks apart documents into images, text, or individual pages.
|
11
9
|
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
|
12
10
|
|
@@ -39,24 +37,22 @@ Options:
|
|
39
37
|
|
40
38
|
# Delegate to the Docsplit Ruby API to perform all extractions.
|
41
39
|
def run
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
40
|
+
case @command
|
41
|
+
when :images then Docsplit.extract_images(ARGV, @options)
|
42
|
+
when :pages then Docsplit.extract_pages(ARGV, @options)
|
43
|
+
when :text then Docsplit.extract_text(ARGV, @options)
|
44
|
+
when :pdf then Docsplit.extract_pdf(ARGV, @options)
|
45
|
+
else
|
46
|
+
if METADATA_KEYS.include?(@command)
|
47
|
+
value = Docsplit.send("extract_#{@command}", ARGV, @options)
|
48
|
+
puts value unless value.nil?
|
48
49
|
else
|
49
|
-
|
50
|
-
value = Docsplit.send("extract_#{@command}", ARGV, @options)
|
51
|
-
puts value unless value.nil?
|
52
|
-
else
|
53
|
-
usage
|
54
|
-
end
|
50
|
+
usage
|
55
51
|
end
|
56
|
-
rescue ExtractionFailed => e
|
57
|
-
puts e.message.chomp
|
58
|
-
exit(1)
|
59
52
|
end
|
53
|
+
rescue ExtractionFailed => e
|
54
|
+
puts e.message.chomp
|
55
|
+
exit(1)
|
60
56
|
end
|
61
57
|
|
62
58
|
# Print out the usage help message.
|
@@ -65,18 +61,17 @@ Options:
|
|
65
61
|
exit
|
66
62
|
end
|
67
63
|
|
68
|
-
|
69
64
|
private
|
70
65
|
|
71
66
|
# Use the OptionParser library to parse out all supported options. Return
|
72
67
|
# options formatted for the Ruby API.
|
73
68
|
def parse_options
|
74
|
-
@options = {:
|
69
|
+
@options = { ocr: :default, clean: true }
|
75
70
|
@option_parser = OptionParser.new do |opts|
|
76
71
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
77
72
|
@options[:output] = d
|
78
73
|
end
|
79
|
-
opts.on('-p', '--pages [PAGES]',
|
74
|
+
opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
|
80
75
|
@options[:pages] = p
|
81
76
|
end
|
82
77
|
opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
|
@@ -91,16 +86,16 @@ Options:
|
|
91
86
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
92
87
|
@options[:ocr] = o
|
93
88
|
end
|
94
|
-
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |
|
89
|
+
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
|
95
90
|
@options[:clean] = false
|
96
91
|
end
|
97
92
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
93
|
@options[:language] = l
|
99
94
|
end
|
100
|
-
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |
|
95
|
+
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
|
101
96
|
@options[:detect_orientation] = false
|
102
97
|
end
|
103
|
-
opts.on('-r', '--rolling', 'generate images from each previous image') do |
|
98
|
+
opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
|
104
99
|
@options[:rolling] = true
|
105
100
|
end
|
106
101
|
opts.on_tail('-v', '--version', 'display docsplit version') do
|
@@ -119,7 +114,5 @@ Options:
|
|
119
114
|
exit(1)
|
120
115
|
end
|
121
116
|
end
|
122
|
-
|
123
117
|
end
|
124
|
-
|
125
|
-
end
|
118
|
+
end
|
@@ -1,12 +1,10 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to GraphicsMagick in order to convert PDF documents into
|
4
3
|
# nicely sized images.
|
5
4
|
class ImageExtractor
|
6
|
-
|
7
|
-
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
5
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
|
8
6
|
DEFAULT_FORMAT = :png
|
9
|
-
DEFAULT_DENSITY = '150'
|
7
|
+
DEFAULT_DENSITY = '150'.freeze
|
10
8
|
|
11
9
|
# Extract a list of PDFs as rasterized page images, according to the
|
12
10
|
# configuration in options.
|
@@ -15,8 +13,8 @@ module Docsplit
|
|
15
13
|
extract_options(options)
|
16
14
|
@pdfs.each do |pdf|
|
17
15
|
previous = nil
|
18
|
-
@sizes.each_with_index do |size,
|
19
|
-
@formats.each {|format| convert(pdf, size, format, previous) }
|
16
|
+
@sizes.each_with_index do |size, _i|
|
17
|
+
@formats.each { |format| convert(pdf, size, format, previous) }
|
20
18
|
previous = size if @rolling
|
21
19
|
end
|
22
20
|
end
|
@@ -27,36 +25,35 @@ module Docsplit
|
|
27
25
|
# we simply downsample that image, instead of re-rendering the entire PDF.
|
28
26
|
# Now we generate one page at a time, a counterintuitive optimization
|
29
27
|
# suggested by the GraphicsMagick list, that seems to work quite well.
|
30
|
-
def convert(pdf, size, format, previous=nil)
|
28
|
+
def convert(pdf, size, format, previous = nil)
|
31
29
|
tempdir = Dir.mktmpdir
|
32
30
|
basename = File.basename(pdf, File.extname(pdf))
|
33
31
|
directory = directory_for(size)
|
34
32
|
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
35
33
|
escaped_pdf = ESCAPE[pdf]
|
36
|
-
FileUtils.mkdir_p(directory) unless File.
|
37
|
-
common
|
34
|
+
FileUtils.mkdir_p(directory) unless File.exist?(directory)
|
35
|
+
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
|
38
36
|
if previous
|
39
37
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
40
38
|
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
41
|
-
raise ExtractionFailed, result if
|
39
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
42
40
|
else
|
43
41
|
page_list(pages).each do |page|
|
44
|
-
out_file
|
42
|
+
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
43
|
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
46
44
|
result = `#{cmd}`.chomp
|
47
|
-
raise ExtractionFailed, result if
|
45
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
48
46
|
end
|
49
47
|
end
|
50
48
|
ensure
|
51
|
-
FileUtils.remove_entry_secure tempdir if File.
|
49
|
+
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
|
52
50
|
end
|
53
51
|
|
54
|
-
|
55
52
|
private
|
56
53
|
|
57
54
|
# Extract the relevant GraphicsMagick options from the options hash.
|
58
55
|
def extract_options(options)
|
59
|
-
@output = options[:output]
|
56
|
+
@output = options[:output] || '.'
|
60
57
|
@pages = options[:pages]
|
61
58
|
@density = options[:density] || DEFAULT_DENSITY
|
62
59
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
@@ -80,24 +77,22 @@ module Docsplit
|
|
80
77
|
# Generate the appropriate quality argument for the image format.
|
81
78
|
def quality_arg(format)
|
82
79
|
case format.to_s
|
83
|
-
when /jpe?g/ then
|
84
|
-
when /png/ then
|
85
|
-
else
|
80
|
+
when /jpe?g/ then '-quality 85'
|
81
|
+
when /png/ then '-quality 100'
|
82
|
+
else ''
|
86
83
|
end
|
87
84
|
end
|
88
85
|
|
89
86
|
# Generate the expanded list of requested page numbers.
|
90
87
|
def page_list(pages)
|
91
|
-
pages.split(',').map
|
88
|
+
pages.split(',').map do |range|
|
92
89
|
if range.include?('-')
|
93
90
|
range = range.split('-')
|
94
|
-
Range.new(range.first.to_i, range.last.to_i).to_a.map
|
91
|
+
Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
|
95
92
|
else
|
96
93
|
range.to_i
|
97
94
|
end
|
98
|
-
|
95
|
+
end.flatten.uniq.sort
|
99
96
|
end
|
100
|
-
|
101
97
|
end
|
102
|
-
|
103
98
|
end
|
@@ -1,36 +1,34 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdfinfo** in order to extract information about a PDF file.
|
4
3
|
class InfoExtractor
|
5
|
-
|
6
4
|
# Regex matchers for different bits of information.
|
7
5
|
MATCHERS = {
|
8
|
-
:
|
9
|
-
:
|
10
|
-
:
|
11
|
-
:
|
12
|
-
:
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:
|
16
|
-
}
|
6
|
+
author: /^Author:\s+([^\n]+)/,
|
7
|
+
date: /^CreationDate:\s+([^\n]+)/,
|
8
|
+
creator: /^Creator:\s+([^\n]+)/,
|
9
|
+
keywords: /^Keywords:\s+([^\n]+)/,
|
10
|
+
producer: /^Producer:\s+([^\n]+)/,
|
11
|
+
subject: /^Subject:\s+([^\n]+)/,
|
12
|
+
title: /^Title:\s+([^\n]+)/,
|
13
|
+
length: /^Pages:\s+([^\n]+)/
|
14
|
+
}.freeze
|
17
15
|
|
18
16
|
# Pull out a single datum from a pdf.
|
19
17
|
def extract(key, pdfs, opts)
|
20
18
|
extract_all(pdfs, opts)[key]
|
21
19
|
end
|
22
|
-
|
23
|
-
def extract_all(pdfs,
|
20
|
+
|
21
|
+
def extract_all(pdfs, _opts)
|
24
22
|
pdf = [pdfs].flatten.first
|
25
23
|
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
26
24
|
result = `#{cmd}`.chomp
|
27
|
-
raise ExtractionFailed, result if
|
25
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
28
26
|
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
27
|
if String.method_defined?(:encode)
|
30
|
-
result.encode!('UTF-8', 'binary', :
|
28
|
+
result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
|
31
29
|
else
|
32
30
|
require 'iconv' unless defined?(Iconv)
|
33
|
-
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
31
|
+
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
34
32
|
result = ic.iconv(result)
|
35
33
|
end
|
36
34
|
info = {}
|
@@ -44,7 +42,5 @@ module Docsplit
|
|
44
42
|
end
|
45
43
|
info
|
46
44
|
end
|
47
|
-
|
48
45
|
end
|
49
|
-
|
50
46
|
end
|
@@ -1,36 +1,31 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdftk** in order to create bursted single pages from
|
4
3
|
# a PDF document.
|
5
4
|
class PageExtractor
|
6
|
-
|
7
5
|
# Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
|
8
6
|
def extract(pdfs, opts)
|
9
7
|
extract_options opts
|
10
8
|
[pdfs].flatten.each do |pdf|
|
11
9
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
-
page_path = ESCAPE[File.join(@output,
|
13
|
-
FileUtils.mkdir_p @output unless File.
|
14
|
-
|
10
|
+
page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
|
11
|
+
FileUtils.mkdir_p @output unless File.exist?(@output)
|
12
|
+
|
15
13
|
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
"pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
|
15
|
+
else
|
16
|
+
"pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
|
19
17
|
end
|
20
18
|
result = `#{cmd}`.chomp
|
21
|
-
FileUtils.rm('doc_data.txt') if File.
|
22
|
-
raise ExtractionFailed, result if
|
19
|
+
FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
|
20
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
23
21
|
result
|
24
22
|
end
|
25
23
|
end
|
26
24
|
|
27
|
-
|
28
25
|
private
|
29
26
|
|
30
27
|
def extract_options(options)
|
31
28
|
@output = options[:output] || '.'
|
32
29
|
end
|
33
|
-
|
34
30
|
end
|
35
|
-
|
36
31
|
end
|
@@ -6,22 +6,24 @@ module Docsplit
|
|
6
6
|
@@version_string = nil
|
7
7
|
|
8
8
|
# Provide a set of helper functions to determine the OS.
|
9
|
-
HOST_OS = (defined?(
|
9
|
+
HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
|
10
10
|
def windows?
|
11
11
|
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
12
12
|
end
|
13
|
+
|
13
14
|
def osx?
|
14
15
|
!!HOST_OS.match(/darwin/i)
|
15
16
|
end
|
17
|
+
|
16
18
|
def linux?
|
17
19
|
!!HOST_OS.match(/linux/i)
|
18
20
|
end
|
19
|
-
|
21
|
+
|
20
22
|
# The first line of the help output holds the name and version number
|
21
23
|
# of the office software to be used for extraction.
|
22
24
|
def version_string
|
23
25
|
unless @@version_string
|
24
|
-
null = windows? ?
|
26
|
+
null = windows? ? 'NUL' : '/dev/null'
|
25
27
|
@@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
|
26
28
|
if !!@@version_string.to_s.match(/[0-9]*/)
|
27
29
|
@@version_string = `#{office_executable} --version`.split("\n").first
|
@@ -29,23 +31,25 @@ module Docsplit
|
|
29
31
|
end
|
30
32
|
@@version_string
|
31
33
|
end
|
34
|
+
|
32
35
|
def libre_office?
|
33
36
|
!!version_string.match(/^LibreOffice/)
|
34
37
|
end
|
38
|
+
|
35
39
|
def open_office?
|
36
40
|
!!version_string.match(/^OpenOffice.org/)
|
37
41
|
end
|
38
|
-
|
42
|
+
|
39
43
|
# A set of default locations to search for office software
|
40
44
|
# These have been extracted from JODConverter. Each listed
|
41
|
-
# path should contain a directory "program" which in turn
|
45
|
+
# path should contain a directory "program" which in turn
|
42
46
|
# contains the "soffice" executable.
|
43
47
|
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
|
44
48
|
def office_search_paths
|
45
49
|
if windows?
|
46
|
-
office_names = [
|
47
|
-
program_files_path = ENV[
|
48
|
-
search_paths = office_names.map{ |program| File.join(program_files_path, program) }
|
50
|
+
office_names = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
|
51
|
+
program_files_path = ENV['CommonProgramFiles']
|
52
|
+
search_paths = office_names.map { |program| File.join(program_files_path, program) }
|
49
53
|
elsif osx?
|
50
54
|
search_paths = %w(
|
51
55
|
/Applications/LibreOffice.app/Contents
|
@@ -69,7 +73,7 @@ module Docsplit
|
|
69
73
|
end
|
70
74
|
search_paths
|
71
75
|
end
|
72
|
-
|
76
|
+
|
73
77
|
# Identify the path to a working office executable.
|
74
78
|
def office_executable
|
75
79
|
paths = office_search_paths
|
@@ -78,45 +82,45 @@ module Docsplit
|
|
78
82
|
# raise an error if that path isn't valid, otherwise, add
|
79
83
|
# it to the front of our search paths.
|
80
84
|
if ENV['OFFICE_PATH']
|
81
|
-
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.
|
85
|
+
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
|
82
86
|
paths.unshift(ENV['OFFICE_PATH'])
|
83
87
|
end
|
84
|
-
|
88
|
+
|
85
89
|
# The location of the office executable is OS dependent
|
86
|
-
path_pieces = [
|
90
|
+
path_pieces = ['soffice']
|
87
91
|
if windows?
|
88
|
-
path_pieces += [[
|
92
|
+
path_pieces += [['program', 'soffice.bin']]
|
89
93
|
elsif osx?
|
90
|
-
path_pieces += [
|
94
|
+
path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
|
91
95
|
else
|
92
|
-
path_pieces += [
|
96
|
+
path_pieces += [%w(program soffice)]
|
93
97
|
end
|
94
|
-
|
98
|
+
|
95
99
|
# Search for the first suitable office executable
|
96
100
|
# and short circuit once an executable is found.
|
97
101
|
paths.each do |path|
|
98
|
-
if File.
|
102
|
+
if File.exist? path
|
99
103
|
@@executable ||= path unless File.directory? path
|
100
104
|
path_pieces.each do |pieces|
|
101
105
|
check_path = File.join(path, pieces)
|
102
|
-
@@executable ||= check_path if File.
|
106
|
+
@@executable ||= check_path if File.exist? check_path
|
103
107
|
end
|
104
108
|
end
|
105
109
|
break if @@executable
|
106
110
|
end
|
107
|
-
raise OfficeNotFound,
|
111
|
+
raise OfficeNotFound, 'No office software found' unless @@executable
|
108
112
|
@@executable
|
109
113
|
end
|
110
|
-
|
114
|
+
|
111
115
|
# Used to specify the office location for JODConverter
|
112
116
|
def office_path
|
113
117
|
File.dirname(File.dirname(office_executable))
|
114
118
|
end
|
115
|
-
|
119
|
+
|
116
120
|
# Convert documents to PDF.
|
117
121
|
def extract(docs, opts)
|
118
122
|
out = opts[:output] || '.'
|
119
|
-
FileUtils.mkdir_p out unless File.
|
123
|
+
FileUtils.mkdir_p out unless File.exist?(out)
|
120
124
|
[docs].flatten.each do |doc|
|
121
125
|
ext = File.extname(doc)
|
122
126
|
basename = File.basename(doc, ext)
|
@@ -127,12 +131,12 @@ module Docsplit
|
|
127
131
|
else
|
128
132
|
if libre_office?
|
129
133
|
# Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
|
130
|
-
ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
|
131
|
-
|
134
|
+
ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
|
135
|
+
|
132
136
|
options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
|
133
137
|
cmd = "#{office_executable} #{options} 2>&1"
|
134
138
|
result = `#{cmd}`.chomp
|
135
|
-
raise ExtractionFailed, result if
|
139
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
136
140
|
true
|
137
141
|
else # open office presumably, rely on JODConverter to figure it out.
|
138
142
|
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
@@ -142,23 +146,22 @@ module Docsplit
|
|
142
146
|
end
|
143
147
|
end
|
144
148
|
|
145
|
-
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
149
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
|
146
150
|
|
147
|
-
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
151
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
|
152
|
+
|
153
|
+
HEADLESS = '-Djava.awt.headless=true'.freeze
|
148
154
|
|
149
|
-
HEADLESS = "-Djava.awt.headless=true"
|
150
|
-
|
151
155
|
private
|
152
|
-
|
153
|
-
# Runs a Java command, with quieted logging, and the classpath set properly.
|
154
|
-
def run_jod(command, pdfs, opts, return_output=false)
|
155
156
|
|
156
|
-
|
157
|
+
# Runs a Java command, with quieted logging, and the classpath set properly.
|
158
|
+
def run_jod(command, pdfs, _opts, return_output = false)
|
159
|
+
pdfs = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
|
157
160
|
office = osx? ? "-Doffice.home=#{office_path}" : office_path
|
158
161
|
cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
159
162
|
result = `#{cmd}`.chomp
|
160
|
-
raise ExtractionFailed, result if
|
161
|
-
|
163
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
164
|
+
return_output ? (result.empty? ? nil : result) : true
|
162
165
|
end
|
163
166
|
|
164
167
|
class OfficeNotFound < StandardError; end
|
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'strscan'
|
2
2
|
|
3
3
|
module Docsplit
|
4
|
-
|
5
4
|
# Cleans up OCR'd text by using a series of heuristics to remove garbage
|
6
5
|
# words. Algorithms taken from:
|
7
6
|
#
|
@@ -13,7 +12,6 @@ module Docsplit
|
|
13
12
|
# -- Kulp
|
14
13
|
#
|
15
14
|
class TextCleaner
|
16
|
-
|
17
15
|
# Cached regexes we plan on using.
|
18
16
|
WORD = /\S+/
|
19
17
|
SPACE = /\s+/
|
@@ -36,7 +34,7 @@ module Docsplit
|
|
36
34
|
# multibyte-aware version, coercing to ASCII first.
|
37
35
|
def clean(text)
|
38
36
|
if String.method_defined?(:encode)
|
39
|
-
text.encode!('ascii', :
|
37
|
+
text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
|
40
38
|
else
|
41
39
|
require 'iconv' unless defined?(Iconv)
|
42
40
|
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
@@ -67,33 +65,31 @@ module Docsplit
|
|
67
65
|
# More than 30 bytes in length.
|
68
66
|
(w.length > 30) ||
|
69
67
|
|
70
|
-
|
71
|
-
|
68
|
+
# If there are three or more identical characters in a row in the string.
|
69
|
+
(w =~ REPEAT) ||
|
72
70
|
|
73
|
-
|
74
|
-
|
71
|
+
# More punctuation than alpha numerics.
|
72
|
+
(!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
|
75
73
|
|
76
|
-
|
77
|
-
|
78
|
-
|
74
|
+
# Ignoring the first and last characters in the string, if there are three or
|
75
|
+
# more different punctuation characters in the string.
|
76
|
+
(w[1...-1].scan(PUNCT).uniq.length >= 3) ||
|
79
77
|
|
80
|
-
|
81
|
-
|
78
|
+
# Four or more consecutive vowels, or five or more consecutive consonants.
|
79
|
+
((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
|
82
80
|
|
83
|
-
|
84
|
-
|
85
|
-
|
81
|
+
# Number of uppercase letters greater than lowercase letters, but the word is
|
82
|
+
# not all uppercase + punctuation.
|
83
|
+
(!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
|
86
84
|
|
87
|
-
|
88
|
-
|
85
|
+
# Single letters that are not A or I.
|
86
|
+
(w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
|
89
87
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
88
|
+
# All characters are alphabetic and there are 8 times more vowels than
|
89
|
+
# consonants, or 8 times more consonants than vowels.
|
90
|
+
(!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
|
91
|
+
(((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
|
92
|
+
(cons > vows * 8)))
|
95
93
|
end
|
96
|
-
|
97
94
|
end
|
98
|
-
|
99
95
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdftotext** and **tesseract** in order to extract text from
|
4
3
|
# PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
|
5
4
|
# forbid OCR extraction, but by default the heuristic works like this:
|
@@ -13,11 +12,10 @@ module Docsplit
|
|
13
12
|
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
|
14
13
|
#
|
15
14
|
class TextExtractor
|
16
|
-
|
17
15
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
16
|
|
19
|
-
OCR_FLAGS = '-density 400x400 -colorspace GRAY'
|
20
|
-
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
17
|
+
OCR_FLAGS = '-density 400x400 -colorspace GRAY'.freeze
|
18
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
|
21
19
|
|
22
20
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
23
21
|
|
@@ -28,10 +26,10 @@ module Docsplit
|
|
28
26
|
# Extract text from a list of PDFs.
|
29
27
|
def extract(pdfs, opts)
|
30
28
|
extract_options opts
|
31
|
-
FileUtils.mkdir_p @output unless File.
|
29
|
+
FileUtils.mkdir_p @output unless File.exist?(@output)
|
32
30
|
[pdfs].flatten.each do |pdf|
|
33
31
|
@pdf_name = File.basename(pdf, File.extname(pdf))
|
34
|
-
pages =
|
32
|
+
pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
|
35
33
|
if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
|
36
34
|
extract_from_ocr(pdf, pages)
|
37
35
|
else
|
@@ -52,7 +50,7 @@ module Docsplit
|
|
52
50
|
# Extract a page range worth of text from a PDF, directly.
|
53
51
|
def extract_from_pdf(pdf, pages)
|
54
52
|
return extract_full(pdf) unless pages
|
55
|
-
pages.each {|page| extract_page(pdf, page) }
|
53
|
+
pages.each { |page| extract_page(pdf, page) }
|
56
54
|
end
|
57
55
|
|
58
56
|
# Extract a page range worth of text from a PDF via OCR.
|
@@ -60,7 +58,7 @@ module Docsplit
|
|
60
58
|
tempdir = Dir.mktmpdir
|
61
59
|
base_path = File.join(@output, @pdf_name)
|
62
60
|
escaped_pdf = ESCAPE[pdf]
|
63
|
-
psm = @detect_orientation ?
|
61
|
+
psm = @detect_orientation ? '-psm 1' : ''
|
64
62
|
if pages
|
65
63
|
pages.each do |page|
|
66
64
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
@@ -75,15 +73,14 @@ module Docsplit
|
|
75
73
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
76
74
|
escaped_tiff = ESCAPE[tiff]
|
77
75
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
78
|
-
#if the user says don't do orientation detection or the plugin is not installed, set psm to 0
|
76
|
+
# If the user disables orientation detection, or the osd plugin is not installed, psm is empty and no -psm flag is passed.
|
79
77
|
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
|
80
78
|
clean_text(base_path + '.txt') if @clean_ocr
|
81
79
|
end
|
82
80
|
ensure
|
83
|
-
FileUtils.remove_entry_secure tempdir if File.
|
81
|
+
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
|
84
82
|
end
|
85
83
|
|
86
|
-
|
87
84
|
private
|
88
85
|
|
89
86
|
def clean_text(file)
|
@@ -98,7 +95,7 @@ module Docsplit
|
|
98
95
|
# Run an external process and raise an exception if it fails.
|
99
96
|
def run(command)
|
100
97
|
result = `#{command}`
|
101
|
-
raise ExtractionFailed, result if
|
98
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
102
99
|
result
|
103
100
|
end
|
104
101
|
|
@@ -124,10 +121,8 @@ module Docsplit
|
|
124
121
|
@force_ocr = options[:ocr] == true
|
125
122
|
@forbid_ocr = options[:ocr] == false
|
126
123
|
@language = options[:language] || 'eng'
|
127
|
-
@clean_ocr = (!(options[:clean] == false)
|
128
|
-
@detect_orientation = ((options[:detect_orientation] != false)
|
124
|
+
@clean_ocr = (!(options[:clean] == false) && @language == 'eng')
|
125
|
+
@detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
|
129
126
|
end
|
130
|
-
|
131
127
|
end
|
132
|
-
|
133
128
|
end
|
@@ -1,9 +1,7 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Include a method to transparently convert non-PDF arguments to temporary
|
4
3
|
# PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
|
5
4
|
module TransparentPDFs
|
6
|
-
|
7
5
|
# Temporarily convert any non-PDF documents to PDFs before running them
|
8
6
|
# through further extraction.
|
9
7
|
def ensure_pdfs(docs)
|
@@ -12,18 +10,16 @@ module Docsplit
|
|
12
10
|
doc
|
13
11
|
else
|
14
12
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
15
|
-
extract_pdf([doc],
|
13
|
+
extract_pdf([doc], output: tempdir)
|
16
14
|
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
17
15
|
end
|
18
16
|
end
|
19
17
|
end
|
20
18
|
|
21
19
|
def is_pdf?(doc)
|
22
|
-
File.extname(doc).
|
20
|
+
File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
|
27
24
|
extend TransparentPDFs
|
28
|
-
|
29
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: burisu-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2016-09-06 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: |2
|
16
16
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- lib/docsplit/text_cleaner.rb
|
37
37
|
- lib/docsplit/text_extractor.rb
|
38
38
|
- lib/docsplit/transparent_pdfs.rb
|
39
|
+
- lib/docsplit/version.rb
|
39
40
|
- vendor/conf/document-formats.js
|
40
41
|
- vendor/jodconverter/commons-cli-1.1.jar
|
41
42
|
- vendor/jodconverter/commons-io-1.4.jar
|
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
67
|
version: '0'
|
67
68
|
requirements: []
|
68
69
|
rubyforge_project:
|
69
|
-
rubygems_version: 2.4.5
|
70
|
+
rubygems_version: 2.4.5.1
|
70
71
|
signing_key:
|
71
72
|
specification_version: 4
|
72
73
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
73
74
|
test_files: []
|
74
|
-
has_rdoc:
|