burisu-docsplit 0.7.8 → 0.7.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/docsplit +1 -1
- data/docsplit.gemspec +7 -3
- data/lib/docsplit.rb +15 -18
- data/lib/docsplit/command_line.rb +20 -27
- data/lib/docsplit/image_extractor.rb +18 -23
- data/lib/docsplit/info_extractor.rb +14 -18
- data/lib/docsplit/page_extractor.rb +8 -13
- data/lib/docsplit/pdf_extractor.rb +38 -35
- data/lib/docsplit/text_cleaner.rb +20 -24
- data/lib/docsplit/text_extractor.rb +11 -16
- data/lib/docsplit/transparent_pdfs.rb +2 -6
- data/lib/docsplit/version.rb +3 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
|
4
|
+
data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
|
7
|
+
data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644
|
data/bin/docsplit
CHANGED
data/docsplit.gemspec
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
require 'docsplit/version'
|
4
|
+
|
1
5
|
Gem::Specification.new do |s|
|
2
6
|
s.name = 'burisu-docsplit'
|
3
|
-
s.version =
|
4
|
-
s.homepage =
|
5
|
-
s.summary =
|
7
|
+
s.version = Docsplit::VERSION # Keep version in sync with docsplit.rb
|
8
|
+
s.homepage = 'http://documentcloud.github.com/docsplit/'
|
9
|
+
s.summary = 'Break Apart Documents into Images, Text, Pages and PDFs'
|
6
10
|
s.description = <<-EOS
|
7
11
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
8
12
|
documents into their component parts: searchable UTF-8 plain text, page
|
data/lib/docsplit.rb
CHANGED
@@ -1,22 +1,20 @@
|
|
1
1
|
require 'tmpdir'
|
2
2
|
require 'fileutils'
|
3
3
|
require 'shellwords'
|
4
|
+
require 'docsplit/version'
|
4
5
|
|
5
6
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
7
|
module Docsplit
|
7
|
-
|
8
|
-
VERSION = '0.7.6' # Keep in sync with gemspec.
|
9
|
-
|
10
|
-
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
8
|
+
ESCAPE = ->(x) { Shellwords.shellescape(x) }
|
11
9
|
|
12
10
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
11
|
ESCAPED_ROOT = ESCAPE[ROOT]
|
14
12
|
|
15
|
-
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
16
|
-
|
17
|
-
GM_FORMATS = [
|
13
|
+
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
|
14
|
+
|
15
|
+
GM_FORMATS = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
|
18
16
|
|
19
|
-
DEPENDENCIES = {:
|
17
|
+
DEPENDENCIES = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
|
20
18
|
|
21
19
|
# Check for all dependencies, and note their absence.
|
22
20
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -32,28 +30,28 @@ module Docsplit
|
|
32
30
|
# if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
|
33
31
|
if DEPENDENCIES[:tesseract]
|
34
32
|
# osd will be listed in the output of `tesseract --list-langs`
|
35
|
-
val =
|
33
|
+
val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
|
36
34
|
DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
|
37
35
|
end
|
38
36
|
|
39
|
-
|
37
|
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
40
38
|
# broken.
|
41
39
|
class ExtractionFailed < StandardError; end
|
42
40
|
|
43
41
|
# Use the ExtractPages Java class to burst a PDF into single pages.
|
44
|
-
def self.extract_pages(pdfs, opts={})
|
42
|
+
def self.extract_pages(pdfs, opts = {})
|
45
43
|
pdfs = ensure_pdfs(pdfs)
|
46
44
|
PageExtractor.new.extract(pdfs, opts)
|
47
45
|
end
|
48
46
|
|
49
47
|
# Use the ExtractText Java class to write out all embedded text.
|
50
|
-
def self.extract_text(pdfs, opts={})
|
48
|
+
def self.extract_text(pdfs, opts = {})
|
51
49
|
pdfs = ensure_pdfs(pdfs)
|
52
50
|
TextExtractor.new.extract(pdfs, opts)
|
53
51
|
end
|
54
52
|
|
55
53
|
# Use the ExtractImages Java class to rasterize a PDF into each page's image.
|
56
|
-
def self.extract_images(pdfs, opts={})
|
54
|
+
def self.extract_images(pdfs, opts = {})
|
57
55
|
pdfs = ensure_pdfs(pdfs)
|
58
56
|
opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
|
59
57
|
ImageExtractor.new.extract(pdfs, opts)
|
@@ -61,7 +59,7 @@ module Docsplit
|
|
61
59
|
|
62
60
|
# Use JODConverter to extract the documents as PDFs.
|
63
61
|
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
64
|
-
def self.extract_pdf(docs, opts={})
|
62
|
+
def self.extract_pdf(docs, opts = {})
|
65
63
|
PdfExtractor.new.extract(docs, opts)
|
66
64
|
end
|
67
65
|
|
@@ -75,8 +73,8 @@ module Docsplit
|
|
75
73
|
end
|
76
74
|
EOS
|
77
75
|
end
|
78
|
-
|
79
|
-
def self.extract_info(pdfs, opts={})
|
76
|
+
|
77
|
+
def self.extract_info(pdfs, opts = {})
|
80
78
|
pdfs = ensure_pdfs(pdfs)
|
81
79
|
InfoExtractor.new.extract_all(pdfs, opts)
|
82
80
|
end
|
@@ -93,11 +91,10 @@ module Docsplit
|
|
93
91
|
def self.normalize_value(value)
|
94
92
|
case value
|
95
93
|
when Range then value.to_a.join(',')
|
96
|
-
when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
|
94
|
+
when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
|
97
95
|
else value.to_s
|
98
96
|
end
|
99
97
|
end
|
100
|
-
|
101
98
|
end
|
102
99
|
|
103
100
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
@@ -2,11 +2,9 @@ require 'optparse'
|
|
2
2
|
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
3
|
|
4
4
|
module Docsplit
|
5
|
-
|
6
5
|
# A single command-line utility to separate a PDF into all its component parts.
|
7
6
|
class CommandLine
|
8
|
-
|
9
|
-
BANNER = <<-EOS
|
7
|
+
BANNER = <<-EOS.freeze
|
10
8
|
docsplit breaks apart documents into images, text, or individual pages.
|
11
9
|
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
|
12
10
|
|
@@ -39,24 +37,22 @@ Options:
|
|
39
37
|
|
40
38
|
# Delegate to the Docsplit Ruby API to perform all extractions.
|
41
39
|
def run
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
40
|
+
case @command
|
41
|
+
when :images then Docsplit.extract_images(ARGV, @options)
|
42
|
+
when :pages then Docsplit.extract_pages(ARGV, @options)
|
43
|
+
when :text then Docsplit.extract_text(ARGV, @options)
|
44
|
+
when :pdf then Docsplit.extract_pdf(ARGV, @options)
|
45
|
+
else
|
46
|
+
if METADATA_KEYS.include?(@command)
|
47
|
+
value = Docsplit.send("extract_#{@command}", ARGV, @options)
|
48
|
+
puts value unless value.nil?
|
48
49
|
else
|
49
|
-
|
50
|
-
value = Docsplit.send("extract_#{@command}", ARGV, @options)
|
51
|
-
puts value unless value.nil?
|
52
|
-
else
|
53
|
-
usage
|
54
|
-
end
|
50
|
+
usage
|
55
51
|
end
|
56
|
-
rescue ExtractionFailed => e
|
57
|
-
puts e.message.chomp
|
58
|
-
exit(1)
|
59
52
|
end
|
53
|
+
rescue ExtractionFailed => e
|
54
|
+
puts e.message.chomp
|
55
|
+
exit(1)
|
60
56
|
end
|
61
57
|
|
62
58
|
# Print out the usage help message.
|
@@ -65,18 +61,17 @@ Options:
|
|
65
61
|
exit
|
66
62
|
end
|
67
63
|
|
68
|
-
|
69
64
|
private
|
70
65
|
|
71
66
|
# Use the OptionParser library to parse out all supported options. Return
|
72
67
|
# options formatted for the Ruby API.
|
73
68
|
def parse_options
|
74
|
-
@options = {:
|
69
|
+
@options = { ocr: :default, clean: true }
|
75
70
|
@option_parser = OptionParser.new do |opts|
|
76
71
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
77
72
|
@options[:output] = d
|
78
73
|
end
|
79
|
-
opts.on('-p', '--pages [PAGES]',
|
74
|
+
opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
|
80
75
|
@options[:pages] = p
|
81
76
|
end
|
82
77
|
opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
|
@@ -91,16 +86,16 @@ Options:
|
|
91
86
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
92
87
|
@options[:ocr] = o
|
93
88
|
end
|
94
|
-
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |
|
89
|
+
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
|
95
90
|
@options[:clean] = false
|
96
91
|
end
|
97
92
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
93
|
@options[:language] = l
|
99
94
|
end
|
100
|
-
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |
|
95
|
+
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
|
101
96
|
@options[:detect_orientation] = false
|
102
97
|
end
|
103
|
-
opts.on('-r', '--rolling', 'generate images from each previous image') do |
|
98
|
+
opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
|
104
99
|
@options[:rolling] = true
|
105
100
|
end
|
106
101
|
opts.on_tail('-v', '--version', 'display docsplit version') do
|
@@ -119,7 +114,5 @@ Options:
|
|
119
114
|
exit(1)
|
120
115
|
end
|
121
116
|
end
|
122
|
-
|
123
117
|
end
|
124
|
-
|
125
|
-
end
|
118
|
+
end
|
@@ -1,12 +1,10 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to GraphicsMagick in order to convert PDF documents into
|
4
3
|
# nicely sized images.
|
5
4
|
class ImageExtractor
|
6
|
-
|
7
|
-
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
5
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
|
8
6
|
DEFAULT_FORMAT = :png
|
9
|
-
DEFAULT_DENSITY = '150'
|
7
|
+
DEFAULT_DENSITY = '150'.freeze
|
10
8
|
|
11
9
|
# Extract a list of PDFs as rasterized page images, according to the
|
12
10
|
# configuration in options.
|
@@ -15,8 +13,8 @@ module Docsplit
|
|
15
13
|
extract_options(options)
|
16
14
|
@pdfs.each do |pdf|
|
17
15
|
previous = nil
|
18
|
-
@sizes.each_with_index do |size,
|
19
|
-
@formats.each {|format| convert(pdf, size, format, previous) }
|
16
|
+
@sizes.each_with_index do |size, _i|
|
17
|
+
@formats.each { |format| convert(pdf, size, format, previous) }
|
20
18
|
previous = size if @rolling
|
21
19
|
end
|
22
20
|
end
|
@@ -27,36 +25,35 @@ module Docsplit
|
|
27
25
|
# we simply downsample that image, instead of re-rendering the entire PDF.
|
28
26
|
# Now we generate one page at a time, a counterintuitive optimization
|
29
27
|
# suggested by the GraphicsMagick list, that seems to work quite well.
|
30
|
-
def convert(pdf, size, format, previous=nil)
|
28
|
+
def convert(pdf, size, format, previous = nil)
|
31
29
|
tempdir = Dir.mktmpdir
|
32
30
|
basename = File.basename(pdf, File.extname(pdf))
|
33
31
|
directory = directory_for(size)
|
34
32
|
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
35
33
|
escaped_pdf = ESCAPE[pdf]
|
36
|
-
FileUtils.mkdir_p(directory) unless File.
|
37
|
-
common
|
34
|
+
FileUtils.mkdir_p(directory) unless File.exist?(directory)
|
35
|
+
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
|
38
36
|
if previous
|
39
37
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
40
38
|
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
41
|
-
raise ExtractionFailed, result if
|
39
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
42
40
|
else
|
43
41
|
page_list(pages).each do |page|
|
44
|
-
out_file
|
42
|
+
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
43
|
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
46
44
|
result = `#{cmd}`.chomp
|
47
|
-
raise ExtractionFailed, result if
|
45
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
48
46
|
end
|
49
47
|
end
|
50
48
|
ensure
|
51
|
-
FileUtils.remove_entry_secure tempdir if File.
|
49
|
+
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
|
52
50
|
end
|
53
51
|
|
54
|
-
|
55
52
|
private
|
56
53
|
|
57
54
|
# Extract the relevant GraphicsMagick options from the options hash.
|
58
55
|
def extract_options(options)
|
59
|
-
@output = options[:output]
|
56
|
+
@output = options[:output] || '.'
|
60
57
|
@pages = options[:pages]
|
61
58
|
@density = options[:density] || DEFAULT_DENSITY
|
62
59
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
@@ -80,24 +77,22 @@ module Docsplit
|
|
80
77
|
# Generate the appropriate quality argument for the image format.
|
81
78
|
def quality_arg(format)
|
82
79
|
case format.to_s
|
83
|
-
when /jpe?g/ then
|
84
|
-
when /png/ then
|
85
|
-
else
|
80
|
+
when /jpe?g/ then '-quality 85'
|
81
|
+
when /png/ then '-quality 100'
|
82
|
+
else ''
|
86
83
|
end
|
87
84
|
end
|
88
85
|
|
89
86
|
# Generate the expanded list of requested page numbers.
|
90
87
|
def page_list(pages)
|
91
|
-
pages.split(',').map
|
88
|
+
pages.split(',').map do |range|
|
92
89
|
if range.include?('-')
|
93
90
|
range = range.split('-')
|
94
|
-
Range.new(range.first.to_i, range.last.to_i).to_a.map
|
91
|
+
Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
|
95
92
|
else
|
96
93
|
range.to_i
|
97
94
|
end
|
98
|
-
|
95
|
+
end.flatten.uniq.sort
|
99
96
|
end
|
100
|
-
|
101
97
|
end
|
102
|
-
|
103
98
|
end
|
@@ -1,36 +1,34 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdfinfo** in order to extract information about a PDF file.
|
4
3
|
class InfoExtractor
|
5
|
-
|
6
4
|
# Regex matchers for different bits of information.
|
7
5
|
MATCHERS = {
|
8
|
-
:
|
9
|
-
:
|
10
|
-
:
|
11
|
-
:
|
12
|
-
:
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:
|
16
|
-
}
|
6
|
+
author: /^Author:\s+([^\n]+)/,
|
7
|
+
date: /^CreationDate:\s+([^\n]+)/,
|
8
|
+
creator: /^Creator:\s+([^\n]+)/,
|
9
|
+
keywords: /^Keywords:\s+([^\n]+)/,
|
10
|
+
producer: /^Producer:\s+([^\n]+)/,
|
11
|
+
subject: /^Subject:\s+([^\n]+)/,
|
12
|
+
title: /^Title:\s+([^\n]+)/,
|
13
|
+
length: /^Pages:\s+([^\n]+)/
|
14
|
+
}.freeze
|
17
15
|
|
18
16
|
# Pull out a single datum from a pdf.
|
19
17
|
def extract(key, pdfs, opts)
|
20
18
|
extract_all(pdfs, opts)[key]
|
21
19
|
end
|
22
|
-
|
23
|
-
def extract_all(pdfs,
|
20
|
+
|
21
|
+
def extract_all(pdfs, _opts)
|
24
22
|
pdf = [pdfs].flatten.first
|
25
23
|
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
26
24
|
result = `#{cmd}`.chomp
|
27
|
-
raise ExtractionFailed, result if
|
25
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
28
26
|
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
27
|
if String.method_defined?(:encode)
|
30
|
-
result.encode!('UTF-8', 'binary', :
|
28
|
+
result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
|
31
29
|
else
|
32
30
|
require 'iconv' unless defined?(Iconv)
|
33
|
-
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
31
|
+
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
34
32
|
result = ic.iconv(result)
|
35
33
|
end
|
36
34
|
info = {}
|
@@ -44,7 +42,5 @@ module Docsplit
|
|
44
42
|
end
|
45
43
|
info
|
46
44
|
end
|
47
|
-
|
48
45
|
end
|
49
|
-
|
50
46
|
end
|
@@ -1,36 +1,31 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdftk** in order to create bursted single pages from
|
4
3
|
# a PDF document.
|
5
4
|
class PageExtractor
|
6
|
-
|
7
5
|
# Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
|
8
6
|
def extract(pdfs, opts)
|
9
7
|
extract_options opts
|
10
8
|
[pdfs].flatten.each do |pdf|
|
11
9
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
-
page_path = ESCAPE[File.join(@output,
|
13
|
-
FileUtils.mkdir_p @output unless File.
|
14
|
-
|
10
|
+
page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
|
11
|
+
FileUtils.mkdir_p @output unless File.exist?(@output)
|
12
|
+
|
15
13
|
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
"pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
|
15
|
+
else
|
16
|
+
"pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
|
19
17
|
end
|
20
18
|
result = `#{cmd}`.chomp
|
21
|
-
FileUtils.rm('doc_data.txt') if File.
|
22
|
-
raise ExtractionFailed, result if
|
19
|
+
FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
|
20
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
23
21
|
result
|
24
22
|
end
|
25
23
|
end
|
26
24
|
|
27
|
-
|
28
25
|
private
|
29
26
|
|
30
27
|
def extract_options(options)
|
31
28
|
@output = options[:output] || '.'
|
32
29
|
end
|
33
|
-
|
34
30
|
end
|
35
|
-
|
36
31
|
end
|
@@ -6,22 +6,24 @@ module Docsplit
|
|
6
6
|
@@version_string = nil
|
7
7
|
|
8
8
|
# Provide a set of helper functions to determine the OS.
|
9
|
-
HOST_OS = (defined?(
|
9
|
+
HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
|
10
10
|
def windows?
|
11
11
|
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
12
12
|
end
|
13
|
+
|
13
14
|
def osx?
|
14
15
|
!!HOST_OS.match(/darwin/i)
|
15
16
|
end
|
17
|
+
|
16
18
|
def linux?
|
17
19
|
!!HOST_OS.match(/linux/i)
|
18
20
|
end
|
19
|
-
|
21
|
+
|
20
22
|
# The first line of the help output holds the name and version number
|
21
23
|
# of the office software to be used for extraction.
|
22
24
|
def version_string
|
23
25
|
unless @@version_string
|
24
|
-
null = windows? ?
|
26
|
+
null = windows? ? 'NUL' : '/dev/null'
|
25
27
|
@@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
|
26
28
|
if !!@@version_string.to_s.match(/[0-9]*/)
|
27
29
|
@@version_string = `#{office_executable} --version`.split("\n").first
|
@@ -29,23 +31,25 @@ module Docsplit
|
|
29
31
|
end
|
30
32
|
@@version_string
|
31
33
|
end
|
34
|
+
|
32
35
|
def libre_office?
|
33
36
|
!!version_string.match(/^LibreOffice/)
|
34
37
|
end
|
38
|
+
|
35
39
|
def open_office?
|
36
40
|
!!version_string.match(/^OpenOffice.org/)
|
37
41
|
end
|
38
|
-
|
42
|
+
|
39
43
|
# A set of default locations to search for office software
|
40
44
|
# These have been extracted from JODConverter. Each listed
|
41
|
-
# path should contain a directory "program" which in turn
|
45
|
+
# path should contain a directory "program" which in turn
|
42
46
|
# contains the "soffice" executable.
|
43
47
|
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
|
44
48
|
def office_search_paths
|
45
49
|
if windows?
|
46
|
-
office_names = [
|
47
|
-
program_files_path = ENV[
|
48
|
-
search_paths = office_names.map{ |program| File.join(program_files_path, program) }
|
50
|
+
office_names = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
|
51
|
+
program_files_path = ENV['CommonProgramFiles']
|
52
|
+
search_paths = office_names.map { |program| File.join(program_files_path, program) }
|
49
53
|
elsif osx?
|
50
54
|
search_paths = %w(
|
51
55
|
/Applications/LibreOffice.app/Contents
|
@@ -69,7 +73,7 @@ module Docsplit
|
|
69
73
|
end
|
70
74
|
search_paths
|
71
75
|
end
|
72
|
-
|
76
|
+
|
73
77
|
# Identify the path to a working office executable.
|
74
78
|
def office_executable
|
75
79
|
paths = office_search_paths
|
@@ -78,45 +82,45 @@ module Docsplit
|
|
78
82
|
# raise an error if that path isn't valid, otherwise, add
|
79
83
|
# it to the front of our search paths.
|
80
84
|
if ENV['OFFICE_PATH']
|
81
|
-
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.
|
85
|
+
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
|
82
86
|
paths.unshift(ENV['OFFICE_PATH'])
|
83
87
|
end
|
84
|
-
|
88
|
+
|
85
89
|
# The location of the office executable is OS dependent
|
86
|
-
path_pieces = [
|
90
|
+
path_pieces = ['soffice']
|
87
91
|
if windows?
|
88
|
-
path_pieces += [[
|
92
|
+
path_pieces += [['program', 'soffice.bin']]
|
89
93
|
elsif osx?
|
90
|
-
path_pieces += [
|
94
|
+
path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
|
91
95
|
else
|
92
|
-
path_pieces += [
|
96
|
+
path_pieces += [%w(program soffice)]
|
93
97
|
end
|
94
|
-
|
98
|
+
|
95
99
|
# Search for the first suitable office executable
|
96
100
|
# and short-circuit as soon as an executable is found.
|
97
101
|
paths.each do |path|
|
98
|
-
if File.
|
102
|
+
if File.exist? path
|
99
103
|
@@executable ||= path unless File.directory? path
|
100
104
|
path_pieces.each do |pieces|
|
101
105
|
check_path = File.join(path, pieces)
|
102
|
-
@@executable ||= check_path if File.
|
106
|
+
@@executable ||= check_path if File.exist? check_path
|
103
107
|
end
|
104
108
|
end
|
105
109
|
break if @@executable
|
106
110
|
end
|
107
|
-
raise OfficeNotFound,
|
111
|
+
raise OfficeNotFound, 'No office software found' unless @@executable
|
108
112
|
@@executable
|
109
113
|
end
|
110
|
-
|
114
|
+
|
111
115
|
# Used to specify the office location for JODConverter
|
112
116
|
def office_path
|
113
117
|
File.dirname(File.dirname(office_executable))
|
114
118
|
end
|
115
|
-
|
119
|
+
|
116
120
|
# Convert documents to PDF.
|
117
121
|
def extract(docs, opts)
|
118
122
|
out = opts[:output] || '.'
|
119
|
-
FileUtils.mkdir_p out unless File.
|
123
|
+
FileUtils.mkdir_p out unless File.exist?(out)
|
120
124
|
[docs].flatten.each do |doc|
|
121
125
|
ext = File.extname(doc)
|
122
126
|
basename = File.basename(doc, ext)
|
@@ -127,12 +131,12 @@ module Docsplit
|
|
127
131
|
else
|
128
132
|
if libre_office?
|
129
133
|
# Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
|
130
|
-
ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
|
131
|
-
|
134
|
+
ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
|
135
|
+
|
132
136
|
options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
|
133
137
|
cmd = "#{office_executable} #{options} 2>&1"
|
134
138
|
result = `#{cmd}`.chomp
|
135
|
-
raise ExtractionFailed, result if
|
139
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
136
140
|
true
|
137
141
|
else # open office presumably, rely on JODConverter to figure it out.
|
138
142
|
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
@@ -142,23 +146,22 @@ module Docsplit
|
|
142
146
|
end
|
143
147
|
end
|
144
148
|
|
145
|
-
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
149
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
|
146
150
|
|
147
|
-
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
151
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
|
152
|
+
|
153
|
+
HEADLESS = '-Djava.awt.headless=true'.freeze
|
148
154
|
|
149
|
-
HEADLESS = "-Djava.awt.headless=true"
|
150
|
-
|
151
155
|
private
|
152
|
-
|
153
|
-
# Runs a Java command, with quieted logging, and the classpath set properly.
|
154
|
-
def run_jod(command, pdfs, opts, return_output=false)
|
155
156
|
|
156
|
-
|
157
|
+
# Runs a Java command, with quieted logging, and the classpath set properly.
|
158
|
+
def run_jod(command, pdfs, _opts, return_output = false)
|
159
|
+
pdfs = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
|
157
160
|
office = osx? ? "-Doffice.home=#{office_path}" : office_path
|
158
161
|
cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
159
162
|
result = `#{cmd}`.chomp
|
160
|
-
raise ExtractionFailed, result if
|
161
|
-
|
163
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
164
|
+
return_output ? (result.empty? ? nil : result) : true
|
162
165
|
end
|
163
166
|
|
164
167
|
class OfficeNotFound < StandardError; end
|
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'strscan'
|
2
2
|
|
3
3
|
module Docsplit
|
4
|
-
|
5
4
|
# Cleans up OCR'd text by using a series of heuristics to remove garbage
|
6
5
|
# words. Algorithms taken from:
|
7
6
|
#
|
@@ -13,7 +12,6 @@ module Docsplit
|
|
13
12
|
# -- Kulp
|
14
13
|
#
|
15
14
|
class TextCleaner
|
16
|
-
|
17
15
|
# Cached regexes we plan on using.
|
18
16
|
WORD = /\S+/
|
19
17
|
SPACE = /\s+/
|
@@ -36,7 +34,7 @@ module Docsplit
|
|
36
34
|
# multibyte-aware version, coercing to ASCII first.
|
37
35
|
def clean(text)
|
38
36
|
if String.method_defined?(:encode)
|
39
|
-
text.encode!('ascii', :
|
37
|
+
text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
|
40
38
|
else
|
41
39
|
require 'iconv' unless defined?(Iconv)
|
42
40
|
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
@@ -67,33 +65,31 @@ module Docsplit
|
|
67
65
|
# More than 30 bytes in length.
|
68
66
|
(w.length > 30) ||
|
69
67
|
|
70
|
-
|
71
|
-
|
68
|
+
# If there are three or more identical characters in a row in the string.
|
69
|
+
(w =~ REPEAT) ||
|
72
70
|
|
73
|
-
|
74
|
-
|
71
|
+
# More punctuation than alpha numerics.
|
72
|
+
(!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
|
75
73
|
|
76
|
-
|
77
|
-
|
78
|
-
|
74
|
+
# Ignoring the first and last characters in the string, if there are three or
|
75
|
+
# more different punctuation characters in the string.
|
76
|
+
(w[1...-1].scan(PUNCT).uniq.length >= 3) ||
|
79
77
|
|
80
|
-
|
81
|
-
|
78
|
+
# Four or more consecutive vowels, or five or more consecutive consonants.
|
79
|
+
((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
|
82
80
|
|
83
|
-
|
84
|
-
|
85
|
-
|
81
|
+
# Number of uppercase letters greater than lowercase letters, but the word is
|
82
|
+
# not all uppercase + punctuation.
|
83
|
+
(!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
|
86
84
|
|
87
|
-
|
88
|
-
|
85
|
+
# Single letters that are not A or I.
|
86
|
+
(w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
|
89
87
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
88
|
+
# All characters are alphabetic and there are 8 times more vowels than
|
89
|
+
# consonants, or 8 times more consonants than vowels.
|
90
|
+
(!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
|
91
|
+
(((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
|
92
|
+
(cons > vows * 8)))
|
95
93
|
end
|
96
|
-
|
97
94
|
end
|
98
|
-
|
99
95
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Delegates to **pdftotext** and **tesseract** in order to extract text from
|
4
3
|
# PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
|
5
4
|
# forbid OCR extraction, but by default the heuristic works like this:
|
@@ -13,11 +12,10 @@ module Docsplit
|
|
13
12
|
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
|
14
13
|
#
|
15
14
|
class TextExtractor
|
16
|
-
|
17
15
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
16
|
|
19
|
-
OCR_FLAGS = '-density 400x400 -colorspace GRAY'
|
20
|
-
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
17
|
+
OCR_FLAGS = '-density 400x400 -colorspace GRAY'.freeze
|
18
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
|
21
19
|
|
22
20
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
23
21
|
|
@@ -28,10 +26,10 @@ module Docsplit
|
|
28
26
|
# Extract text from a list of PDFs.
|
29
27
|
def extract(pdfs, opts)
|
30
28
|
extract_options opts
|
31
|
-
FileUtils.mkdir_p @output unless File.
|
29
|
+
FileUtils.mkdir_p @output unless File.exist?(@output)
|
32
30
|
[pdfs].flatten.each do |pdf|
|
33
31
|
@pdf_name = File.basename(pdf, File.extname(pdf))
|
34
|
-
pages =
|
32
|
+
pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
|
35
33
|
if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
|
36
34
|
extract_from_ocr(pdf, pages)
|
37
35
|
else
|
@@ -52,7 +50,7 @@ module Docsplit
|
|
52
50
|
# Extract a page range worth of text from a PDF, directly.
|
53
51
|
def extract_from_pdf(pdf, pages)
|
54
52
|
return extract_full(pdf) unless pages
|
55
|
-
pages.each {|page| extract_page(pdf, page) }
|
53
|
+
pages.each { |page| extract_page(pdf, page) }
|
56
54
|
end
|
57
55
|
|
58
56
|
# Extract a page range worth of text from a PDF via OCR.
|
@@ -60,7 +58,7 @@ module Docsplit
|
|
60
58
|
tempdir = Dir.mktmpdir
|
61
59
|
base_path = File.join(@output, @pdf_name)
|
62
60
|
escaped_pdf = ESCAPE[pdf]
|
63
|
-
psm = @detect_orientation ?
|
61
|
+
psm = @detect_orientation ? '-psm 1' : ''
|
64
62
|
if pages
|
65
63
|
pages.each do |page|
|
66
64
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
@@ -75,15 +73,14 @@ module Docsplit
|
|
75
73
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
76
74
|
escaped_tiff = ESCAPE[tiff]
|
77
75
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
78
|
-
#if the user says don't do orientation detection or the plugin is not installed, set psm to 0
|
76
|
+
# if the user disables orientation detection or the osd plugin is not installed, psm is empty and no -psm flag is passed
|
79
77
|
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
|
80
78
|
clean_text(base_path + '.txt') if @clean_ocr
|
81
79
|
end
|
82
80
|
ensure
|
83
|
-
FileUtils.remove_entry_secure tempdir if File.
|
81
|
+
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
|
84
82
|
end
|
85
83
|
|
86
|
-
|
87
84
|
private
|
88
85
|
|
89
86
|
def clean_text(file)
|
@@ -98,7 +95,7 @@ module Docsplit
|
|
98
95
|
# Run an external process and raise an exception if it fails.
|
99
96
|
def run(command)
|
100
97
|
result = `#{command}`
|
101
|
-
raise ExtractionFailed, result if
|
98
|
+
raise ExtractionFailed, result if $?.exitstatus.nonzero?
|
102
99
|
result
|
103
100
|
end
|
104
101
|
|
@@ -124,10 +121,8 @@ module Docsplit
|
|
124
121
|
@force_ocr = options[:ocr] == true
|
125
122
|
@forbid_ocr = options[:ocr] == false
|
126
123
|
@language = options[:language] || 'eng'
|
127
|
-
@clean_ocr = (!(options[:clean] == false)
|
128
|
-
@detect_orientation = ((options[:detect_orientation] != false)
|
124
|
+
@clean_ocr = (!(options[:clean] == false) && @language == 'eng')
|
125
|
+
@detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
|
129
126
|
end
|
130
|
-
|
131
127
|
end
|
132
|
-
|
133
128
|
end
|
@@ -1,9 +1,7 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
3
2
|
# Include a method to transparently convert non-PDF arguments to temporary
|
4
3
|
# PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
|
5
4
|
module TransparentPDFs
|
6
|
-
|
7
5
|
# Temporarily convert any non-PDF documents to PDFs before running them
|
8
6
|
# through further extraction.
|
9
7
|
def ensure_pdfs(docs)
|
@@ -12,18 +10,16 @@ module Docsplit
|
|
12
10
|
doc
|
13
11
|
else
|
14
12
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
15
|
-
extract_pdf([doc],
|
13
|
+
extract_pdf([doc], output: tempdir)
|
16
14
|
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
17
15
|
end
|
18
16
|
end
|
19
17
|
end
|
20
18
|
|
21
19
|
def is_pdf?(doc)
|
22
|
-
File.extname(doc).
|
20
|
+
File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
|
27
24
|
extend TransparentPDFs
|
28
|
-
|
29
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: burisu-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2016-09-06 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: |2
|
16
16
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- lib/docsplit/text_cleaner.rb
|
37
37
|
- lib/docsplit/text_extractor.rb
|
38
38
|
- lib/docsplit/transparent_pdfs.rb
|
39
|
+
- lib/docsplit/version.rb
|
39
40
|
- vendor/conf/document-formats.js
|
40
41
|
- vendor/jodconverter/commons-cli-1.1.jar
|
41
42
|
- vendor/jodconverter/commons-io-1.4.jar
|
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
67
|
version: '0'
|
67
68
|
requirements: []
|
68
69
|
rubyforge_project:
|
69
|
-
rubygems_version: 2.4.5
|
70
|
+
rubygems_version: 2.4.5.1
|
70
71
|
signing_key:
|
71
72
|
specification_version: 4
|
72
73
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
73
74
|
test_files: []
|
74
|
-
has_rdoc:
|