burisu-docsplit 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +25 -0
- data/README +22 -0
- data/bin/docsplit +5 -0
- data/docsplit.gemspec +23 -0
- data/lib/docsplit.rb +102 -0
- data/lib/docsplit/command_line.rb +123 -0
- data/lib/docsplit/image_extractor.rb +103 -0
- data/lib/docsplit/info_extractor.rb +50 -0
- data/lib/docsplit/page_extractor.rb +36 -0
- data/lib/docsplit/pdf_extractor.rb +162 -0
- data/lib/docsplit/text_cleaner.rb +99 -0
- data/lib/docsplit/text_extractor.rb +130 -0
- data/lib/docsplit/transparent_pdfs.rb +26 -0
- data/vendor/conf/document-formats.js +233 -0
- data/vendor/jodconverter/commons-cli-1.1.jar +0 -0
- data/vendor/jodconverter/commons-io-1.4.jar +0 -0
- data/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar +0 -0
- data/vendor/jodconverter/json-20090211.jar +0 -0
- data/vendor/jodconverter/juh-3.2.1.jar +0 -0
- data/vendor/jodconverter/jurt-3.2.1.jar +0 -0
- data/vendor/jodconverter/ridl-3.2.1.jar +0 -0
- data/vendor/jodconverter/unoil-3.2.1.jar +0 -0
- data/vendor/logging.properties +1 -0
- metadata +73 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 364308f838aa67a65ff4a478073b9011f6696aec
|
4
|
+
data.tar.gz: 949f366310ece3bf924296148ba2e2346e3dba77
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 71516f45bf021f608c76989dbd7032de6adcc0eae38e5d07b645f26d8819a2f637252f0e52e350bf4deb80ae9d5e29eb92f7aef193f0f12f5736712181fb28de
|
7
|
+
data.tar.gz: 5408cb91169a00ce40294631106dc70ee3f657cbb9a50ef650f1fe8547bf6d19c3cdbe1bb3e51c4512da8ca24682a5b7f6157b0218b4cf5c0ddc7bd2e6dcb935
|
data/LICENSE
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
|
2
|
+
|
3
|
+
Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
|
4
|
+
Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person
|
7
|
+
obtaining a copy of this software and associated documentation
|
8
|
+
files (the "Software"), to deal in the Software without
|
9
|
+
restriction, including without limitation the rights to use,
|
10
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
copies of the Software, and to permit persons to whom the
|
12
|
+
Software is furnished to do so, subject to the following
|
13
|
+
conditions:
|
14
|
+
|
15
|
+
The above copyright notice and this permission notice shall be
|
16
|
+
included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
20
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
22
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
23
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
24
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
25
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
==
|
2
|
+
__ ___ __
|
3
|
+
____/ /___ ______________ / (_) /_
|
4
|
+
/ __ / __ \/ ___/ ___/ __ \/ / / __/
|
5
|
+
/ /_/ / /_/ / /__(__ ) /_/ / / / /_
|
6
|
+
\____/\____/\___/____/ .___/_/_/\__/
|
7
|
+
/_/
|
8
|
+
|
9
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
10
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
11
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
12
|
+
metadata (title, author, number of pages...)
|
13
|
+
|
14
|
+
Installation:
|
15
|
+
gem install docsplit
|
16
|
+
|
17
|
+
For documentation, usage, and examples, see:
|
18
|
+
http://documentcloud.github.com/docsplit/
|
19
|
+
|
20
|
+
To suggest a feature or report a bug:
|
21
|
+
http://github.com/documentcloud/docsplit/issues/
|
22
|
+
|
data/bin/docsplit
ADDED
data/docsplit.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Packaging metadata for the burisu-docsplit gem.
Gem::Specification.new do |spec|
  spec.name        = 'burisu-docsplit'
  spec.version     = '0.7.5' # Keep version in sync with docsplit.rb

  spec.homepage    = "http://documentcloud.github.com/docsplit/"
  spec.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  spec.description = <<-EOS
Docsplit is a command-line utility and Ruby library for splitting apart
documents into their component parts: searchable UTF-8 plain text, page
images or thumbnails in any format, PDFs, single pages, and document
metadata (title, author, number of pages...)
  EOS

  spec.authors     = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
  spec.email       = 'opensource@documentcloud.org'
  spec.license     = 'MIT'

  spec.require_paths = ['lib']
  spec.executables   = ['docsplit']

  # Everything that ships in the gem: build output, library code, the
  # executable, vendored files and the top-level project files.
  packaged = ['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
              'docsplit.gemspec', 'LICENSE', 'README']
  spec.files = Dir.glob(packaged)
end
|
data/lib/docsplit.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
# The Docsplit module is the public entry point of the library. Each
# `extract_*` method normalizes its arguments and hands the work to one of the
# extractor classes required at the bottom of this file.
module Docsplit

  VERSION       = '0.7.5' # Keep in sync with gemspec.

  # Shell-escapes a single command-line argument.
  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
  ESCAPED_ROOT  = ESCAPE[ROOT]

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}

  # Check for all dependencies, and note their absence. An unset PATH is
  # treated as an empty search list instead of crashing at load time.
  dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    DEPENDENCIES[dep] = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst the given documents into single-page PDFs via PageExtractor.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of the given documents via TextExtractor.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize the given documents into page images via ImageExtractor.
  # `opts[:pages]` may be a Range, an Array or a String; it is normalized to
  # the comma-separated string form on a copy of `opts`, so the caller's hash
  # is left untouched.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  def self.extract_pdf(docs, opts={})
    PdfExtractor.new.extract(docs, opts)
  end

  # Define a custom `extract_<key>` method for each of the metadata keys that
  # we support. Each one returns a single bit of metadata via InfoExtractor.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Return every piece of metadata InfoExtractor can find, as a Hash.
  def self.extract_info(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    InfoExtractor.new.extract_all(pdfs, opts)
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end

  # NOTE: `private` does not apply to methods defined with `def self.`, so
  # normalize_value remains callable as Docsplit.normalize_value.
  private

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  # Returns a new String; the argument is never modified.
  def self.normalize_value(value)
    case value
    when Range then value.to_a.join(',')
    when Array then value.map {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
    else            value.to_s
    end
  end

end
|
95
|
+
|
96
|
+
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
97
|
+
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
98
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
99
|
+
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
100
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
|
101
|
+
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
102
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
|
+
|
4
|
+
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
docsplit COMMAND [OPTIONS] path/to/doc.pdf
Main commands:
pages, images, text, pdf.
Metadata commands:
author, date, creator, keywords, producer, subject, title, length.

Example:
docsplit images --size 700x --format jpg document.pdf

Dependencies:
Ruby, Java, A working GraphicsMagick (gm) command,
and a headless OpenOffice server for non-PDF documents.

Options:
(size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: options are
    # parsed (and removed) first, the next argument is the command, and
    # whatever remains is the list of documents.
    def initialize
      parse_options
      cmd      = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions.
    # Metadata commands print their value; unknown commands print the usage.
    # An ExtractionFailed error is reported on stdout and exits with status 1.
    def run
      case @command
      when :images then Docsplit.extract_images(ARGV, @options)
      when :pages  then Docsplit.extract_pages(ARGV, @options)
      when :text   then Docsplit.extract_text(ARGV, @options)
      when :pdf    then Docsplit.extract_pdf(ARGV, @options)
      else
        return usage unless METADATA_KEYS.include?(@command)
        value = Docsplit.send("extract_#{@command}", ARGV, @options)
        puts value unless value.nil?
      end
    rescue ExtractionFailed => e
      puts e.message.chomp
      exit(1)
    end

    # Print out the usage help message.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    #
    # Every value-taking option declares its argument as optional (`[DIR]`),
    # so its block can receive nil when the flag is given bare. The list-valued
    # options therefore only split when a value is actually present.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
          @options[:density] = d
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do
          @options[:clean] = false
        end
        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
          @options[:language] = l
          # Choosing a language also switches off OCR text cleaning.
          @options[:clean] = false
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the parent of InvalidOption, so unknown flags are still
        # reported here, along with ambiguous options and invalid arguments.
        puts e.message
        exit(1)
      end
    end

  end

end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options. Every requested size is rendered in every
    # requested format; with `:rolling`, each size after the first is derived
    # from the images of the size before it.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    #
    # Raises ExtractionFailed with the command output when `gm` exits non-zero.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      basename  = File.basename(pdf, File.extname(pdf))
      directory = directory_for(size)
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      # Every user-supplied value is shell-escaped before it reaches the
      # command line; a geometry such as "1000x>" would otherwise be read by
      # the shell as an output redirection.
      env    = "MAGICK_TMPDIR=#{ESCAPE[tempdir]} OMP_NUM_THREADS=2"
      common = "#{MEMORY_ARGS} -density #{ESCAPE[@density.to_s]} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        # The wildcard is passed through to gm literally (the original quoted it).
        pattern = ESCAPE["#{directory}/*.#{format}"]
        result  = `#{env} gm mogrify #{common} -unsharp 0x0.5+0.75 #{pattern} 2>&1`.chomp
        raise ExtractionFailed, result unless $?.success?
      else
        escaped_pdf = ESCAPE[pdf]
        # The page count is only looked up when no explicit page list was given.
        pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
        page_list(pages).each do |page|
          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
          # gm page indexes are zero-based, hence `page - 1`.
          cmd = "#{env} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result unless $?.success?
        end
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself failed.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil size means "render once, without resizing".
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the (shell-escaped) resize argument, or '' when no size is set.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{ESCAPE[size.to_s]}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,7": sorted, without duplicates.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          first, last = range.split('-').values_at(0, -1)
          Range.new(first.to_i, last.to_i).to_a
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # Regex matchers for different bits of information. Each one captures the
    # rest of the line that follows the field label in the pdfinfo output.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Pull out a single datum from a pdf. Returns nil when the field is absent.
    def extract(key, pdfs, opts)
      extract_all(pdfs, opts)[key]
    end

    # Run pdfinfo on the first of the given PDFs and return a Hash with every
    # field from MATCHERS that was found. `opts` is accepted for interface
    # symmetry with the other extractors and is not read here.
    #
    # Raises ExtractionFailed with pdfinfo's output when the command fails.
    def extract_all(pdfs, opts)
      pdf = [pdfs].flatten.first
      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      parse_info(to_utf8(result))
    end


    private

    # Make the command output safe to match against, dropping bytes that are
    # not valid in its current encoding.
    # ruby 1.8 (iconv) and 1.9 (String#encode) :
    def to_utf8(text)
      if String.method_defined?(:encode)
        return text if text.valid_encoding?
        text.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "")
      else
        require 'iconv' unless defined?(Iconv)
        Iconv.new('UTF-8//IGNORE','UTF-8').iconv(text)
      end
    end

    # Apply every matcher to the pdfinfo output. Only fields that are present
    # end up in the result; :length is converted to an Integer.
    def parse_info(text)
      info = {}
      MATCHERS.each do |key, matcher|
        match = text.match(matcher)
        next unless match
        info[key] = (key == :length) ? match[1].to_i : match[1]
      end
      info
    end

  end

end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Docsplit

  # Delegates to **pdftailor** (preferred) or **pdftk** in order to create
  # bursted single pages from a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # inside the `:output` directory (created if missing; defaults to '.').
    #
    # Raises ExtractionFailed with the command output when bursting fails.
    def extract(pdfs, opts)
      extract_options opts
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        pdf_name  = File.basename(pdf, File.extname(pdf))
        # `%d` is left for the external tool to fill in with the page number.
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")

        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
        else
          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
        end
        result = `#{cmd}`.chomp
        # Capture the exit status right away, before any other work happens.
        status = $?
        # Remove the stray report file from the working directory
        # (presumably written by pdftk's burst -- TODO confirm).
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result unless status.success?
      end
    end


    private

    # Read the output directory from the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
|
3
|
+
module Docsplit

  # Converts documents to PDF: images go through GraphicsMagick, everything
  # else through LibreOffice directly, or through JODConverter otherwise.
  class PdfExtractor
    # Both values are discovered once and shared by every instance.
    @@executable     = nil
    @@version_string = nil

    # Provide a set of helper functions to determine the OS.
    # (`defined?` must be given the constant itself: a string literal is
    # always "defined", which made the Config fallback unreachable.)
    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
    def windows?
      !!HOST_OS.match(/mswin|windows|cygwin/i)
    end
    def osx?
      !!HOST_OS.match(/darwin/i)
    end
    def linux?
      !!HOST_OS.match(/linux/i)
    end

    # The name and version number of the office software to be used for
    # extraction, as a String ('' when neither probe printed anything).
    #
    # The first line of `--version` is used when there is one, which is what
    # the original code always ended up with: its guard, `match(/[0-9]*/)`,
    # matched every string. The first line of the help output is the fallback.
    def version_string
      unless @@version_string
        null = windows? ? "NUL" : "/dev/null"
        line   = `#{office_executable} --version`.split("\n").first
        line ||= `#{office_executable} -h 2>#{null}`.split("\n").first
        @@version_string = line.to_s
      end
      @@version_string
    end
    def libre_office?
      !!version_string.match(/^LibreOffice/)
    end
    def open_office?
      !!version_string.match(/^OpenOffice.org/)
    end

    # A set of default locations to search for office software
    # These have been extracted from JODConverter. Each listed
    # path should contain a directory "program" which in turn
    # contains the "soffice" executable.
    # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
    def office_search_paths
      if windows?
        office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
        # Look under the regular program directories as well as the
        # CommonProgramFiles one the original used; unset variables are skipped.
        roots = ["ProgramFiles", "ProgramFiles(x86)", "CommonProgramFiles"].map {|name| ENV[name] }.compact.uniq
        search_paths = roots.map {|root| office_names.map {|program| File.join(root, program) } }.flatten
      elsif osx?
        search_paths = %w(
          /Applications/LibreOffice.app/Contents
          /Applications/OpenOffice.org.app/Contents
        )
      else # probably linux/unix
        # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
        search_paths = %w(
          /usr/lib/libreoffice
          /usr/lib64/libreoffice
          /opt/libreoffice
          /usr/lib/openoffice
          /usr/lib64/openoffice
          /opt/openoffice.org3
          /app/vendor/libreoffice
        )
      end
      search_paths
    end

    # Identify the path to a working office executable.
    # Raises ArgumentError for an OFFICE_PATH that does not exist, and
    # OfficeNotFound when no candidate location holds an executable.
    def office_executable
      paths = office_search_paths

      # If an OFFICE_PATH has been specified on the commandline
      # raise an error if that path isn't valid, otherwise, add
      # it to the front of our search paths.
      if ENV['OFFICE_PATH']
        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
        paths.unshift(ENV['OFFICE_PATH'])
      end

      # The location of the office executable is OS dependent
      path_pieces = ["soffice"]
      if windows?
        path_pieces += [["program", "soffice.bin"]]
      elsif osx?
        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
      else
        path_pieces += [["program", "soffice"]]
      end

      # Search for the first suitable office executable
      # and short circuit once an executable is found.
      paths.each do |path|
        if File.exist? path
          # A search path may point straight at the executable itself.
          @@executable ||= path unless File.directory? path
          path_pieces.each do |pieces|
            check_path = File.join(path, pieces)
            @@executable ||= check_path if File.exist? check_path
          end
        end
        break if @@executable
      end
      raise OfficeNotFound, "No office software found" unless @@executable
      @@executable
    end

    # Used to specify the office location for JODConverter: the directory two
    # levels above the executable.
    def office_path
      File.dirname(File.dirname(office_executable))
    end

    # Convert documents to PDF, writing `<basename>.pdf` for each one into the
    # `:output` directory (created if missing; defaults to '.').
    # Raises ExtractionFailed with the command output when a conversion fails.
    def extract(docs, opts)
      out = opts[:output] || '.'
      FileUtils.mkdir_p out unless File.exist?(out)
      [docs].flatten.each do |doc|
        ext = File.extname(doc)
        basename = File.basename(doc, ext)
        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

        # The detected MIME type decides which converter handles the document.
        mime = `file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0]
        if GM_FORMATS.include?(mime)
          result = `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf 2>&1`.chomp
          raise ExtractionFailed, result unless $?.success?
        elsif libre_office?
          # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
          # The unescaped path is used: this is an environment value, not shell text.
          ENV['SYSUSERCONFIG']="file://#{File.expand_path(out)}"

          options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
          cmd = "#{office_executable} #{options} 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result unless $?.success?
        else # open office presumably, rely on JODConverter to figure it out.
          options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
          run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
        end
      end
    end

    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

    HEADLESS      = "-Djava.awt.headless=true"

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    # Returns the command output (nil when empty) if `return_output` is set,
    # true otherwise. Raises ExtractionFailed when java exits non-zero.
    # `opts` is accepted but not read.
    def run_jod(command, pdfs, opts, return_output=false)
      pdfs   = [pdfs].flatten.map {|pdf| ESCAPE[pdf] }.join(' ')
      # Always hand the office location over as a system property. The
      # original passed a bare path on non-OSX hosts, which java would read
      # as a positional argument placed before `-cp`.
      office = "-Doffice.home=#{ESCAPE[office_path]}"
      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      return return_output ? (result.empty? ? nil : result) : true
    end

    # Raised when no office executable can be located.
    class OfficeNotFound < StandardError; end
  end
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Return a cleaned copy of `text`: garbage words are dropped, runs of
    # whitespace left behind by them are collapsed, and long sequences of very
    # short tokens are removed.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first. The coercion works on
    # a copy, so the caller's string is left untouched (and may be frozen).
    def clean(text)
      if String.method_defined?(:encode)
        text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
      else
        require 'iconv' unless defined?(Iconv)
        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      end

      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      until scanner.eos?
        if space = scanner.scan(SPACE)
          # After a kept space, further whitespace is only kept when it
          # contains a line break.
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        end
      end
      cleaned.join('').gsub(REPEATED, '')
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, a falsy one otherwise.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdftotext** and **tesseract** in order to extract text from
# PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
# forbid OCR extraction, but by default the heuristic works like this:
#
#  * Check for the presence of fonts in the PDF. If no fonts are detected,
#    OCR is used automatically.
#  * Extract the text of each page with **pdftotext**. If the page yields
#    less than `MIN_TEXT_PER_PAGE` of text (a scanned image page, or a page
#    that just contains a filename and a page number), add it to the list of
#    `@pages_to_ocr`.
#  * Re-OCR each page in the `@pages_to_ocr` list at the end.
#
class TextExtractor

  # Matched against the output of `pdffonts`: output that ends in a dashed
  # rule is treated as "no fonts listed", i.e. no embedded text.
  NO_TEXT_DETECTED  = /---------\n\Z/

  # Options handed to `gm convert` when rasterizing pages for OCR.
  OCR_FLAGS         = '-density 400x400 -colorspace GRAY'.freeze
  MEMORY_ARGS       = '-limit memory 256MiB -limit map 512MiB'.freeze

  # Pages whose extracted text is shorter than this (measured with
  # String#length) are queued for OCR.
  MIN_TEXT_PER_PAGE = 100

  def initialize
    @pages_to_ocr = []
  end

  # Extract text from a list of PDFs.
  #
  # pdfs - a single path or an (arbitrarily nested) list of paths.
  # opts - options hash; see `extract_options` for the recognized keys.
  #
  # Raises ExtractionFailed (via `run`) when an external tool fails.
  def extract(pdfs, opts)
    extract_options opts
    FileUtils.mkdir_p @output unless File.exist?(@output)
    [pdfs].flatten.each do |pdf|
      # Start every document with an empty queue; otherwise pages flagged
      # for an earlier PDF would be OCRed again against this one.
      @pages_to_ocr = []
      @pdf_name = File.basename(pdf, File.extname(pdf))
      pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
      if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
        extract_from_ocr(pdf, pages)
      else
        extract_from_pdf(pdf, pages)
        # Second pass: OCR only the pages pdftotext got too little text from.
        if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
          extract_from_ocr(pdf, @pages_to_ocr)
        end
      end
    end
  end

  # Does a PDF have any text embedded? Runs `pdffonts` and returns false
  # when its output matches NO_TEXT_DETECTED.
  def contains_text?(pdf)
    fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
    !fonts.match(NO_TEXT_DETECTED)
  end

  # Extract a page range worth of text from a PDF, directly. When `pages`
  # is nil the whole document is written to a single text file instead.
  def extract_from_pdf(pdf, pages)
    return extract_full(pdf) unless pages
    pages.each {|page| extract_page(pdf, page) }
  end

  # Extract a page range worth of text from a PDF via OCR. Each page is
  # rendered to a TIFF in a temporary directory with `gm convert` and then
  # passed to `tesseract`; with `pages` nil the whole document is converted
  # and OCRed in one go. The temporary directory is always removed.
  def extract_from_ocr(pdf, pages)
    tempdir     = Dir.mktmpdir
    base_path   = File.join(@output, @pdf_name)
    escaped_pdf = ESCAPE[pdf]
    # Everything interpolated into the shell command lines is escaped, not
    # just the input and TIFF paths.
    escaped_tmp = ESCAPE[tempdir]
    language    = ESCAPE[@language]
    if pages
      pages.each do |page|
        tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
        escaped_tiff = ESCAPE[tiff]
        file = "#{base_path}_#{page}"
        # `page` is 1-based here, while the `[n]` page selector is given n - 1.
        run "MAGICK_TMPDIR=#{escaped_tmp} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
        run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{language} 2>&1"
        clean_text(file + '.txt') if @clean_ocr
        FileUtils.remove_entry_secure tiff
      end
    else
      tiff = "#{tempdir}/#{@pdf_name}.tif"
      escaped_tiff = ESCAPE[tiff]
      run "MAGICK_TMPDIR=#{escaped_tmp} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
      run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{language} 2>&1"
      clean_text(base_path + '.txt') if @clean_ocr
    end
  ensure
    # `tempdir` is nil if Dir.mktmpdir itself raised; don't mask that error.
    FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
  end


  private

  # Rewrite `file` in place with its contents passed through
  # `Docsplit.clean_text`.
  def clean_text(file)
    File.open(file, 'r+') do |f|
      text = f.read
      f.truncate(0)
      f.rewind
      f.write(Docsplit.clean_text(text))
    end
  end

  # Run an external process and raise an exception if it fails.
  # Returns the command's captured output; on failure that output becomes
  # the ExtractionFailed message.
  def run(command)
    result = `#{command}`
    raise ExtractionFailed, result unless $?.success?
    result
  end

  # Extract the full contents of a pdf as a single file, directly.
  def extract_full(pdf)
    text_path = File.join(@output, "#{@pdf_name}.txt")
    run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
  end

  # Extract the contents of a single page of text, directly, adding it to
  # the `@pages_to_ocr` list if the text length is inadequate.
  def extract_page(pdf, page)
    text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
    run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    unless @forbid_ocr
      @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
    end
  end

  # Copy the recognized options into instance variables:
  #
  #  :output   - directory for the text files (default '.').
  #  :pages    - 'all', an enumerable of page numbers, or nil for one file
  #              per document.
  #  :ocr      - true forces OCR, false forbids it, anything else leaves the
  #              heuristic in charge.
  #  :clean    - OCR output is cleaned unless this is exactly false.
  #  :language - language passed to tesseract's `-l` flag (default 'eng').
  def extract_options(options)
    @output     = options[:output] || '.'
    @pages      = options[:pages]
    @force_ocr  = options[:ocr] == true
    @forbid_ocr = options[:ocr] == false
    @clean_ocr  = !(options[:clean] == false)
    @language   = options[:language] || 'eng'
  end

end
|
129
|
+
|
130
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Include a method to transparently convert non-PDF arguments to temporary
# PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
module TransparentPDFs

  # "%PDF-<major>[.<minor>]" at the very start of the file. There is no
  # end-of-line anchor, so headers terminated by "\r\n" or "\r" also match.
  PDF_HEADER = /\A%PDF-\d+(\.\d+)?/

  # Number of leading bytes read when sniffing for PDF_HEADER.
  PDF_HEADER_BYTES = 16

  # Temporarily convert any non-PDF documents to PDFs before running them
  # through further extraction.
  #
  # docs - a single path or an (arbitrarily nested) list of paths.
  #
  # Returns an array of paths, one per input: the original path for
  # documents that are already PDFs (by extension or by header), otherwise
  # the path of the converted copy under `Dir.tmpdir`/docsplit.
  def ensure_pdfs(docs)
    [docs].flatten.map do |doc|
      ext = File.extname(doc)
      if ext.downcase == '.pdf' || pdf_header?(doc)
        doc
      else
        tempdir = File.join(Dir.tmpdir, 'docsplit')
        extract_pdf([doc], {:output => tempdir})
        File.join(tempdir, File.basename(doc, ext) + '.pdf')
      end
    end
  end

  private

  # Does the file at `doc` start with a PDF header? Reads a few bytes in
  # binary mode so that empty files and non-UTF-8 content are answered with
  # `false` instead of raising.
  def pdf_header?(doc)
    header = File.open(doc, 'rb') { |f| f.read(PDF_HEADER_BYTES) }
    !!(header.to_s =~ PDF_HEADER)
  end

end
|
23
|
+
|
24
|
+
extend TransparentPDFs
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
[
|
2
|
+
{
|
3
|
+
"name": "Portable Document Format",
|
4
|
+
"extension": "pdf",
|
5
|
+
"mediaType": "application/pdf",
|
6
|
+
"storePropertiesByFamily": {
|
7
|
+
"DRAWING": {"FilterName": "draw_pdf_Export"},
|
8
|
+
"SPREADSHEET": {"FilterName": "calc_pdf_Export"},
|
9
|
+
"PRESENTATION": {"FilterName": "impress_pdf_Export"},
|
10
|
+
"TEXT": {"FilterName": "writer_pdf_Export"}
|
11
|
+
}
|
12
|
+
},
|
13
|
+
{
|
14
|
+
"name": "Macromedia Flash",
|
15
|
+
"extension": "swf",
|
16
|
+
"mediaType": "application/x-shockwave-flash",
|
17
|
+
"storePropertiesByFamily": {
|
18
|
+
"DRAWING": {"FilterName": "draw_flash_Export"},
|
19
|
+
"PRESENTATION": {"FilterName": "impress_flash_Export"}
|
20
|
+
}
|
21
|
+
},
|
22
|
+
{
|
23
|
+
"name": "HTML",
|
24
|
+
"extension": "html",
|
25
|
+
"mediaType": "text/html",
|
26
|
+
"inputFamily": "TEXT",
|
27
|
+
"storePropertiesByFamily": {
|
28
|
+
"SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
|
29
|
+
"PRESENTATION": {"FilterName": "impress_html_Export"},
|
30
|
+
"TEXT": {"FilterName": "HTML (StarWriter)"}
|
31
|
+
}
|
32
|
+
},
|
33
|
+
{
|
34
|
+
"name": "OpenDocument Text",
|
35
|
+
"extension": "odt",
|
36
|
+
"mediaType": "application/vnd.oasis.opendocument.text",
|
37
|
+
"inputFamily": "TEXT",
|
38
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"name": "OpenOffice.org 1.0 Text Document",
|
42
|
+
"extension": "sxw",
|
43
|
+
"mediaType": "application/vnd.sun.xml.writer",
|
44
|
+
"inputFamily": "TEXT",
|
45
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
|
46
|
+
},
|
47
|
+
{
|
48
|
+
"name": "Microsoft Word",
|
49
|
+
"extension": "doc",
|
50
|
+
"mediaType": "application/msword",
|
51
|
+
"inputFamily": "TEXT",
|
52
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
|
53
|
+
},
|
54
|
+
{
|
55
|
+
"name": "Microsoft Word 2007 XML",
|
56
|
+
"extension": "docx",
|
57
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
58
|
+
"inputFamily": "TEXT"
|
59
|
+
},
|
60
|
+
{
|
61
|
+
"name": "Rich Text Format",
|
62
|
+
"extension": "rtf",
|
63
|
+
"mediaType": "text/rtf",
|
64
|
+
"inputFamily": "TEXT",
|
65
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
|
66
|
+
},
|
67
|
+
{
|
68
|
+
"name": "WordPerfect",
|
69
|
+
"extension": "wpd",
|
70
|
+
"mediaType": "application/wordperfect",
|
71
|
+
"inputFamily": "TEXT"
|
72
|
+
},
|
73
|
+
{
|
74
|
+
"name": "Plain Text",
|
75
|
+
"extension": "txt",
|
76
|
+
"mediaType": "text/plain",
|
77
|
+
"inputFamily": "TEXT",
|
78
|
+
"loadProperties": {
|
79
|
+
"FilterName": "Text (encoded)",
|
80
|
+
"FilterOptions": "utf8"
|
81
|
+
},
|
82
|
+
"storePropertiesByFamily": {"TEXT": {
|
83
|
+
"FilterName": "Text (encoded)",
|
84
|
+
"FilterOptions": "utf8"
|
85
|
+
}}
|
86
|
+
},
|
87
|
+
{
|
88
|
+
"name": "MediaWiki wikitext",
|
89
|
+
"extension": "wiki",
|
90
|
+
"mediaType": "text/x-wiki",
|
91
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
|
92
|
+
},
|
93
|
+
{
|
94
|
+
"name": "OpenDocument Spreadsheet",
|
95
|
+
"extension": "ods",
|
96
|
+
"mediaType": "application/vnd.oasis.opendocument.spreadsheet",
|
97
|
+
"inputFamily": "SPREADSHEET",
|
98
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
|
99
|
+
},
|
100
|
+
{
|
101
|
+
"name": "OpenOffice.org 1.0 Spreadsheet",
|
102
|
+
"extension": "sxc",
|
103
|
+
"mediaType": "application/vnd.sun.xml.calc",
|
104
|
+
"inputFamily": "SPREADSHEET",
|
105
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
|
106
|
+
},
|
107
|
+
{
|
108
|
+
"name": "Microsoft Excel",
|
109
|
+
"extension": "xls",
|
110
|
+
"mediaType": "application/vnd.ms-excel",
|
111
|
+
"inputFamily": "SPREADSHEET",
|
112
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
|
113
|
+
},
|
114
|
+
{
|
115
|
+
"name": "Microsoft Excel 2007 XML",
|
116
|
+
"extension": "xlsx",
|
117
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
118
|
+
"inputFamily": "SPREADSHEET"
|
119
|
+
},
|
120
|
+
{
|
121
|
+
"name": "Comma Separated Values",
|
122
|
+
"extension": "csv",
|
123
|
+
"mediaType": "text/csv",
|
124
|
+
"inputFamily": "SPREADSHEET",
|
125
|
+
"loadProperties": {
|
126
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
127
|
+
"FilterOptions": "44,34,0"
|
128
|
+
},
|
129
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
130
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
131
|
+
"FilterOptions": "44,34,0"
|
132
|
+
}}
|
133
|
+
},
|
134
|
+
{
|
135
|
+
"name": "Tab Separated Values",
|
136
|
+
"extension": "tsv",
|
137
|
+
"mediaType": "text/tab-separated-values",
|
138
|
+
"inputFamily": "SPREADSHEET",
|
139
|
+
"loadProperties": {
|
140
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
141
|
+
"FilterOptions": "9,34,0"
|
142
|
+
},
|
143
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
144
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
145
|
+
"FilterOptions": "9,34,0"
|
146
|
+
}}
|
147
|
+
},
|
148
|
+
{
|
149
|
+
"name": "OpenDocument Presentation",
|
150
|
+
"extension": "odp",
|
151
|
+
"mediaType": "application/vnd.oasis.opendocument.presentation",
|
152
|
+
"inputFamily": "PRESENTATION",
|
153
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
|
154
|
+
},
|
155
|
+
{
|
156
|
+
"name": "OpenOffice.org 1.0 Presentation",
|
157
|
+
"extension": "sxi",
|
158
|
+
"mediaType": "application/vnd.sun.xml.impress",
|
159
|
+
"inputFamily": "PRESENTATION",
|
160
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
|
161
|
+
},
|
162
|
+
{
|
163
|
+
"name": "Microsoft PowerPoint",
|
164
|
+
"extension": "ppt",
|
165
|
+
"mediaType": "application/vnd.ms-powerpoint",
|
166
|
+
"inputFamily": "PRESENTATION",
|
167
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
|
168
|
+
},
|
169
|
+
{
|
170
|
+
"name": "Microsoft PowerPoint 2007 XML",
|
171
|
+
"extension": "pptx",
|
172
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
173
|
+
"inputFamily": "PRESENTATION"
|
174
|
+
},
|
175
|
+
{
|
176
|
+
"name": "OpenDocument Drawing",
|
177
|
+
"extension": "odg",
|
178
|
+
"mediaType": "application/vnd.oasis.opendocument.graphics",
|
179
|
+
"inputFamily": "DRAWING",
|
180
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
|
181
|
+
},
|
182
|
+
{
|
183
|
+
"name": "Scalable Vector Graphics",
|
184
|
+
"extension": "svg",
|
185
|
+
"mediaType": "image/svg+xml",
|
186
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
|
187
|
+
},
|
188
|
+
{
|
189
|
+
"name": "Portable Network Graphic",
|
190
|
+
"extension": "png",
|
191
|
+
"mediaType": "image/png",
|
192
|
+
"storePropertiesByFamily": {
|
193
|
+
"DRAWING": {"FilterName": "draw_png_Export"},
|
194
|
+
"PRESENTATION": {"FilterName": "impress_png_Export"}
|
195
|
+
}
|
196
|
+
},
|
197
|
+
{
|
198
|
+
"name": "Graphics Interchange Format",
|
199
|
+
"extension": "gif",
|
200
|
+
"mediaType": "image/gif",
|
201
|
+
"storePropertiesByFamily": {
|
202
|
+
"DRAWING": {"FilterName": "draw_gif_Export"},
|
203
|
+
"PRESENTATION": {"FilterName": "impress_gif_Export"}
|
204
|
+
}
|
205
|
+
},
|
206
|
+
{
|
207
|
+
"name": "Joint Photographic Experts Group",
|
208
|
+
"extension": "jpg",
|
209
|
+
"mediaType": "image/jpeg",
|
210
|
+
"storePropertiesByFamily": {
|
211
|
+
"DRAWING": {"FilterName": "draw_jpg_Export"},
|
212
|
+
"PRESENTATION": {"FilterName": "impress_jpg_Export"}
|
213
|
+
}
|
214
|
+
},
|
215
|
+
{
|
216
|
+
"name": "Windows Bitmap",
|
217
|
+
"extension": "bmp",
|
218
|
+
"mediaType": "image/bmp",
|
219
|
+
"storePropertiesByFamily": {
|
220
|
+
"DRAWING": {"FilterName": "draw_bmp_Export"},
|
221
|
+
"PRESENTATION": {"FilterName": "impress_bmp_Export"}
|
222
|
+
}
|
223
|
+
},
|
224
|
+
{
|
225
|
+
"name": "Tagged Image File Format",
|
226
|
+
"extension": "tif",
|
227
|
+
"mediaType": "image/tiff",
|
228
|
+
"storePropertiesByFamily": {
|
229
|
+
"DRAWING": {"FilterName": "draw_tif_Export"},
|
230
|
+
"PRESENTATION": {"FilterName": "impress_tif_Export"}
|
231
|
+
}
|
232
|
+
}
|
233
|
+
]
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
.level=WARNING
|
metadata
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: burisu-docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeremy Ashkenas
|
8
|
+
- Samuel Clay
|
9
|
+
- Ted Han
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-03-27 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: |2
|
16
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
17
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
18
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
19
|
+
metadata (title, author, number of pages...)
|
20
|
+
email: opensource@documentcloud.org
|
21
|
+
executables:
|
22
|
+
- docsplit
|
23
|
+
extensions: []
|
24
|
+
extra_rdoc_files: []
|
25
|
+
files:
|
26
|
+
- lib/docsplit/image_extractor.rb
|
27
|
+
- lib/docsplit/info_extractor.rb
|
28
|
+
- lib/docsplit/transparent_pdfs.rb
|
29
|
+
- lib/docsplit/text_extractor.rb
|
30
|
+
- lib/docsplit/text_cleaner.rb
|
31
|
+
- lib/docsplit/page_extractor.rb
|
32
|
+
- lib/docsplit/pdf_extractor.rb
|
33
|
+
- lib/docsplit/command_line.rb
|
34
|
+
- lib/docsplit.rb
|
35
|
+
- bin/docsplit
|
36
|
+
- vendor/logging.properties
|
37
|
+
- vendor/conf/document-formats.js
|
38
|
+
- vendor/jodconverter/jurt-3.2.1.jar
|
39
|
+
- vendor/jodconverter/unoil-3.2.1.jar
|
40
|
+
- vendor/jodconverter/commons-cli-1.1.jar
|
41
|
+
- vendor/jodconverter/json-20090211.jar
|
42
|
+
- vendor/jodconverter/ridl-3.2.1.jar
|
43
|
+
- vendor/jodconverter/commons-io-1.4.jar
|
44
|
+
- vendor/jodconverter/juh-3.2.1.jar
|
45
|
+
- vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
|
46
|
+
- docsplit.gemspec
|
47
|
+
- LICENSE
|
48
|
+
- README
|
49
|
+
homepage: http://documentcloud.github.com/docsplit/
|
50
|
+
licenses:
|
51
|
+
- MIT
|
52
|
+
metadata: {}
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
requirements: []
|
68
|
+
rubyforge_project:
|
69
|
+
rubygems_version: 2.0.14
|
70
|
+
signing_key:
|
71
|
+
specification_version: 4
|
72
|
+
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
73
|
+
test_files: []
|