concerto_docsplit 0.7.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +25 -0
- data/README +22 -0
- data/bin/docsplit +5 -0
- data/lib/docsplit.rb +102 -0
- data/lib/docsplit/command_line.rb +123 -0
- data/lib/docsplit/image_extractor.rb +103 -0
- data/lib/docsplit/info_extractor.rb +50 -0
- data/lib/docsplit/page_extractor.rb +36 -0
- data/lib/docsplit/pdf_extractor.rb +163 -0
- data/lib/docsplit/text_cleaner.rb +99 -0
- data/lib/docsplit/text_extractor.rb +138 -0
- data/lib/docsplit/transparent_pdfs.rb +29 -0
- data/vendor/conf/document-formats.js +233 -0
- data/vendor/jodconverter/commons-cli-1.1.jar +0 -0
- data/vendor/jodconverter/commons-io-1.4.jar +0 -0
- data/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar +0 -0
- data/vendor/jodconverter/json-20090211.jar +0 -0
- data/vendor/jodconverter/juh-3.2.1.jar +0 -0
- data/vendor/jodconverter/jurt-3.2.1.jar +0 -0
- data/vendor/jodconverter/ridl-3.2.1.jar +0 -0
- data/vendor/jodconverter/unoil-3.2.1.jar +0 -0
- data/vendor/logging.properties +1 -0
- metadata +72 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a54bd6493f131da57298fd581c261a0d25913569
|
4
|
+
data.tar.gz: 7c9e99ec164c30d9d9378dea9b82e524601d2bd7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5e2bfc51c164e989a1295206028d8462d120ce735a7ccdc4dcf9f36835c5509aa6a6947a6e8a59184c444eb05fd927a65f8755d79af2c78ee1fdc276c929e616
|
7
|
+
data.tar.gz: 76d58902b9e279203b0ac030cb2e692984176416a397de480d373c15c5c9b57d8db143e66daaad1905e8a7ea6bc4ee3062a1843a9127cf8544aa7a661d7c06fb
|
data/LICENSE
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
|
2
|
+
|
3
|
+
Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
|
4
|
+
Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person
|
7
|
+
obtaining a copy of this software and associated documentation
|
8
|
+
files (the "Software"), to deal in the Software without
|
9
|
+
restriction, including without limitation the rights to use,
|
10
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
copies of the Software, and to permit persons to whom the
|
12
|
+
Software is furnished to do so, subject to the following
|
13
|
+
conditions:
|
14
|
+
|
15
|
+
The above copyright notice and this permission notice shall be
|
16
|
+
included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
20
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
22
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
23
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
24
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
25
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
==
|
2
|
+
__ ___ __
|
3
|
+
____/ /___ ______________ / (_) /_
|
4
|
+
/ __ / __ \/ ___/ ___/ __ \/ / / __/
|
5
|
+
/ /_/ / /_/ / /__(__ ) /_/ / / / /_
|
6
|
+
\____/\____/\___/____/ .___/_/_/\__/
|
7
|
+
/_/
|
8
|
+
|
9
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
10
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
11
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
12
|
+
metadata (title, author, number of pages...)
|
13
|
+
|
14
|
+
Installation:
|
15
|
+
gem install docsplit
|
16
|
+
|
17
|
+
For documentation, usage, and examples, see:
|
18
|
+
http://documentcloud.github.com/docsplit/
|
19
|
+
|
20
|
+
To suggest a feature or report a bug:
|
21
|
+
http://github.com/documentcloud/docsplit/issues/
|
22
|
+
|
data/bin/docsplit
ADDED
data/lib/docsplit.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
# The Docsplit module delegates to the Java PDF extractors.
|
6
|
+
module Docsplit
|
7
|
+
|
8
|
+
VERSION = '0.7.5' # Keep in sync with gemspec.
|
9
|
+
|
10
|
+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
|
+
|
12
|
+
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
|
+
ESCAPED_ROOT = ESCAPE[ROOT]
|
14
|
+
|
15
|
+
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
16
|
+
|
17
|
+
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
18
|
+
|
19
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
|
20
|
+
|
21
|
+
# Check for all dependencies, and note their absence.
|
22
|
+
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
23
|
+
DEPENDENCIES.each_key do |dep|
|
24
|
+
dirs.each do |dir|
|
25
|
+
if File.executable?(File.join(dir, dep.to_s))
|
26
|
+
DEPENDENCIES[dep] = true
|
27
|
+
break
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
33
|
+
# broke.
|
34
|
+
class ExtractionFailed < StandardError; end
|
35
|
+
|
36
|
+
# Use the ExtractPages Java class to burst a PDF into single pages.
|
37
|
+
def self.extract_pages(pdfs, opts={})
|
38
|
+
pdfs = ensure_pdfs(pdfs)
|
39
|
+
PageExtractor.new.extract(pdfs, opts)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Use the ExtractText Java class to write out all embedded text.
|
43
|
+
def self.extract_text(pdfs, opts={})
|
44
|
+
pdfs = ensure_pdfs(pdfs)
|
45
|
+
TextExtractor.new.extract(pdfs, opts)
|
46
|
+
end
|
47
|
+
|
48
|
+
# Use the ExtractImages Java class to rasterize a PDF into each page's image.
|
49
|
+
def self.extract_images(pdfs, opts={})
|
50
|
+
pdfs = ensure_pdfs(pdfs)
|
51
|
+
opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
|
52
|
+
ImageExtractor.new.extract(pdfs, opts)
|
53
|
+
end
|
54
|
+
|
55
|
+
# Use JODConverter to extract the documents as PDFs.
|
56
|
+
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
57
|
+
def self.extract_pdf(docs, opts={})
|
58
|
+
PdfExtractor.new.extract(docs, opts)
|
59
|
+
end
|
60
|
+
|
61
|
+
# Define custom methods for each of the metadata keys that we support.
|
62
|
+
# Use the ExtractInfo Java class to print out a single bit of metadata.
|
63
|
+
METADATA_KEYS.each do |key|
|
64
|
+
instance_eval <<-EOS
|
65
|
+
def self.extract_#{key}(pdfs, opts={})
|
66
|
+
pdfs = ensure_pdfs(pdfs)
|
67
|
+
InfoExtractor.new.extract(:#{key}, pdfs, opts)
|
68
|
+
end
|
69
|
+
EOS
|
70
|
+
end
|
71
|
+
|
72
|
+
def self.extract_info(pdfs, opts={})
|
73
|
+
pdfs = ensure_pdfs(pdfs)
|
74
|
+
InfoExtractor.new.extract_all(pdfs, opts)
|
75
|
+
end
|
76
|
+
|
77
|
+
# Utility method to clean OCR'd text with garbage characters.
|
78
|
+
def self.clean_text(text)
|
79
|
+
TextCleaner.new.clean(text)
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# Normalize a value in an options hash for the command line.
|
85
|
+
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
86
|
+
# Turn a Range, an Array (optionally containing Ranges), or any other value
# into the comma-separated string form used on the command line.
#
#   normalize_value(1..3)        # => "1,2,3"
#   normalize_value([1, 4..5])   # => "1,4,5"
#   normalize_value(7)           # => "7"
#
# The argument is never modified.
def self.normalize_value(value)
  case value
  when Range then value.to_a.join(',')
  # Use the non-destructive `map`: `map!` would rewrite the caller's array
  # in place and raise on a frozen array.
  when Array then value.map { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
  else value.to_s
  end
end
|
93
|
+
|
94
|
+
end
|
95
|
+
|
96
|
+
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
97
|
+
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
98
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
99
|
+
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
100
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
|
101
|
+
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
102
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
|
+
|
4
|
+
module Docsplit
|
5
|
+
|
6
|
+
# A single command-line utility to separate a PDF into all its component parts.
|
7
|
+
class CommandLine
|
8
|
+
|
9
|
+
BANNER = <<-EOS
|
10
|
+
docsplit breaks apart documents into images, text, or individual pages.
|
11
|
+
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
|
12
|
+
|
13
|
+
Usage:
|
14
|
+
docsplit COMMAND [OPTIONS] path/to/doc.pdf
|
15
|
+
Main commands:
|
16
|
+
pages, images, text, pdf.
|
17
|
+
Metadata commands:
|
18
|
+
author, date, creator, keywords, producer, subject, title, length.
|
19
|
+
|
20
|
+
Example:
|
21
|
+
docsplit images --size 700x --format jpg document.pdf
|
22
|
+
|
23
|
+
Dependencies:
|
24
|
+
Ruby, Java, A working GraphicsMagick (gm) command,
|
25
|
+
and a headless OpenOffice server for non-PDF documents.
|
26
|
+
|
27
|
+
Options:
|
28
|
+
(size, pages and format can take comma-separated values)
|
29
|
+
|
30
|
+
EOS
|
31
|
+
|
32
|
+
# Creating a CommandLine runs off of the contents of ARGV.
|
33
|
+
def initialize
|
34
|
+
parse_options
|
35
|
+
cmd = ARGV.shift
|
36
|
+
@command = cmd && cmd.to_sym
|
37
|
+
run
|
38
|
+
end
|
39
|
+
|
40
|
+
# Delegate to the Docsplit Ruby API to perform all extractions.
|
41
|
+
def run
|
42
|
+
begin
|
43
|
+
case @command
|
44
|
+
when :images then Docsplit.extract_images(ARGV, @options)
|
45
|
+
when :pages then Docsplit.extract_pages(ARGV, @options)
|
46
|
+
when :text then Docsplit.extract_text(ARGV, @options)
|
47
|
+
when :pdf then Docsplit.extract_pdf(ARGV, @options)
|
48
|
+
else
|
49
|
+
if METADATA_KEYS.include?(@command)
|
50
|
+
value = Docsplit.send("extract_#{@command}", ARGV, @options)
|
51
|
+
puts value unless value.nil?
|
52
|
+
else
|
53
|
+
usage
|
54
|
+
end
|
55
|
+
end
|
56
|
+
rescue ExtractionFailed => e
|
57
|
+
puts e.message.chomp
|
58
|
+
exit(1)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Print out the usage help message.
|
63
|
+
def usage
|
64
|
+
puts "\n#{@option_parser}\n"
|
65
|
+
exit
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
# Use the OptionParser library to parse out all supported options. Return
|
72
|
+
# options formatted for the Ruby API.
|
73
|
+
# Parse ARGV with OptionParser, storing the results in @options in the shape
# the Ruby API expects. Recognised switches are removed from ARGV.
# Prints the parser's message and exits with status 1 on a parse error.
def parse_options
  @options = {:ocr => :default, :clean => true}
  @option_parser = OptionParser.new do |opts|
    opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
      @options[:output] = d
    end
    opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
      @options[:pages] = p
    end
    opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
      # The argument is optional, so `s` is nil when the switch is given bare;
      # leave the option unset instead of calling `split` on nil.
      @options[:size] = s.split(',') if s
    end
    opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
      # Same nil guard as --size.
      @options[:format] = t.split(',') if t
    end
    opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
      @options[:density] = d
    end
    opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
      @options[:ocr] = o
    end
    opts.on('--no-clean', 'disable cleaning of OCR\'d text') do
      @options[:clean] = false
    end
    opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
      @options[:language] = l
      # Choosing a language also switches off OCR text cleaning.
      @options[:clean] = false
    end
    opts.on('-r', '--rolling', 'generate images from each previous image') do
      @options[:rolling] = true
    end
    opts.on_tail('-v', '--version', 'display docsplit version') do
      puts "Docsplit version #{Docsplit::VERSION}"
      exit
    end
    opts.on_tail('-h', '--help', 'display this help message') do
      usage
    end
  end
  @option_parser.banner = BANNER
  begin
    @option_parser.parse!(ARGV)
  rescue OptionParser::ParseError => e
    # ParseError is the parent of InvalidOption, AmbiguousOption,
    # InvalidArgument, etc.; report all of them the same way.
    puts e.message
    exit(1)
  end
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to GraphicsMagick in order to convert PDF documents into
|
4
|
+
# nicely sized images.
|
5
|
+
class ImageExtractor
|
6
|
+
|
7
|
+
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
8
|
+
DEFAULT_FORMAT = :png
|
9
|
+
DEFAULT_DENSITY = '150'
|
10
|
+
|
11
|
+
# Extract a list of PDFs as rasterized page images, according to the
|
12
|
+
# configuration in options.
|
13
|
+
def extract(pdfs, options)
|
14
|
+
@pdfs = [pdfs].flatten
|
15
|
+
extract_options(options)
|
16
|
+
@pdfs.each do |pdf|
|
17
|
+
previous = nil
|
18
|
+
@sizes.each_with_index do |size, i|
|
19
|
+
@formats.each {|format| convert(pdf, size, format, previous) }
|
20
|
+
previous = size if @rolling
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Convert a single PDF into page images at the specified size and format.
|
26
|
+
# If `--rolling`, and we have a previous image at a larger size to work with,
|
27
|
+
# we simply downsample that image, instead of re-rendering the entire PDF.
|
28
|
+
# Now we generate one page at a time, a counterintuitive optimization
|
29
|
+
# suggested by the GraphicsMagick list, that seems to work quite well.
|
30
|
+
# Render `pdf` into one image per page in `format`, written to the directory
# chosen by `directory_for(size)`.
#
# pdf      - path to the PDF file.
# size     - resize geometry, or nil for no resize.
# format   - image format / file extension.
# previous - when set, the images already produced for that size are copied
#            and resized in place with `mogrify` instead of re-rendering.
#
# Raises ExtractionFailed with the tool's output when a command exits non-zero.
def convert(pdf, size, format, previous=nil)
  tempdir = Dir.mktmpdir
  basename = File.basename(pdf, File.extname(pdf))
  directory = directory_for(size)
  pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
  escaped_pdf = ESCAPE[pdf]
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.mkdir_p(directory) unless File.exist?(directory)
  common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
  if previous
    FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
    result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 #{"gm" unless ENV["toolchain"] == "imagemagick"} mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
    raise ExtractionFailed, result if $? != 0
  else
    page_list(pages).each do |page|
      out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
      # Page numbers are 1-based; the [n] selector passed to convert is 0-based.
      cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 #{"gm" unless ENV["toolchain"] == "imagemagick"} convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
    end
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised; don't mask that error.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
|
53
|
+
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Extract the relevant GraphicsMagick options from the options hash.
|
58
|
+
def extract_options(options)
|
59
|
+
@output = options[:output] || '.'
|
60
|
+
@pages = options[:pages]
|
61
|
+
@density = options[:density] || DEFAULT_DENSITY
|
62
|
+
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
63
|
+
@sizes = [options[:size]].flatten.compact
|
64
|
+
@sizes = [nil] if @sizes.empty?
|
65
|
+
@rolling = !!options[:rolling]
|
66
|
+
end
|
67
|
+
|
68
|
+
# If there's only one size requested, generate the images directly into
|
69
|
+
# the output directory. Multiple sizes each get a directory of their own.
|
70
|
+
def directory_for(size)
|
71
|
+
path = @sizes.length == 1 ? @output : File.join(@output, size)
|
72
|
+
File.expand_path(path)
|
73
|
+
end
|
74
|
+
|
75
|
+
# Generate the resize argument.
|
76
|
+
def resize_arg(size)
|
77
|
+
size.nil? ? '' : "-resize #{size}"
|
78
|
+
end
|
79
|
+
|
80
|
+
# Generate the appropriate quality argument for the image format.
|
81
|
+
def quality_arg(format)
|
82
|
+
case format.to_s
|
83
|
+
when /jpe?g/ then "-quality 85"
|
84
|
+
when /png/ then "-quality 100"
|
85
|
+
else ""
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Generate the expanded list of requested page numbers.
|
90
|
+
# Expand a page specification such as "1-3,7" into a sorted array of unique
# page numbers ([1, 2, 3, 7]).
#
# pages - String of comma-separated page numbers and/or "first-last" ranges.
#
# Returns an Array of Integers.
def page_list(pages)
  pages.split(',').flat_map { |range|
    if range.include?('-')
      bounds = range.split('-')
      # Range#to_a over integer bounds already yields Integers.
      (bounds.first.to_i..bounds.last.to_i).to_a
    else
      range.to_i
    end
  }.uniq.sort
end
|
100
|
+
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdfinfo** in order to extract information about a PDF file.
|
4
|
+
class InfoExtractor
|
5
|
+
|
6
|
+
# Regex matchers for different bits of information.
|
7
|
+
MATCHERS = {
|
8
|
+
:author => /^Author:\s+([^\n]+)/,
|
9
|
+
:date => /^CreationDate:\s+([^\n]+)/,
|
10
|
+
:creator => /^Creator:\s+([^\n]+)/,
|
11
|
+
:keywords => /^Keywords:\s+([^\n]+)/,
|
12
|
+
:producer => /^Producer:\s+([^\n]+)/,
|
13
|
+
:subject => /^Subject:\s+([^\n]+)/,
|
14
|
+
:title => /^Title:\s+([^\n]+)/,
|
15
|
+
:length => /^Pages:\s+([^\n]+)/,
|
16
|
+
}
|
17
|
+
|
18
|
+
# Pull out a single datum from a pdf.
|
19
|
+
def extract(key, pdfs, opts)
|
20
|
+
extract_all(pdfs, opts)[key]
|
21
|
+
end
|
22
|
+
|
23
|
+
def extract_all(pdfs, opts)
|
24
|
+
pdf = [pdfs].flatten.first
|
25
|
+
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
26
|
+
result = `#{cmd}`.chomp
|
27
|
+
raise ExtractionFailed, result if $? != 0
|
28
|
+
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
|
+
if String.method_defined?(:encode)
|
30
|
+
result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
|
31
|
+
else
|
32
|
+
require 'iconv' unless defined?(Iconv)
|
33
|
+
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
34
|
+
result = ic.iconv(result)
|
35
|
+
end
|
36
|
+
info = {}
|
37
|
+
MATCHERS.each do |key, matcher|
|
38
|
+
match = result.match(matcher)
|
39
|
+
answer = match && match[1]
|
40
|
+
if answer
|
41
|
+
answer = answer.to_i if key == :length
|
42
|
+
info[key] = answer
|
43
|
+
end
|
44
|
+
end
|
45
|
+
info
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdftk** in order to create bursted single pages from
|
4
|
+
# a PDF document.
|
5
|
+
class PageExtractor
|
6
|
+
|
7
|
+
# Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
|
8
|
+
# Burst each PDF into single-page files named `<pdfname>_<n>.pdf` inside the
# output directory (opts[:output], defaulting to '.').
#
# pdfs - a path or (possibly nested) array of paths.
# opts - options hash; only :output is read (via extract_options).
#
# Raises ExtractionFailed with the tool's output when the command exits non-zero.
def extract(pdfs, opts)
  extract_options opts
  [pdfs].flatten.each do |pdf|
    pdf_name = File.basename(pdf, File.extname(pdf))
    page_path = File.join(@output, "#{pdf_name}_%d.pdf")
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    FileUtils.mkdir_p @output unless File.exist?(@output)

    cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
      "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
    else
      "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
    end
    result = `#{cmd}`.chomp
    # Capture the exit status right away so the cleanup below cannot affect it.
    status = $?
    FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
    raise ExtractionFailed, result if status != 0
    result
  end
end
|
26
|
+
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def extract_options(options)
|
31
|
+
@output = options[:output] || '.'
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
|
3
|
+
module Docsplit
|
4
|
+
class PdfExtractor
|
5
|
+
@@executable = nil
|
6
|
+
@@version_string = nil
|
7
|
+
|
8
|
+
# Provide a set of helper functions to determine the OS.
|
9
|
+
HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
|
10
|
+
def windows?
|
11
|
+
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
12
|
+
end
|
13
|
+
def osx?
|
14
|
+
!!HOST_OS.match(/darwin/i)
|
15
|
+
end
|
16
|
+
def linux?
|
17
|
+
!!HOST_OS.match(/linux/i)
|
18
|
+
end
|
19
|
+
|
20
|
+
# The first line of the help output holds the name and version number
|
21
|
+
# of the office software to be used for extraction.
|
22
|
+
def version_string
|
23
|
+
unless @@version_string
|
24
|
+
null = windows? ? "NUL" : "/dev/null"
|
25
|
+
@@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
|
26
|
+
if !!@@version_string.match(/[0-9]*/)
|
27
|
+
@@version_string = `#{office_executable} --version`.split("\n").first
|
28
|
+
end
|
29
|
+
end
|
30
|
+
@@version_string
|
31
|
+
end
|
32
|
+
def libre_office?
|
33
|
+
!!version_string.match(/^LibreOffice/)
|
34
|
+
end
|
35
|
+
def open_office?
|
36
|
+
!!version_string.match(/^OpenOffice.org/)
|
37
|
+
end
|
38
|
+
|
39
|
+
# A set of default locations to search for office software
|
40
|
+
# These have been extracted from JODConverter. Each listed
|
41
|
+
# path should contain a directory "program" which in turn
|
42
|
+
# contains the "soffice" executable.
|
43
|
+
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
|
44
|
+
def office_search_paths
|
45
|
+
if windows?
|
46
|
+
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
47
|
+
program_files_path = ENV["CommonProgramFiles"]
|
48
|
+
search_paths = office_names.map{ |program| File.join(program_files_path, program) }
|
49
|
+
elsif osx?
|
50
|
+
search_paths = %w(
|
51
|
+
/Applications/LibreOffice.app/Contents
|
52
|
+
/Applications/OpenOffice.org.app/Contents
|
53
|
+
)
|
54
|
+
else # probably linux/unix
|
55
|
+
# heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
|
56
|
+
search_paths = %w(
|
57
|
+
/usr/lib/libreoffice
|
58
|
+
/usr/lib64/libreoffice
|
59
|
+
/opt/libreoffice
|
60
|
+
/usr/lib/openoffice
|
61
|
+
/usr/lib64/openoffice
|
62
|
+
/opt/openoffice.org3
|
63
|
+
/app/vendor/libreoffice
|
64
|
+
)
|
65
|
+
end
|
66
|
+
search_paths
|
67
|
+
end
|
68
|
+
|
69
|
+
# Identify the path to a working office executable.
|
70
|
+
# Identify the path to a working office executable.
#
# Searches ENV['OFFICE_PATH'] (when set) and then the default locations from
# office_search_paths. The first match is stored in @@executable, which is
# kept for later calls.
#
# Raises ArgumentError if OFFICE_PATH is set but does not exist.
# Raises OfficeNotFound if no executable can be located.
def office_executable
  paths = office_search_paths

  # If an OFFICE_PATH has been specified on the commandline
  # raise an error if that path isn't valid, otherwise, add
  # it to the front of our search paths.
  if ENV['OFFICE_PATH']
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
    paths.unshift(ENV['OFFICE_PATH'])
  end

  # The location of the office executable is OS dependent
  path_pieces = ["soffice"]
  if windows?
    path_pieces += [["program", "soffice.bin"]]
  elsif osx?
    path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
  else
    path_pieces += [["program", "soffice"]]
  end

  # Search for the first suitable office executable
  # and stop as soon as one is found.
  paths.each do |path|
    if File.exist? path
      # A search path that is a plain file is taken to be the executable itself.
      @@executable ||= path unless File.directory? path
      path_pieces.each do |pieces|
        check_path = File.join(path, pieces)
        @@executable ||= check_path if File.exist? check_path
      end
    end
    break if @@executable
  end
  raise OfficeNotFound, "No office software found" unless @@executable
  @@executable
end
|
106
|
+
|
107
|
+
# Used to specify the office location for JODConverter
|
108
|
+
def office_path
|
109
|
+
File.dirname(File.dirname(office_executable))
|
110
|
+
end
|
111
|
+
|
112
|
+
# Convert documents to PDF.
|
113
|
+
# Convert documents to PDF, writing `<basename>.pdf` into opts[:output]
# (default '.').
#
# Image formats listed in GM_FORMATS are converted with GraphicsMagick, or
# with ImageMagick's `convert` when ENV["toolchain"] == "imagemagick".
# Everything else goes through LibreOffice when detected, otherwise through
# JODConverter.
#
# Raises ExtractionFailed when the LibreOffice command exits non-zero
# (run_jod raises it as well).
def extract(docs, opts)
  out = opts[:output] || '.'
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

    if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
      # Run the conversion once, honouring the configured toolchain. An
      # additional hard-coded `gm convert` used to run first, ignoring the
      # toolchain setting and doing the same work twice.
      `#{"gm" unless ENV["toolchain"] == "imagemagick"} convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
    else
      if libre_office?
        # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
        # Use the raw path here: shell escaping belongs in command lines, not
        # in an environment variable value.
        ENV['SYSUSERCONFIG']="file://#{File.expand_path(out)}"

        options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
        cmd = "#{office_executable} #{options} 2>&1"
        result = `#{cmd}`.chomp
        raise ExtractionFailed, result if $? != 0
        true
      else # open office presumably, rely on JODConverter to figure it out.
        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
        run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
      end
    end
  end
end
|
141
|
+
|
142
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
143
|
+
|
144
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
145
|
+
|
146
|
+
HEADLESS = "-Djava.awt.headless=true"
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
# Runs a Java command, with quieted logging, and the classpath set properly.
|
151
|
+
def run_jod(command, pdfs, opts, return_output=false)
|
152
|
+
|
153
|
+
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
|
154
|
+
office = osx? ? "-Doffice.home=#{office_path}" : office_path
|
155
|
+
cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
156
|
+
result = `#{cmd}`.chomp
|
157
|
+
raise ExtractionFailed, result if $? != 0
|
158
|
+
return return_output ? (result.empty? ? nil : result) : true
|
159
|
+
end
|
160
|
+
|
161
|
+
class OfficeNotFound < StandardError; end
|
162
|
+
end
|
163
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module Docsplit
|
4
|
+
|
5
|
+
# Cleans up OCR'd text by using a series of heuristics to remove garbage
|
6
|
+
# words. Algorithms taken from:
|
7
|
+
#
|
8
|
+
# Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
|
9
|
+
# -- Taghva, Nartker, Condit, and Borsack
|
10
|
+
#
|
11
|
+
# Improving Search and Retrieval Performance through Shortening Documents,
|
12
|
+
# Detecting Garbage, and Throwing out Jargon
|
13
|
+
# -- Kulp
|
14
|
+
#
|
15
|
+
class TextCleaner
|
16
|
+
|
17
|
+
# Cached regexes we plan on using.
|
18
|
+
WORD = /\S+/
|
19
|
+
SPACE = /\s+/
|
20
|
+
NEWLINE = /[\r\n]/
|
21
|
+
ALNUM = /[a-z0-9]/i
|
22
|
+
PUNCT = /[[:punct:]]/i
|
23
|
+
REPEAT = /([^0-9])\1{2,}/
|
24
|
+
UPPER = /[A-Z]/
|
25
|
+
LOWER = /[a-z]/
|
26
|
+
ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
|
27
|
+
ALL_ALPHA = /^[a-z]+$/i
|
28
|
+
CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
|
29
|
+
VOWEL = /([aeiou]|y$)/i
|
30
|
+
CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
|
31
|
+
VOWEL_5 = /[aeiou]{5}/i
|
32
|
+
REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
|
33
|
+
SINGLETONS = /^[AaIi]$/
|
34
|
+
|
35
|
+
# For the time being, `clean` uses the regular StringScanner, and not the
|
36
|
+
# multibyte-aware version, coercing to ASCII first.
|
37
|
+
# Remove garbage words from OCR'd text and return the cleaned string.
#
# The text is coerced to ASCII first (unconvertible characters become '?'
# on the String#encode path) because the scan uses the regular, non
# multibyte-aware StringScanner. Runs of whitespace that follow other
# whitespace are dropped unless they contain a newline, and sequences
# matching REPEATED are stripped from the result.
#
# text - the String to clean; it is not modified.
def clean(text)
  if String.method_defined?(:encode)
    # Non-destructive encode: `encode!` would alter the caller's string and
    # raise FrozenError on frozen input.
    text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
  else
    require 'iconv' unless defined?(Iconv)
    text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
  end

  scanner = StringScanner.new(text)
  cleaned = []
  spaced = false
  loop do
    if space = scanner.scan(SPACE)
      cleaned.push(space) unless spaced && (space !~ NEWLINE)
      spaced = true
    elsif word = scanner.scan(WORD)
      unless garbage(word)
        cleaned.push(word)
        spaced = false
      end
    elsif scanner.eos?
      return cleaned.join('').gsub(REPEATED, '')
    end
  end
end
|
62
|
+
|
63
|
+
# Is a given word OCR garbage?
|
64
|
+
def garbage(w)
|
65
|
+
acronym = w =~ ACRONYM
|
66
|
+
|
67
|
+
# More than 30 bytes in length.
|
68
|
+
(w.length > 30) ||
|
69
|
+
|
70
|
+
# If there are three or more identical characters in a row in the string.
|
71
|
+
(w =~ REPEAT) ||
|
72
|
+
|
73
|
+
# More punctuation than alpha numerics.
|
74
|
+
(!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
|
75
|
+
|
76
|
+
# Ignoring the first and last characters in the string, if there are three or
|
77
|
+
# more different punctuation characters in the string.
|
78
|
+
(w[1...-1].scan(PUNCT).uniq.length >= 3) ||
|
79
|
+
|
80
|
+
# Five or more consecutive vowels, or five or more consecutive consonants.
|
81
|
+
((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
|
82
|
+
|
83
|
+
# Number of uppercase letters greater than lowercase letters, but the word is
|
84
|
+
# not all uppercase + punctuation.
|
85
|
+
(!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
|
86
|
+
|
87
|
+
# Single letters that are not A or I.
|
88
|
+
(w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
|
89
|
+
|
90
|
+
# All characters are alphabetic and there are 8 times more vowels than
|
91
|
+
# consonants, or 8 times more consonants than vowels.
|
92
|
+
(!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
|
93
|
+
(((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
|
94
|
+
(cons > vows * 8)))
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdftotext** and **tesseract** in order to extract text from
|
4
|
+
# PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
|
5
|
+
# forbid OCR extraction, but by default the heuristic works like this:
|
6
|
+
#
|
7
|
+
# * Check for the presence of fonts in the PDF. If no fonts are detected,
|
8
|
+
# OCR is used automatically.
|
9
|
+
# * Extract the text of each page with **pdftotext**, if the page has less
|
10
|
+
# than 100 bytes of text (a scanned image page, or a page that just
|
11
|
+
# contains a filename and a page number), then add it to the list of
|
12
|
+
# `@pages_to_ocr`.
|
13
|
+
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
|
14
|
+
#
|
15
|
+
class TextExtractor
|
16
|
+
|
17
|
+
NO_TEXT_DETECTED = /---------\n\Z/
|
18
|
+
|
19
|
+
OCR_FLAGS = '-density 400x400 -colorspace GRAY'
|
20
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
21
|
+
|
22
|
+
MIN_TEXT_PER_PAGE = 100 # in bytes
|
23
|
+
|
24
|
+
def initialize
|
25
|
+
@pages_to_ocr = []
|
26
|
+
end
|
27
|
+
|
28
|
+
# Extract text from a list of PDFs.
|
29
|
+
# Extract text from a list of PDFs into the output directory.
#
# OCR is used for the whole document when forced, or when OCR is not
# forbidden and the PDF has no embedded text. Otherwise the embedded text is
# extracted, and any pages collected in @pages_to_ocr are then OCR'd if
# tesseract is available and OCR is not forbidden.
#
# pdfs - a path or (possibly nested) array of paths.
# opts - options hash, consumed by extract_options.
def extract(pdfs, opts)
  extract_options opts
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.mkdir_p @output unless File.exist?(@output)
  [pdfs].flatten.each do |pdf|
    @pdf_name = File.basename(pdf, File.extname(pdf))
    # 'all' expands to every page of this document.
    pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      extract_from_ocr(pdf, pages)
    else
      extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
end
|
45
|
+
|
46
|
+
# Does a PDF have any text embedded?
|
47
|
+
def contains_text?(pdf)
|
48
|
+
fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
|
49
|
+
!fonts.match(NO_TEXT_DETECTED)
|
50
|
+
end
|
51
|
+
|
52
|
+
# Extract a page range worth of text from a PDF, directly.
|
53
|
+
def extract_from_pdf(pdf, pages)
|
54
|
+
return extract_full(pdf) unless pages
|
55
|
+
pages.each {|page| extract_page(pdf, page) }
|
56
|
+
end
|
57
|
+
|
58
|
+
# Extract a page range worth of text from a PDF via OCR.
|
59
|
+
def extract_from_ocr(pdf, pages)
|
60
|
+
tempdir = Dir.mktmpdir
|
61
|
+
base_path = File.join(@output, @pdf_name)
|
62
|
+
escaped_pdf = ESCAPE[pdf]
|
63
|
+
if pages
|
64
|
+
pages.each do |page|
|
65
|
+
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
66
|
+
escaped_tiff = ESCAPE[tiff]
|
67
|
+
file = "#{base_path}_#{page}"
|
68
|
+
if ENV["toolchain"] == 'graphicsmagick'
|
69
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
70
|
+
else
|
71
|
+
run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{escaped_pdf} #{escaped_tiff}"
|
72
|
+
end
|
73
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
|
74
|
+
clean_text(file + '.txt') if @clean_ocr
|
75
|
+
FileUtils.remove_entry_secure tiff
|
76
|
+
end
|
77
|
+
else
|
78
|
+
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
79
|
+
escaped_tiff = ESCAPE[tiff]
|
80
|
+
if ENV["toolchain"] == 'graphicsmagick'
|
81
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
82
|
+
else
|
83
|
+
run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{escaped_pdf} #{escaped_tiff}"
|
84
|
+
end
|
85
|
+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
|
86
|
+
clean_text(base_path + '.txt') if @clean_ocr
|
87
|
+
end
|
88
|
+
ensure
|
89
|
+
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
def clean_text(file)
|
96
|
+
File.open(file, 'r+') do |f|
|
97
|
+
text = f.read
|
98
|
+
f.truncate(0)
|
99
|
+
f.rewind
|
100
|
+
f.write(Docsplit.clean_text(text))
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
# Run an external process and raise an exception if it fails.
|
105
|
+
def run(command)
|
106
|
+
result = `#{command}`
|
107
|
+
raise ExtractionFailed, result if $? != 0
|
108
|
+
result
|
109
|
+
end
|
110
|
+
|
111
|
+
# Extract the full contents of a pdf as a single file, directly.
|
112
|
+
def extract_full(pdf)
|
113
|
+
text_path = File.join(@output, "#{@pdf_name}.txt")
|
114
|
+
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
115
|
+
end
|
116
|
+
|
117
|
+
# Extract the contents of a single page of text, directly, adding it to
|
118
|
+
# the `@pages_to_ocr` list if the text length is inadequate.
|
119
|
+
def extract_page(pdf, page)
|
120
|
+
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
|
121
|
+
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
122
|
+
unless @forbid_ocr
|
123
|
+
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
def extract_options(options)
|
128
|
+
@output = options[:output] || '.'
|
129
|
+
@pages = options[:pages]
|
130
|
+
@force_ocr = options[:ocr] == true
|
131
|
+
@forbid_ocr = options[:ocr] == false
|
132
|
+
@clean_ocr = !(options[:clean] == false)
|
133
|
+
@language = options[:language] || 'eng'
|
134
|
+
end
|
135
|
+
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # A file starting with "%PDF-" and a version number is treated as a PDF.
    PDF_SIGNATURE = /\A\%PDF-\d+(\.\d+)?/

    # Number of leading bytes inspected when sniffing for PDF_SIGNATURE.
    PDF_HEADER_BYTES = 16

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction.
    #
    # docs - a single path or a (possibly nested) list of paths.
    #
    # Returns a flat list of paths: PDFs are passed through untouched, every
    # other document is replaced by the path of its converted copy inside the
    # 'docsplit' directory under the system temp dir.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        if is_pdf?(doc)
          doc
        else
          tempdir = File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => tempdir})
          File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
        end
      end
    end

    # Is the document a PDF? True when the extension is `.pdf` (in any case)
    # or, failing that, when the file content starts with a PDF signature.
    # Always returns true or false.
    def is_pdf?(doc)
      return true if File.extname(doc).downcase == '.pdf'
      # Read a bounded header instead of a whole "line": an empty file yields
      # nil rather than raising EOFError, and a binary file without newlines
      # is not pulled into memory in full.
      header = File.open(doc, 'rb') { |f| f.read(PDF_HEADER_BYTES) }
      !!(header && header =~ PDF_SIGNATURE)
    end

  end

  extend TransparentPDFs

end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
[
|
2
|
+
{
|
3
|
+
"name": "Portable Document Format",
|
4
|
+
"extension": "pdf",
|
5
|
+
"mediaType": "application/pdf",
|
6
|
+
"storePropertiesByFamily": {
|
7
|
+
"DRAWING": {"FilterName": "draw_pdf_Export"},
|
8
|
+
"SPREADSHEET": {"FilterName": "calc_pdf_Export"},
|
9
|
+
"PRESENTATION": {"FilterName": "impress_pdf_Export"},
|
10
|
+
"TEXT": {"FilterName": "writer_pdf_Export"}
|
11
|
+
}
|
12
|
+
},
|
13
|
+
{
|
14
|
+
"name": "Macromedia Flash",
|
15
|
+
"extension": "swf",
|
16
|
+
"mediaType": "application/x-shockwave-flash",
|
17
|
+
"storePropertiesByFamily": {
|
18
|
+
"DRAWING": {"FilterName": "draw_flash_Export"},
|
19
|
+
"PRESENTATION": {"FilterName": "impress_flash_Export"}
|
20
|
+
}
|
21
|
+
},
|
22
|
+
{
|
23
|
+
"name": "HTML",
|
24
|
+
"extension": "html",
|
25
|
+
"mediaType": "text/html",
|
26
|
+
"inputFamily": "TEXT",
|
27
|
+
"storePropertiesByFamily": {
|
28
|
+
"SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
|
29
|
+
"PRESENTATION": {"FilterName": "impress_html_Export"},
|
30
|
+
"TEXT": {"FilterName": "HTML (StarWriter)"}
|
31
|
+
}
|
32
|
+
},
|
33
|
+
{
|
34
|
+
"name": "OpenDocument Text",
|
35
|
+
"extension": "odt",
|
36
|
+
"mediaType": "application/vnd.oasis.opendocument.text",
|
37
|
+
"inputFamily": "TEXT",
|
38
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"name": "OpenOffice.org 1.0 Text Document",
|
42
|
+
"extension": "sxw",
|
43
|
+
"mediaType": "application/vnd.sun.xml.writer",
|
44
|
+
"inputFamily": "TEXT",
|
45
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
|
46
|
+
},
|
47
|
+
{
|
48
|
+
"name": "Microsoft Word",
|
49
|
+
"extension": "doc",
|
50
|
+
"mediaType": "application/msword",
|
51
|
+
"inputFamily": "TEXT",
|
52
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
|
53
|
+
},
|
54
|
+
{
|
55
|
+
"name": "Microsoft Word 2007 XML",
|
56
|
+
"extension": "docx",
|
57
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
58
|
+
"inputFamily": "TEXT"
|
59
|
+
},
|
60
|
+
{
|
61
|
+
"name": "Rich Text Format",
|
62
|
+
"extension": "rtf",
|
63
|
+
"mediaType": "text/rtf",
|
64
|
+
"inputFamily": "TEXT",
|
65
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
|
66
|
+
},
|
67
|
+
{
|
68
|
+
"name": "WordPerfect",
|
69
|
+
"extension": "wpd",
|
70
|
+
"mediaType": "application/wordperfect",
|
71
|
+
"inputFamily": "TEXT"
|
72
|
+
},
|
73
|
+
{
|
74
|
+
"name": "Plain Text",
|
75
|
+
"extension": "txt",
|
76
|
+
"mediaType": "text/plain",
|
77
|
+
"inputFamily": "TEXT",
|
78
|
+
"loadProperties": {
|
79
|
+
"FilterName": "Text (encoded)",
|
80
|
+
"FilterOptions": "utf8"
|
81
|
+
},
|
82
|
+
"storePropertiesByFamily": {"TEXT": {
|
83
|
+
"FilterName": "Text (encoded)",
|
84
|
+
"FilterOptions": "utf8"
|
85
|
+
}}
|
86
|
+
},
|
87
|
+
{
|
88
|
+
"name": "MediaWiki wikitext",
|
89
|
+
"extension": "wiki",
|
90
|
+
"mediaType": "text/x-wiki",
|
91
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
|
92
|
+
},
|
93
|
+
{
|
94
|
+
"name": "OpenDocument Spreadsheet",
|
95
|
+
"extension": "ods",
|
96
|
+
"mediaType": "application/vnd.oasis.opendocument.spreadsheet",
|
97
|
+
"inputFamily": "SPREADSHEET",
|
98
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
|
99
|
+
},
|
100
|
+
{
|
101
|
+
"name": "OpenOffice.org 1.0 Spreadsheet",
|
102
|
+
"extension": "sxc",
|
103
|
+
"mediaType": "application/vnd.sun.xml.calc",
|
104
|
+
"inputFamily": "SPREADSHEET",
|
105
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
|
106
|
+
},
|
107
|
+
{
|
108
|
+
"name": "Microsoft Excel",
|
109
|
+
"extension": "xls",
|
110
|
+
"mediaType": "application/vnd.ms-excel",
|
111
|
+
"inputFamily": "SPREADSHEET",
|
112
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
|
113
|
+
},
|
114
|
+
{
|
115
|
+
"name": "Microsoft Excel 2007 XML",
|
116
|
+
"extension": "xlsx",
|
117
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
118
|
+
"inputFamily": "SPREADSHEET"
|
119
|
+
},
|
120
|
+
{
|
121
|
+
"name": "Comma Separated Values",
|
122
|
+
"extension": "csv",
|
123
|
+
"mediaType": "text/csv",
|
124
|
+
"inputFamily": "SPREADSHEET",
|
125
|
+
"loadProperties": {
|
126
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
127
|
+
"FilterOptions": "44,34,0"
|
128
|
+
},
|
129
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
130
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
131
|
+
"FilterOptions": "44,34,0"
|
132
|
+
}}
|
133
|
+
},
|
134
|
+
{
|
135
|
+
"name": "Tab Separated Values",
|
136
|
+
"extension": "tsv",
|
137
|
+
"mediaType": "text/tab-separated-values",
|
138
|
+
"inputFamily": "SPREADSHEET",
|
139
|
+
"loadProperties": {
|
140
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
141
|
+
"FilterOptions": "9,34,0"
|
142
|
+
},
|
143
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
144
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
145
|
+
"FilterOptions": "9,34,0"
|
146
|
+
}}
|
147
|
+
},
|
148
|
+
{
|
149
|
+
"name": "OpenDocument Presentation",
|
150
|
+
"extension": "odp",
|
151
|
+
"mediaType": "application/vnd.oasis.opendocument.presentation",
|
152
|
+
"inputFamily": "PRESENTATION",
|
153
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
|
154
|
+
},
|
155
|
+
{
|
156
|
+
"name": "OpenOffice.org 1.0 Presentation",
|
157
|
+
"extension": "sxi",
|
158
|
+
"mediaType": "application/vnd.sun.xml.impress",
|
159
|
+
"inputFamily": "PRESENTATION",
|
160
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
|
161
|
+
},
|
162
|
+
{
|
163
|
+
"name": "Microsoft PowerPoint",
|
164
|
+
"extension": "ppt",
|
165
|
+
"mediaType": "application/vnd.ms-powerpoint",
|
166
|
+
"inputFamily": "PRESENTATION",
|
167
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
|
168
|
+
},
|
169
|
+
{
|
170
|
+
"name": "Microsoft PowerPoint 2007 XML",
|
171
|
+
"extension": "pptx",
|
172
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
173
|
+
"inputFamily": "PRESENTATION"
|
174
|
+
},
|
175
|
+
{
|
176
|
+
"name": "OpenDocument Drawing",
|
177
|
+
"extension": "odg",
|
178
|
+
"mediaType": "application/vnd.oasis.opendocument.graphics",
|
179
|
+
"inputFamily": "DRAWING",
|
180
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
|
181
|
+
},
|
182
|
+
{
|
183
|
+
"name": "Scalable Vector Graphics",
|
184
|
+
"extension": "svg",
|
185
|
+
"mediaType": "image/svg+xml",
|
186
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
|
187
|
+
},
|
188
|
+
{
|
189
|
+
"name": "Portable Network Graphic",
|
190
|
+
"extension": "png",
|
191
|
+
"mediaType": "image/png",
|
192
|
+
"storePropertiesByFamily": {
|
193
|
+
"DRAWING": {"FilterName": "draw_png_Export"},
|
194
|
+
"PRESENTATION": {"FilterName": "impress_png_Export"}
|
195
|
+
}
|
196
|
+
},
|
197
|
+
{
|
198
|
+
"name": "Graphics Interchange Format",
|
199
|
+
"extension": "gif",
|
200
|
+
"mediaType": "image/gif",
|
201
|
+
"storePropertiesByFamily": {
|
202
|
+
"DRAWING": {"FilterName": "draw_gif_Export"},
|
203
|
+
"PRESENTATION": {"FilterName": "impress_gif_Export"}
|
204
|
+
}
|
205
|
+
},
|
206
|
+
{
|
207
|
+
"name": "Joint Photographic Experts Group",
|
208
|
+
"extension": "jpg",
|
209
|
+
"mediaType": "image/jpeg",
|
210
|
+
"storePropertiesByFamily": {
|
211
|
+
"DRAWING": {"FilterName": "draw_jpg_Export"},
|
212
|
+
"PRESENTATION": {"FilterName": "impress_jpg_Export"}
|
213
|
+
}
|
214
|
+
},
|
215
|
+
{
|
216
|
+
"name": "Windows Bitmap",
|
217
|
+
"extension": "bmp",
|
218
|
+
"mediaType": "image/bmp",
|
219
|
+
"storePropertiesByFamily": {
|
220
|
+
"DRAWING": {"FilterName": "draw_bmp_Export"},
|
221
|
+
"PRESENTATION": {"FilterName": "impress_bmp_Export"}
|
222
|
+
}
|
223
|
+
},
|
224
|
+
{
|
225
|
+
"name": "Tagged Image File Format",
|
226
|
+
"extension": "tif",
|
227
|
+
"mediaType": "image/tiff",
|
228
|
+
"storePropertiesByFamily": {
|
229
|
+
"DRAWING": {"FilterName": "draw_tif_Export"},
|
230
|
+
"PRESENTATION": {"FilterName": "impress_tif_Export"}
|
231
|
+
}
|
232
|
+
}
|
233
|
+
]
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
.level=WARNING
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: concerto_docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeremy Ashkenas
|
8
|
+
- Samuel Clay
|
9
|
+
- Ted Han
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-05-28 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: |2
|
16
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
17
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
18
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
19
|
+
metadata (title, author, number of pages...)
|
20
|
+
email: opensource@documentcloud.org
|
21
|
+
executables:
|
22
|
+
- docsplit
|
23
|
+
extensions: []
|
24
|
+
extra_rdoc_files: []
|
25
|
+
files:
|
26
|
+
- lib/docsplit/command_line.rb
|
27
|
+
- lib/docsplit/image_extractor.rb
|
28
|
+
- lib/docsplit/info_extractor.rb
|
29
|
+
- lib/docsplit/page_extractor.rb
|
30
|
+
- lib/docsplit/pdf_extractor.rb
|
31
|
+
- lib/docsplit/text_cleaner.rb
|
32
|
+
- lib/docsplit/text_extractor.rb
|
33
|
+
- lib/docsplit/transparent_pdfs.rb
|
34
|
+
- lib/docsplit.rb
|
35
|
+
- bin/docsplit
|
36
|
+
- vendor/conf/document-formats.js
|
37
|
+
- vendor/jodconverter/commons-cli-1.1.jar
|
38
|
+
- vendor/jodconverter/commons-io-1.4.jar
|
39
|
+
- vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
|
40
|
+
- vendor/jodconverter/json-20090211.jar
|
41
|
+
- vendor/jodconverter/juh-3.2.1.jar
|
42
|
+
- vendor/jodconverter/jurt-3.2.1.jar
|
43
|
+
- vendor/jodconverter/ridl-3.2.1.jar
|
44
|
+
- vendor/jodconverter/unoil-3.2.1.jar
|
45
|
+
- vendor/logging.properties
|
46
|
+
- LICENSE
|
47
|
+
- README
|
48
|
+
homepage: http://documentcloud.github.com/docsplit/
|
49
|
+
licenses:
|
50
|
+
- MIT
|
51
|
+
metadata: {}
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubyforge_project: docsplit
|
68
|
+
rubygems_version: 2.0.14
|
69
|
+
signing_key:
|
70
|
+
specification_version: 4
|
71
|
+
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
72
|
+
test_files: []
|