mateusmaso-docsplit 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
4
+
5
+ Permission is hereby granted, free of charge, to any person
6
+ obtaining a copy of this software and associated documentation
7
+ files (the "Software"), to deal in the Software without
8
+ restriction, including without limitation the rights to use,
9
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the
11
+ Software is furnished to do so, subject to the following
12
+ conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point. Loads the CommandLine class that lives in the
# lib directory next to this script and instantiates it.
command_line_lib = "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
require command_line_lib

Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,24 @@
1
# Packaging metadata for the mateusmaso-docsplit gem.
Gem::Specification.new do |spec|
  # Identity of the package.
  spec.name     = 'mateusmaso-docsplit'
  spec.version  = '0.6.4'
  spec.date     = '2013-02-05'
  spec.homepage = "http://github.com/mateusmaso/docsplit"

  # Human-readable description shown by the gem tooling.
  spec.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  spec.description = <<-EOS
    Docsplit is a command-line utility and Ruby library for splitting apart
    documents into their component parts: searchable UTF-8 plain text, page
    images or thumbnails in any format, PDFs, single pages, and document
    metadata (title, author, number of pages...)
  EOS

  # Authorship and contact details.
  spec.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
  spec.email             = 'jeremy@documentcloud.org'
  spec.rubyforge_project = 'docsplit'

  # Where the library code and the executable live.
  spec.require_paths = ['lib']
  spec.executables   = ['docsplit']

  # Glob patterns (relative to this gemspec) for every file that is packaged.
  packaged_patterns = ['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                       'docsplit.gemspec', 'LICENSE', 'README']
  spec.files = Dir[*packaged_patterns]
end
@@ -0,0 +1,122 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
  docsplit COMMAND [OPTIONS] path/to/doc.pdf
  Main commands:
    pages, images, text, pdf.
  Metadata commands:
    author, date, creator, keywords, producer, subject, title, length.

Example:
  docsplit images --size 700x --format jpg document.pdf

Dependencies:
  Ruby, Java, A working GraphicsMagick (gm) command,
  and a headless OpenOffice server for non-PDF documents.

Options:
  (size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: the options are
    # parsed (and removed from ARGV) first, the next remaining argument is
    # taken as the command, and whatever is left is handed to the extractor
    # as the list of documents.
    def initialize
      parse_options
      cmd      = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions.
    # Unknown (or missing) commands print the usage message. An
    # ExtractionFailed error is reported on standard output and the process
    # exits with status 1.
    def run
      case @command
      when :images  then Docsplit.extract_images(ARGV, @options)
      when :pages   then Docsplit.extract_pages(ARGV, @options)
      when :text    then Docsplit.extract_text(ARGV, @options)
      when :pdf     then Docsplit.extract_pdf(ARGV, @options)
      else
        if METADATA_KEYS.include?(@command)
          # Only whitelisted metadata keys reach `send`.
          value = Docsplit.send("extract_#{@command}", ARGV, @options)
          puts value unless value.nil?
        else
          usage
        end
      end
    rescue ExtractionFailed => e
      puts e.message.chomp
      exit(1)
    end

    # Print out the usage help message, then exit.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Fills
    # @options with values formatted for the Ruby API and strips the parsed
    # switches out of ARGV.
    #
    # Every value-taking switch declares its argument as optional, so the
    # block may receive nil (e.g. a trailing `--size`). List-valued switches
    # therefore only split when a value was actually supplied.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
          @options[:density] = d
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do
          @options[:clean] = false
        end
        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
          @options[:language] = l
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the common ancestor of InvalidOption, AmbiguousOption,
        # NeedlessArgument and friends; report them all the same way.
        puts e.message
        exit(1)
      end
    end

  end

end
@@ -0,0 +1,103 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options. Every requested size is rendered in every
    # requested format; with `:rolling`, each size after the first is derived
    # from the images of the size before it.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    #
    # Raises ExtractionFailed with the tool's output when `gm` exits non-zero.
    # Every value interpolated into the shell command is shell-escaped.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      directory = directory_for(size)
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      env    = "MAGICK_TMPDIR=#{ESCAPE[tempdir]} OMP_NUM_THREADS=2"
      common = "#{MEMORY_ARGS} -density #{ESCAPE[@density.to_s]} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        # The wildcard is escaped on purpose: it reaches gm literally, exactly
        # as it did when the argument was wrapped in double quotes.
        glob   = ESCAPE["#{directory}/*.#{format}"]
        result = `#{env} gm mogrify #{common} -unsharp 0x0.5+0.75 #{glob} 2>&1`.chomp
        raise ExtractionFailed, result unless $?.success?
      else
        basename    = File.basename(pdf, File.extname(pdf))
        escaped_pdf = ESCAPE[pdf]
        # Only look up the page count when pages are actually rendered.
        pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
        page_list(pages).each do |page|
          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
          # gm addresses pages with a zero-based index, hence `page - 1`.
          cmd = "#{env} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result unless $?.success?
        end
      end
    ensure
      # tempdir is nil when Dir.mktmpdir itself failed.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash,
    # applying defaults for output directory, density and format.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the (shell-escaped) resize argument, or an empty string when
    # no size was requested.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{ESCAPE[size.to_s]}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,7": comma-separated entries, each either a single page or an
    # inclusive "first-last" range. The result is sorted and de-duplicated.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          bounds = range.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,39 @@
1
module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # Regex matchers for different bits of information. Each captures the
    # remainder of the corresponding line of pdfinfo output.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Pull out a single datum from a pdf.
    #
    # key  - one of the MATCHERS keys (ArgumentError otherwise).
    # pdfs - a path or list of paths; only the first one is inspected.
    # opts - accepted for signature parity with the other extractors; unused.
    #
    # Returns the matched String (an Integer for :length), or nil when the
    # field is absent from the pdfinfo output. Raises ExtractionFailed with
    # the tool's output when pdfinfo exits non-zero.
    def extract(key, pdfs, opts)
      matcher = MATCHERS[key]
      raise ArgumentError, "unknown metadata key: #{key.inspect}" unless matcher
      pdf = [pdfs].flatten.first
      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      match  = sanitize(result).match(matcher)
      answer = match && match[1]
      answer = answer.to_i if answer && key == :length
      answer
    end


    private

    # Return a copy of text, treated as UTF-8, with invalid byte sequences
    # removed or replaced so that regex matching cannot raise on it.
    # Supports Ruby 2.1+ (String#scrub), 1.9/2.0 (String#encode) and
    # 1.8 (Iconv).
    def sanitize(text)
      if text.respond_to?(:scrub)
        text.dup.force_encoding('UTF-8').scrub
      elsif text.respond_to?(:encode)
        # Same-encoding conversion is a no-op on these Rubies, so round-trip
        # through UTF-16 to force the replacement to happen.
        text.encode('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace).encode('UTF-8')
      else
        require 'iconv' unless defined?(Iconv)
        Iconv.new('UTF-8', 'UTF-8//IGNORE').iconv(text)
      end
    end

  end

end
@@ -0,0 +1,36 @@
1
module Docsplit

  # Delegates to **pdftailor** (when available) or **pdftk** in order to
  # create bursted single pages from a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # inside the configured output directory (created if missing).
    #
    # Raises ExtractionFailed with the tool's output when the command exits
    # non-zero. Returns the flattened list of input pdfs.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        FileUtils.mkdir_p @output unless File.exist?(@output)

        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
        else
          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
        end
        result = `#{cmd}`.chomp
        # Capture the exit status before doing any cleanup work.
        succeeded = $?.success?
        # Remove the stray report file from the working directory, if one
        # was left behind (presumably by pdftk's burst - verify).
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result unless succeeded
      end
    end


    private

    # Read the output directory from the options hash, defaulting to the
    # current directory.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,94 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Return a cleaned copy of text with garbage words dropped.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first. Runs of whitespace
    # that follow a dropped word are dropped as well, unless they contain a
    # line break, and long runs of very short tokens (REPEATED) are removed
    # from the joined result.
    def clean(text)
      scanner = StringScanner.new(to_ascii(text))
      cleaned = []
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, and a falsy value otherwise.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 characters in length (bytes, once coerced to ASCII).
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end


    private

    # Coerce text to ASCII. Iconv (with transliteration) is used whenever it
    # can be loaded; where it is not available, fall back to String#encode,
    # replacing every character that has no ASCII form with '?'.
    def to_ascii(text)
      require 'iconv' unless defined?(Iconv)
      Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
    rescue LoadError
      text.encode('US-ASCII', :invalid => :replace, :undef => :replace, :replace => '?')
    end

  end

end
@@ -0,0 +1,130 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs into the configured output directory
    # (created if missing), choosing between direct extraction and OCR per
    # the heuristic described on the class.
    def extract(pdfs, opts)
      extract_options opts
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # Page numbers queued for OCR belong to this document only.
        @pages_to_ocr = []
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded? Decided from the output of
    # **pdffonts**: output ending in the bare header separator means no fonts
    # were listed.
    def contains_text?(pdf)
      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly. Without a page
    # list the whole document is written to a single file.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR: each page (or
    # the whole document when pages is nil) is rasterized to a temporary TIFF
    # with GraphicsMagick and then fed to tesseract. The temporary directory
    # is always removed afterwards.
    def extract_from_ocr(pdf, pages)
      tempdir  = Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      escaped_pdf = ESCAPE[pdf]
      env      = "MAGICK_TMPDIR=#{ESCAPE[tempdir]} OMP_NUM_THREADS=2"
      language = ESCAPE[@language.to_s]
      if pages
        pages.each do |page|
          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          escaped_tiff = ESCAPE[tiff]
          file = "#{base_path}_#{page}"
          # gm addresses pages with a zero-based index, hence `page - 1`.
          run "#{env} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{language} 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff = "#{tempdir}/#{@pdf_name}.tif"
        escaped_tiff = ESCAPE[tiff]
        run "#{env} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{language} 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil when Dir.mktmpdir itself failed.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Rewrite a text file in place with its cleaned contents.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    # Returns the command's output.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result unless $?.success?
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate. The threshold
    # is expressed in bytes, so the file size is compared rather than the
    # character count.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.size(text_path) < MIN_TEXT_PER_PAGE
      end
    end

    # Read the extraction settings out of the options hash. `:ocr` is
    # tri-state: true forces OCR, false forbids it, anything else leaves the
    # decision to the heuristic.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
      @language   = options[:language] || 'eng'
    end

  end

end
@@ -0,0 +1,26 @@
1
module Docsplit

  # Mixin that lets the API accept non-PDF documents (docs, rtf, ppt, and so
  # on) by converting them to PDFs behind the scenes.
  module TransparentPDFs

    # Map a document or list of documents to a list of PDF paths. Paths that
    # already carry a `.pdf` extension (in any letter case) are returned
    # untouched; everything else is converted via `extract_pdf` into a
    # `docsplit` directory under the system temp dir, and the path of the
    # converted file is returned in its place.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        extension = File.extname(doc)
        next doc if extension.downcase == '.pdf'

        scratch_dir = File.join(Dir.tmpdir, 'docsplit')
        extract_pdf([doc], {:output => scratch_dir})
        File.join(scratch_dir, File.basename(doc, extension) + '.pdf')
      end
    end

  end

  extend TransparentPDFs

end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,130 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
5
# The Docsplit module is the public Ruby API. Its class-level methods hand the
# work to the extractor classes, to GraphicsMagick, or to the bundled
# JODConverter jar.
module Docsplit

  VERSION       = '0.6.4' # Keep in sync with gemspec.

  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
  ESCAPED_ROOT  = ESCAPE[ROOT]

  CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  # First office installation found on disk, if any.
  office ||= "/usr/lib/openoffice" if File.exist? '/usr/lib/openoffice'
  office ||= "/usr/lib/libreoffice" if File.exist? '/usr/lib/libreoffice'

  OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}

  # Check for all dependencies, and note their absence: a dependency is
  # marked available when an executable of that name exists on the PATH.
  dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    dirs.each do |dir|
      if File.executable?(File.join(dir, dep.to_s))
        DEPENDENCIES[dep] = true
        break
      end
    end
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Use the PageExtractor to burst a PDF into single pages.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Use the TextExtractor to write out all embedded (or OCR'd) text.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Use the ImageExtractor to rasterize a PDF into each page's image.
  # `opts[:pages]` may be a String, Range or Array; it is normalized to the
  # string form on a copy of the options, leaving the caller's hash untouched.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  # Raises ExtractionFailed when either converter exits non-zero.
  def self.extract_pdf(docs, opts={})
    out = opts[:output] || '.'
    FileUtils.mkdir_p out unless File.exist?(out)
    [docs].flatten.each do |doc|
      ext = File.extname(doc)
      basename = File.basename(doc, ext)
      escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

      # The MIME type reported by `file` decides which converter handles it.
      if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
        result = `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf 2>&1`.chomp
        raise ExtractionFailed, result unless $?.success?
      else
        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
        run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
      end
    end
  end

  # Define custom methods for each of the metadata keys that we support.
  # Each `extract_<key>` uses the InfoExtractor to return a single bit of
  # metadata.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end


  # NOTE: `private` does not apply to methods defined with `def self.`, so
  # the helpers below remain callable from outside the module.
  private

  # Runs a Java command, with quieted logging, and the classpath set properly.
  # Every entry of pdfs is shell-escaped and appended to the command.
  # Returns true, or the command's output (nil when empty) if return_output
  # is set. Raises ExtractionFailed when java exits non-zero.
  def self.run(command, pdfs, opts, return_output=false)
    pdfs    = [pdfs].flatten.map {|pdf| ESCAPE[pdf] }.join(' ')
    cmd     = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
    result  = `#{cmd}`.chomp
    raise ExtractionFailed, result unless $?.success?
    return return_output ? (result.empty? ? nil : result) : true
  end

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  # The value passed in is never modified.
  def self.normalize_value(value)
    case value
    when Range then normalize_range(value)
    when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
    else            value.to_s
    end
  end

  # Render a Range as "first-last" using the elements it actually contains,
  # so an exclusive range such as 1...4 becomes "1-3". An empty range yields
  # an empty string.
  def self.normalize_range(range)
    members = range.to_a
    members.empty? ? '' : "#{members.first}-#{members.last}"
  end

end
124
+
125
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
126
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
127
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
128
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
129
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
130
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -0,0 +1,233 @@
1
+ [
2
+ {
3
+ "name": "Portable Document Format",
4
+ "extension": "pdf",
5
+ "mediaType": "application/pdf",
6
+ "storePropertiesByFamily": {
7
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
8
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
9
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
10
+ "TEXT": {"FilterName": "writer_pdf_Export"}
11
+ }
12
+ },
13
+ {
14
+ "name": "Macromedia Flash",
15
+ "extension": "swf",
16
+ "mediaType": "application/x-shockwave-flash",
17
+ "storePropertiesByFamily": {
18
+ "DRAWING": {"FilterName": "draw_flash_Export"},
19
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
20
+ }
21
+ },
22
+ {
23
+ "name": "HTML",
24
+ "extension": "html",
25
+ "mediaType": "text/html",
26
+ "inputFamily": "TEXT",
27
+ "storePropertiesByFamily": {
28
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
29
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
30
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
31
+ }
32
+ },
33
+ {
34
+ "name": "OpenDocument Text",
35
+ "extension": "odt",
36
+ "mediaType": "application/vnd.oasis.opendocument.text",
37
+ "inputFamily": "TEXT",
38
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
39
+ },
40
+ {
41
+ "name": "OpenOffice.org 1.0 Text Document",
42
+ "extension": "sxw",
43
+ "mediaType": "application/vnd.sun.xml.writer",
44
+ "inputFamily": "TEXT",
45
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
46
+ },
47
+ {
48
+ "name": "Microsoft Word",
49
+ "extension": "doc",
50
+ "mediaType": "application/msword",
51
+ "inputFamily": "TEXT",
52
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
53
+ },
54
+ {
55
+ "name": "Microsoft Word 2007 XML",
56
+ "extension": "docx",
57
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
58
+ "inputFamily": "TEXT"
59
+ },
60
+ {
61
+ "name": "Rich Text Format",
62
+ "extension": "rtf",
63
+ "mediaType": "text/rtf",
64
+ "inputFamily": "TEXT",
65
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
66
+ },
67
+ {
68
+ "name": "WordPerfect",
69
+ "extension": "wpd",
70
+ "mediaType": "application/wordperfect",
71
+ "inputFamily": "TEXT"
72
+ },
73
+ {
74
+ "name": "Plain Text",
75
+ "extension": "txt",
76
+ "mediaType": "text/plain",
77
+ "inputFamily": "TEXT",
78
+ "loadProperties": {
79
+ "FilterName": "Text (encoded)",
80
+ "FilterOptions": "utf8"
81
+ },
82
+ "storePropertiesByFamily": {"TEXT": {
83
+ "FilterName": "Text (encoded)",
84
+ "FilterOptions": "utf8"
85
+ }}
86
+ },
87
+ {
88
+ "name": "MediaWiki wikitext",
89
+ "extension": "wiki",
90
+ "mediaType": "text/x-wiki",
91
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
92
+ },
93
+ {
94
+ "name": "OpenDocument Spreadsheet",
95
+ "extension": "ods",
96
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
97
+ "inputFamily": "SPREADSHEET",
98
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
99
+ },
100
+ {
101
+ "name": "OpenOffice.org 1.0 Spreadsheet",
102
+ "extension": "sxc",
103
+ "mediaType": "application/vnd.sun.xml.calc",
104
+ "inputFamily": "SPREADSHEET",
105
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106
+ },
107
+ {
108
+ "name": "Microsoft Excel",
109
+ "extension": "xls",
110
+ "mediaType": "application/vnd.ms-excel",
111
+ "inputFamily": "SPREADSHEET",
112
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113
+ },
114
+ {
115
+ "name": "Microsoft Excel 2007 XML",
116
+ "extension": "xlsx",
117
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118
+ "inputFamily": "SPREADSHEET"
119
+ },
120
+ {
121
+ "name": "Comma Separated Values",
122
+ "extension": "csv",
123
+ "mediaType": "text/csv",
124
+ "inputFamily": "SPREADSHEET",
125
+ "loadProperties": {
126
+ "FilterName": "Text - txt - csv (StarCalc)",
127
+ "FilterOptions": "44,34,0"
128
+ },
129
+ "storePropertiesByFamily": {"SPREADSHEET": {
130
+ "FilterName": "Text - txt - csv (StarCalc)",
131
+ "FilterOptions": "44,34,0"
132
+ }}
133
+ },
134
+ {
135
+ "name": "Tab Separated Values",
136
+ "extension": "tsv",
137
+ "mediaType": "text/tab-separated-values",
138
+ "inputFamily": "SPREADSHEET",
139
+ "loadProperties": {
140
+ "FilterName": "Text - txt - csv (StarCalc)",
141
+ "FilterOptions": "9,34,0"
142
+ },
143
+ "storePropertiesByFamily": {"SPREADSHEET": {
144
+ "FilterName": "Text - txt - csv (StarCalc)",
145
+ "FilterOptions": "9,34,0"
146
+ }}
147
+ },
148
+ {
149
+ "name": "OpenDocument Presentation",
150
+ "extension": "odp",
151
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
152
+ "inputFamily": "PRESENTATION",
153
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154
+ },
155
+ {
156
+ "name": "OpenOffice.org 1.0 Presentation",
157
+ "extension": "sxi",
158
+ "mediaType": "application/vnd.sun.xml.impress",
159
+ "inputFamily": "PRESENTATION",
160
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161
+ },
162
+ {
163
+ "name": "Microsoft PowerPoint",
164
+ "extension": "ppt",
165
+ "mediaType": "application/vnd.ms-powerpoint",
166
+ "inputFamily": "PRESENTATION",
167
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168
+ },
169
+ {
170
+ "name": "Microsoft PowerPoint 2007 XML",
171
+ "extension": "pptx",
172
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173
+ "inputFamily": "PRESENTATION"
174
+ },
175
+ {
176
+ "name": "OpenDocument Drawing",
177
+ "extension": "odg",
178
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
179
+ "inputFamily": "DRAWING",
180
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181
+ },
182
+ {
183
+ "name": "Scalable Vector Graphics",
184
+ "extension": "svg",
185
+ "mediaType": "image/svg+xml",
186
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187
+ },
188
+ {
189
+ "name": "Portable Network Graphic",
190
+ "extension": "png",
191
+ "mediaType": "image/png",
192
+ "storePropertiesByFamily": {
193
+ "DRAWING": {"FilterName": "draw_png_Export"},
194
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
195
+ }
196
+ },
197
+ {
198
+ "name": "Graphics Interchange Format",
199
+ "extension": "gif",
200
+ "mediaType": "image/gif",
201
+ "storePropertiesByFamily": {
202
+ "DRAWING": {"FilterName": "draw_gif_Export"},
203
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
204
+ }
205
+ },
206
+ {
207
+ "name": "Joint Photographic Experts Group",
208
+ "extension": "jpg",
209
+ "mediaType": "image/jpeg",
210
+ "storePropertiesByFamily": {
211
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
212
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213
+ }
214
+ },
215
+ {
216
+ "name": "Windows Bitmap",
217
+ "extension": "bmp",
218
+ "mediaType": "image/bmp",
219
+ "storePropertiesByFamily": {
220
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
221
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222
+ }
223
+ },
224
+ {
225
+ "name": "Tagged Image File Format",
226
+ "extension": "tif",
227
+ "mediaType": "image/tiff",
228
+ "storePropertiesByFamily": {
229
+ "DRAWING": {"FilterName": "draw_tif_Export"},
230
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
231
+ }
232
+ }
233
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mateusmaso-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.6.4
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeremy Ashkenas
9
+ - Samuel Clay
10
+ - Ted Han
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2013-02-05 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: jeremy@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/command_line.rb
27
+ - lib/docsplit/image_extractor.rb
28
+ - lib/docsplit/info_extractor.rb
29
+ - lib/docsplit/page_extractor.rb
30
+ - lib/docsplit/text_cleaner.rb
31
+ - lib/docsplit/text_extractor.rb
32
+ - lib/docsplit/transparent_pdfs.rb
33
+ - lib/docsplit.rb
34
+ - bin/docsplit
35
+ - vendor/conf/document-formats.js
36
+ - vendor/jodconverter/commons-cli-1.1.jar
37
+ - vendor/jodconverter/commons-io-1.4.jar
38
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
39
+ - vendor/jodconverter/json-20090211.jar
40
+ - vendor/jodconverter/juh-3.2.1.jar
41
+ - vendor/jodconverter/jurt-3.2.1.jar
42
+ - vendor/jodconverter/ridl-3.2.1.jar
43
+ - vendor/jodconverter/unoil-3.2.1.jar
44
+ - vendor/logging.properties
45
+ - docsplit.gemspec
46
+ - LICENSE
47
+ - README
48
+ homepage: http://github.com/mateusmaso/docsplit
49
+ licenses: []
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubyforge_project: docsplit
68
+ rubygems_version: 1.8.10
69
+ signing_key:
70
+ specification_version: 3
71
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
72
+ test_files: []