mateusmaso-docsplit 0.6.4

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
4
+
5
+ Permission is hereby granted, free of charge, to any person
6
+ obtaining a copy of this software and associated documentation
7
+ files (the "Software"), to deal in the Software without
8
+ restriction, including without limitation the rights to use,
9
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the
11
+ Software is furnished to do so, subject to the following
12
+ conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
4
+
5
+ Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'mateusmaso-docsplit'
3
+ s.version = '0.6.4'
4
+ s.date = '2013-02-05'
5
+
6
+ s.homepage = "http://github.com/mateusmaso/docsplit"
7
+ s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
8
+ s.description = <<-EOS
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+ EOS
14
+
15
+ s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
+ s.email = 'jeremy@documentcloud.org'
17
+ s.rubyforge_project = 'docsplit'
18
+
19
+ s.require_paths = ['lib']
20
+ s.executables = ['docsplit']
21
+
22
+ s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
23
+ 'docsplit.gemspec', 'LICENSE', 'README']
24
+ end
@@ -0,0 +1,122 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
+ module Docsplit
5
+
6
+ # A single command-line utility to separate a PDF into all its component parts.
7
+ class CommandLine
8
+
9
+ BANNER = <<-EOS
10
+ docsplit breaks apart documents into images, text, or individual pages.
11
+ It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
+
13
+ Usage:
14
+ docsplit COMMAND [OPTIONS] path/to/doc.pdf
15
+ Main commands:
16
+ pages, images, text, pdf.
17
+ Metadata commands:
18
+ author, date, creator, keywords, producer, subject, title, length.
19
+
20
+ Example:
21
+ docsplit images --size 700x --format jpg document.pdf
22
+
23
+ Dependencies:
24
+ Ruby, Java, A working GraphicsMagick (gm) command,
25
+ and a headless OpenOffice server for non-PDF documents.
26
+
27
+ Options:
28
+ (size, pages and format can take comma-separated values)
29
+
30
+ EOS
31
+
32
# Build a CommandLine straight from ARGV: the option flags are parsed (and
# removed from ARGV) first, the next remaining argument becomes the command,
# and the command is run immediately.
def initialize
  parse_options
  name = ARGV.shift
  @command = name ? name.to_sym : name
  run
end
39
+
40
# Hand the requested command over to the Docsplit Ruby API, using whatever is
# left in ARGV as the list of documents. The four main commands map directly
# onto an extractor; a metadata command prints its value when there is one;
# anything else prints the usage text. An ExtractionFailed error is reported
# on stdout and the process exits with status 1.
def run
  extractor = {
    :images => :extract_images,
    :pages  => :extract_pages,
    :text   => :extract_text,
    :pdf    => :extract_pdf
  }[@command]

  if extractor
    Docsplit.send(extractor, ARGV, @options)
  elsif METADATA_KEYS.include?(@command)
    value = Docsplit.send("extract_#{@command}", ARGV, @options)
    puts value unless value.nil?
  else
    usage
  end
rescue ExtractionFailed => e
  puts e.message.chomp
  exit(1)
end
61
+
62
# Write the option parser's help text to stdout, with a blank line before it
# and a newline after it, then exit.
def usage
  print "\n", @option_parser, "\n"
  exit
end
67
+
68
+
69
+ private
70
+
71
# Use the OptionParser library to parse out all supported options, storing
# them in @options in the shape the Ruby API expects. Recognised flags are
# removed from ARGV; everything else (the command and the documents) stays.
#
# All value-taking flags declare their argument as optional, so their blocks
# may receive nil; --size and --format only record a value when one was
# actually given. Any parse failure (unknown flag, ambiguous abbreviation,
# bad argument) prints the parser's message and exits with status 1.
def parse_options
  @options = {:ocr => :default, :clean => true}
  @option_parser = OptionParser.new do |opts|
    opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
      @options[:output] = d
    end
    opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
      @options[:pages] = p
    end
    opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
      # A bare "-s" yields nil; leave the option unset instead of crashing.
      @options[:size] = s.split(',') if s
    end
    opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
      # Same nil guard as --size.
      @options[:format] = t.split(',') if t
    end
    opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
      @options[:density] = d
    end
    opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
      @options[:ocr] = o
    end
    opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
      @options[:clean] = false
    end
    opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
      @options[:language] = l
    end
    opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
      @options[:rolling] = true
    end
    opts.on_tail('-v', '--version', 'display docsplit version') do
      puts "Docsplit version #{Docsplit::VERSION}"
      exit
    end
    opts.on_tail('-h', '--help', 'display this help message') do
      usage
    end
  end
  @option_parser.banner = BANNER
  begin
    @option_parser.parse!(ARGV)
  rescue OptionParser::ParseError => e
    # ParseError is the common superclass of InvalidOption, AmbiguousOption,
    # InvalidArgument, etc., so every user mistake gets the same treatment.
    puts e.message
    exit(1)
  end
end
119
+
120
+ end
121
+
122
+ end
@@ -0,0 +1,103 @@
1
+ module Docsplit
2
+
3
+ # Delegates to GraphicsMagick in order to convert PDF documents into
4
+ # nicely sized images.
5
+ class ImageExtractor
6
+
7
+ MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
8
+ DEFAULT_FORMAT = :png
9
+ DEFAULT_DENSITY = '150'
10
+
11
# Rasterize every PDF in the list into page images, once per requested size
# and format, as configured by the options hash. When rolling is enabled,
# each size after the first is handed the previously processed size so that
# convert can work from those images.
def extract(pdfs, options)
  @pdfs = [pdfs].flatten
  extract_options(options)
  @pdfs.each do |document|
    source_size = nil
    @sizes.each do |size|
      @formats.each do |format|
        convert(document, size, format, source_size)
      end
      source_size = size if @rolling
    end
  end
end
24
+
25
# Convert a single PDF into page images at the specified size and format.
# If `--rolling`, and we have a previous image at a larger size to work with,
# we simply downsample that image, instead of re-rendering the entire PDF.
# Now we generate one page at a time, a counterintuitive optimization
# suggested by the GraphicsMagick list, that seems to work quite well.
#
# Raises ExtractionFailed (with the command output as the message) when a
# GraphicsMagick invocation exits with a non-zero status. The temporary
# directory used for MAGICK_TMPDIR is removed on every exit path.
def convert(pdf, size, format, previous=nil)
  tempdir     = Dir.mktmpdir
  basename    = File.basename(pdf, File.extname(pdf))
  directory   = directory_for(size)
  pages       = @pages || '1-' + Docsplit.extract_length(pdf).to_s
  escaped_pdf = ESCAPE[pdf]
  # File.exist? (not the removed File.exists? alias) works on every Ruby.
  FileUtils.mkdir_p(directory) unless File.exist?(directory)
  common      = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
  if previous
    # Rolling mode: copy the previous size's images and resize them in place.
    FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
    result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
    raise ExtractionFailed, result if $? != 0
  else
    page_list(pages).each do |page|
      out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
      # GraphicsMagick page indexes are zero-based, hence page - 1.
      cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
    end
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised; don't mask that error.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
53
+
54
+
55
+ private
56
+
57
# Copy the GraphicsMagick-related settings out of the options hash into
# instance variables, applying defaults: output '.', the default density and
# format, a single nil size when none was requested, and rolling coerced to a
# strict boolean.
def extract_options(options)
  requested_sizes = [options[:size]].flatten.compact
  @output  = options[:output] || '.'
  @pages   = options[:pages]
  @density = options[:density] || DEFAULT_DENSITY
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
  @sizes   = requested_sizes.empty? ? [nil] : requested_sizes
  @rolling = options[:rolling] ? true : false
end
67
+
68
# Work out the absolute directory for images of the given size. With exactly
# one size configured the images go straight into the output directory;
# otherwise each size gets a subdirectory named after it.
def directory_for(size)
  target = @output
  target = File.join(@output, size) unless @sizes.length == 1
  File.expand_path(target)
end
74
+
75
# Build the GraphicsMagick resize flag for a size, or an empty string when no
# size was requested.
def resize_arg(size)
  return '' if size.nil?
  "-resize #{size}"
end
79
+
80
# Pick the quality flag that suits the image format: 85 for JPEG, 100 for
# PNG, and no flag at all for anything else.
def quality_arg(format)
  name = format.to_s
  if name =~ /jpe?g/
    "-quality 85"
  elsif name =~ /png/
    "-quality 100"
  else
    ""
  end
end
88
+
89
# Expand a page specification such as "1-3,7" into a sorted list of unique
# page numbers. Entries containing a dash are treated as inclusive ranges;
# every other entry is a single page.
def page_list(pages)
  numbers = []
  pages.split(',').each do |entry|
    if entry.include?('-')
      bounds = entry.split('-')
      numbers.concat((bounds.first.to_i..bounds.last.to_i).to_a)
    else
      numbers << entry.to_i
    end
  end
  numbers.uniq.sort
end
100
+
101
+ end
102
+
103
+ end
@@ -0,0 +1,39 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdfinfo** in order to extract information about a PDF file.
4
+ class InfoExtractor
5
+
6
+ # Regex matchers for different bits of information.
7
+ MATCHERS = {
8
+ :author => /^Author:\s+([^\n]+)/,
9
+ :date => /^CreationDate:\s+([^\n]+)/,
10
+ :creator => /^Creator:\s+([^\n]+)/,
11
+ :keywords => /^Keywords:\s+([^\n]+)/,
12
+ :producer => /^Producer:\s+([^\n]+)/,
13
+ :subject => /^Subject:\s+([^\n]+)/,
14
+ :title => /^Title:\s+([^\n]+)/,
15
+ :length => /^Pages:\s+([^\n]+)/,
16
+ }
17
+
18
# Pull out a single datum from a pdf by running pdfinfo on the first document
# in the list and matching its output against MATCHERS[key].
#
# Returns the captured text (an Integer for :length), or nil when the field
# is absent from the output. Raises ExtractionFailed with the command output
# when pdfinfo exits with a non-zero status. `opts` is accepted but unused.
def extract(key, pdfs, opts)
  pdf = [pdfs].flatten.first
  cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
  result = `#{cmd}`.chomp
  raise ExtractionFailed, result if $? != 0
  # Replace or drop invalid UTF-8 bytes so the regex match below cannot raise.
  if result.respond_to?(:scrub!)
    result.scrub!
  elsif String.method_defined?(:encode)
    result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  else
    # ruby 1.8: Iconv must be loaded before it can be used.
    require 'iconv' unless defined?(Iconv)
    ic = Iconv.new('UTF-8', 'UTF-8//IGNORE')
    result = ic.iconv(result)
  end
  match = result.match(MATCHERS[key])
  answer = match && match[1]
  answer = answer.to_i if answer && key == :length
  answer
end
36
+
37
+ end
38
+
39
+ end
@@ -0,0 +1,36 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdftk** in order to create bursted single pages from
4
+ # a PDF document.
5
+ class PageExtractor
6
+
7
# Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`, inside
# the configured output directory (created if missing).
#
# Uses pdftailor when it was detected in DEPENDENCIES, otherwise pdftk.
# Raises ExtractionFailed with the command output when the tool exits with a
# non-zero status.
def extract(pdfs, opts)
  extract_options opts
  [pdfs].flatten.each do |pdf|
    pdf_name = File.basename(pdf, File.extname(pdf))
    page_path = File.join(@output, "#{pdf_name}_%d.pdf")
    # File.exist? (not the removed File.exists? alias) works on every Ruby.
    FileUtils.mkdir_p @output unless File.exist?(@output)

    cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
      "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
    else
      "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
    end
    result = `#{cmd}`.chomp
    # Capture the exit status before doing any cleanup work.
    failed = $? != 0
    # Remove the stray doc_data.txt from the working directory, if one exists.
    FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
    raise ExtractionFailed, result if failed
    result
  end
end
26
+
27
+
28
+ private
29
+
30
# Remember where the burst pages should be written; defaults to the current
# directory when no :output option is given.
def extract_options(options)
  destination = options[:output]
  @output = destination ? destination : '.'
end
33
+
34
+ end
35
+
36
+ end
@@ -0,0 +1,94 @@
1
+ require 'strscan'
2
+
3
+ module Docsplit
4
+
5
+ # Cleans up OCR'd text by using a series of heuristics to remove garbage
6
+ # words. Algorithms taken from:
7
+ #
8
+ # Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
9
+ # -- Taghva, Nartker, Condit, and Borsack
10
+ #
11
+ # Improving Search and Retrieval Performance through Shortening Documents,
12
+ # Detecting Garbage, and Throwing out Jargon
13
+ # -- Kulp
14
+ #
15
+ class TextCleaner
16
+
17
+ # Cached regexes we plan on using.
18
+ WORD = /\S+/
19
+ SPACE = /\s+/
20
+ NEWLINE = /[\r\n]/
21
+ ALNUM = /[a-z0-9]/i
22
+ PUNCT = /[[:punct:]]/i
23
+ REPEAT = /([^0-9])\1{2,}/
24
+ UPPER = /[A-Z]/
25
+ LOWER = /[a-z]/
26
+ ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
27
+ ALL_ALPHA = /^[a-z]+$/i
28
+ CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
29
+ VOWEL = /([aeiou]|y$)/i
30
+ CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
31
+ VOWEL_5 = /[aeiou]{5}/i
32
+ REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
33
+ SINGLETONS = /^[AaIi]$/
34
+
35
# Remove OCR garbage from a block of text. For the time being, `clean` uses
# the regular StringScanner, and not the multibyte-aware version, coercing to
# ASCII first.
#
# The ASCII coercion uses Iconv (with transliteration) when it can be loaded.
# On Rubies where iconv is not available, it falls back to String#encode,
# which drops non-ASCII and invalid characters instead of transliterating.
#
# Words judged to be garbage are skipped; runs of whitespace left behind by a
# skipped word are collapsed unless they contain a newline. Finally, long
# runs of very short tokens (REPEATED) are stripped from the result.
def clean(text)
  text = begin
    require 'iconv' unless defined?(Iconv)
    Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
  rescue LoadError
    text.encode('US-ASCII', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
  end
  scanner = StringScanner.new(text)
  cleaned = []
  spaced  = false
  until scanner.eos?
    if (space = scanner.scan(SPACE))
      # Keep the whitespace unless we already emitted some and this run has
      # no line break worth preserving.
      cleaned.push(space) unless spaced && (space !~ NEWLINE)
      spaced = true
    elsif (word = scanner.scan(WORD))
      unless garbage(word)
        cleaned.push(word)
        spaced = false
      end
    end
  end
  cleaned.join('').gsub(REPEATED, '')
end
57
+
58
# Decide whether a single word looks like OCR garbage. Returns a truthy value
# as soon as any of the heuristics below fires, and a falsy value otherwise.
def garbage(w)
  acronym    = w =~ ACRONYM
  vowels     = w.scan(VOWEL).length
  consonants = w.scan(CONSONANT).length

  # More than 30 characters in length.
  too_long = w.length > 30

  # Three or more identical (non-digit) characters in a row.
  repeated_run = w =~ REPEAT

  # More punctuation than alphanumerics (acronyms are exempt).
  mostly_punct = !acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)

  # Three or more different punctuation characters, ignoring the first and
  # last characters of the word.
  varied_punct = w[1...-1].scan(PUNCT).uniq.length >= 3

  # Five or more consecutive vowels, or five or more consecutive consonants.
  unpronounceable = (w =~ VOWEL_5) || (w =~ CONSONANT_5)

  # More uppercase than lowercase letters, unless the word is an acronym.
  shouting = !acronym && (w.scan(UPPER).length > w.scan(LOWER).length)

  # A single letter other than A or I.
  stray_letter = w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)

  # Purely alphabetic, longer than two letters, with more than eight times as
  # many vowels as consonants or vice versa (acronyms are exempt).
  lopsided = !acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
             ((vowels > consonants * 8) || (consonants > vowels * 8))

  too_long || repeated_run || mostly_punct || varied_punct ||
    unpronounceable || shouting || stray_letter || lopsided
end
91
+
92
+ end
93
+
94
+ end
@@ -0,0 +1,130 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdftotext** and **tesseract** in order to extract text from
4
+ # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
5
+ # forbid OCR extraction, but by default the heuristic works like this:
6
+ #
7
+ # * Check for the presence of fonts in the PDF. If no fonts are detected,
8
+ # OCR is used automatically.
9
+ # * Extract the text of each page with **pdftotext**, if the page has less
10
+ # than 100 bytes of text (a scanned image page, or a page that just
11
+ # contains a filename and a page number), then add it to the list of
12
+ # `@pages_to_ocr`.
13
+ # * Re-OCR each page in the `@pages_to_ocr` list at the end.
14
+ #
15
+ class TextExtractor
16
+
17
+ NO_TEXT_DETECTED = /---------\n\Z/
18
+
19
+ OCR_FLAGS = '-density 400x400 -colorspace GRAY'
20
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
21
+
22
+ MIN_TEXT_PER_PAGE = 100 # in bytes
23
+
24
# Start out with an empty queue of pages that will need OCR.
def initialize
  @pages_to_ocr = Array.new
end
27
+
28
# Extract text from a list of PDFs into the output directory (created if
# missing).
#
# For each PDF: when OCR is forced, or is allowed and the PDF has no embedded
# text, everything goes through OCR. Otherwise text is extracted directly,
# and pages that produced too little text are re-run through OCR, provided
# OCR is allowed and tesseract was detected.
def extract(pdfs, opts)
  extract_options opts
  # File.exist? (not the removed File.exists? alias) works on every Ruby.
  FileUtils.mkdir_p @output unless File.exist?(@output)
  [pdfs].flatten.each do |pdf|
    @pdf_name = File.basename(pdf, File.extname(pdf))
    # The OCR queue is per-document: start fresh so pages queued for an
    # earlier PDF are never OCR'd against this one.
    @pages_to_ocr = []
    pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      extract_from_ocr(pdf, pages)
    else
      extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
end
45
+
46
# Report whether a PDF appears to carry embedded text, judged by whether the
# output of pdffonts ends in a bare header rule (no fonts listed).
def contains_text?(pdf)
  listing = `pdffonts #{ESCAPE[pdf]} 2>&1`
  listing.match(NO_TEXT_DETECTED).nil?
end
51
+
52
# Pull text straight out of a PDF: page by page when a list of pages is
# given, or as one whole-document file when it is not.
def extract_from_pdf(pdf, pages)
  if pages
    pages.each {|number| extract_page(pdf, number) }
  else
    extract_full(pdf)
  end
end
57
+
58
# Extract a page range worth of text from a PDF via OCR.
#
# Each page (or the whole document when `pages` is nil) is rendered to a TIFF
# with GraphicsMagick inside a temporary directory, then handed to tesseract,
# which writes `<output>/<pdf_name>[_<page>].txt`. The text is cleaned
# afterwards unless cleaning was disabled. The temporary directory is removed
# on every exit path; failures of either tool surface as ExtractionFailed via
# `run`.
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      # GraphicsMagick page indexes are zero-based, hence page - 1.
      run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      clean_text(file + '.txt') if @clean_ocr
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    # Escape the output base path just like the per-page branch does, so
    # names with spaces or shell metacharacters work here too.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    clean_text(base_path + '.txt') if @clean_ocr
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised; don't mask that error.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
83
+
84
+
85
+ private
86
+
87
# Rewrite a text file in place with its contents passed through
# Docsplit.clean_text. The file is emptied after reading and before the
# cleaned text is computed and written back.
def clean_text(file)
  File.open(file, 'r+') do |handle|
    raw = handle.read
    handle.truncate(0)
    handle.pos = 0
    handle.write(Docsplit.clean_text(raw))
  end
end
95
+
96
# Execute a shell command and hand back what it printed. A non-zero exit
# status raises ExtractionFailed carrying that output as its message.
def run(command)
  output = `#{command}`
  return output if $? == 0
  raise ExtractionFailed, output
end
102
+
103
# Write the text of the whole PDF, as UTF-8, to `<output>/<pdf_name>.txt`
# using pdftotext.
def extract_full(pdf)
  destination = File.join(@output, "#{@pdf_name}.txt")
  command = "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  run command
end
108
+
109
# Write the text of one page, as UTF-8, to `<output>/<pdf_name>_<page>.txt`
# using pdftotext. Unless OCR is forbidden, a page whose extracted text is
# shorter than MIN_TEXT_PER_PAGE is queued in `@pages_to_ocr`.
def extract_page(pdf, page)
  destination = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  return if @forbid_ocr
  @pages_to_ocr.push(page) if File.read(destination).length < MIN_TEXT_PER_PAGE
end
118
+
119
# Copy the text-extraction settings out of the options hash. OCR is forced
# only when :ocr is exactly true and forbidden only when it is exactly false;
# cleaning stays on unless :clean is exactly false; the language defaults to
# 'eng' and the output directory to '.'.
def extract_options(options)
  ocr = options[:ocr]
  @output     = options[:output] || '.'
  @pages      = options[:pages]
  @force_ocr  = ocr == true
  @forbid_ocr = ocr == false
  @clean_ocr  = options[:clean] != false
  @language   = options[:language] || 'eng'
end
127
+
128
+ end
129
+
130
+ end
@@ -0,0 +1,26 @@
1
+ module Docsplit
2
+
3
+ # Include a method to transparently convert non-PDF arguments to temporary
4
+ # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
5
+ module TransparentPDFs
6
+
7
# Map a list of documents to a list of PDF paths. Documents that already have
# a .pdf extension (any case) pass through untouched; everything else is
# converted with extract_pdf into a 'docsplit' directory under the system
# temp dir, and the path of the resulting PDF is returned in its place.
def ensure_pdfs(docs)
  [docs].flatten.map do |doc|
    extension = File.extname(doc)
    next doc if extension.downcase == '.pdf'

    scratch = File.join(Dir.tmpdir, 'docsplit')
    extract_pdf([doc], {:output => scratch})
    File.join(scratch, File.basename(doc, extension) + '.pdf')
  end
end
21
+
22
+ end
23
+
24
+ extend TransparentPDFs
25
+
26
+ end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,130 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
5
+ # The Docsplit module delegates to the Java PDF extractors.
6
+ module Docsplit
7
+
8
+ VERSION = '0.6.4' # Keep in sync with gemspec.
9
+
10
+ ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
+
12
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
+ ESCAPED_ROOT = ESCAPE[ROOT]
14
+
15
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
16
+
17
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
18
+
19
+ HEADLESS = "-Djava.awt.headless=true"
20
+
21
# Pick the first office installation directory that exists, preferring
# OpenOffice over LibreOffice. File.exist? is used because the File.exists?
# alias is no longer available on current Rubies.
office = ['/usr/lib/openoffice', '/usr/lib/libreoffice'].find {|dir| File.exist?(dir) }

# JVM flag pointing at the office installation; left empty on macOS (darwin).
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
25
+
26
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
27
+
28
+ GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
29
+
30
# External tools docsplit can use, each flagged true once found on the PATH.
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}

# Check for all dependencies, and note their absence. An unset PATH is
# treated as empty rather than crashing while the library loads.
dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
DEPENDENCIES.each_key do |dep|
  DEPENDENCIES[dep] = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
end
42
+
43
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
44
+ # broken.
45
+ class ExtractionFailed < StandardError; end
46
+
47
# Burst documents into single-page PDFs. Non-PDF inputs are converted to PDF
# first; the work itself is delegated to PageExtractor.
def self.extract_pages(pdfs, opts={})
  PageExtractor.new.extract(ensure_pdfs(pdfs), opts)
end
52
+
53
# Write out the text of each document. Non-PDF inputs are converted to PDF
# first; the work itself is delegated to TextExtractor.
def self.extract_text(pdfs, opts={})
  TextExtractor.new.extract(ensure_pdfs(pdfs), opts)
end
58
+
59
# Rasterize each document into page images. Non-PDF inputs are converted to
# PDF first; the work itself is delegated to ImageExtractor.
#
# A :pages option (Range, Array or String) is normalized to the string form
# the extractor expects. The normalized value goes into a copy of the options
# so the caller's hash is left untouched.
def self.extract_images(pdfs, opts={})
  pdfs = ensure_pdfs(pdfs)
  opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
  ImageExtractor.new.extract(pdfs, opts)
end
65
+
66
# Use JODConverter to extract the documents as PDFs.
# If the document is in an image format, use GraphicsMagick to extract the PDF.
#
# PDFs are written as `<output>/<basename>.pdf` (output defaults to '.', and
# is created if missing). Raises ExtractionFailed when either converter exits
# with a non-zero status.
def self.extract_pdf(docs, opts={})
  out = opts[:output] || '.'
  # File.exist? (not the removed File.exists? alias) works on every Ruby.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

    # `file` reports e.g. "image/png; charset=binary"; keep the MIME type only.
    if GM_FORMATS.include?(`file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0])
      # Report GraphicsMagick failures the same way the JODConverter path does.
      result = `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf 2>&1`.chomp
      raise ExtractionFailed, result if $? != 0
    else
      options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
      run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
    end
  end
end
84
+
85
# Generate one `extract_<key>` module method per supported metadata key. Each
# generated method converts its inputs to PDFs where needed and asks
# InfoExtractor for that single piece of metadata.
METADATA_KEYS.each do |key|
  definition = [
    "def self.extract_#{key}(pdfs, opts={})",
    "  pdfs = ensure_pdfs(pdfs)",
    "  InfoExtractor.new.extract(:#{key}, pdfs, opts)",
    "end"
  ].join("\n")
  instance_eval(definition)
end
95
+
96
# Convenience wrapper: run a string through a fresh TextCleaner to strip
# garbage left behind by OCR.
def self.clean_text(text)
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
100
+
101
+
102
+ private
103
+
104
# Runs a Java command, with quieted logging, and the classpath set properly.
#
# `command` is appended verbatim after the JVM flags; each entry of `pdfs` is
# shell-escaped and appended after it. `opts` is accepted but unused.
# Returns true, or, when `return_output` is set, the command's output (nil if
# it printed nothing). Raises ExtractionFailed with the output when java
# exits with a non-zero status.
def self.run(command, pdfs, opts, return_output=false)
  # Escape rather than double-quote, so quotes, `$` and backticks in a path
  # cannot break or alter the command line.
  pdfs = [pdfs].flatten.map {|pdf| ESCAPE[pdf] }.join(' ')
  cmd = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
  result = `#{cmd}`.chomp
  raise ExtractionFailed, result if $? != 0
  return return_output ? (result.empty? ? nil : result) : true
end
112
+
113
# Normalize a value in an options hash for the command line.
# Ranges look like: 1-10, Arrays like: 1,2,3. Ranges nested inside an Array
# are rendered the same way; anything else is converted with to_s. The
# argument itself is never modified.
def self.normalize_value(value)
  case value
  when Range then normalize_range(value)
  when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
  else value.to_s
  end
end

# Render a Range as "first-last", honouring an excluded end (1...4 becomes
# "1-3"). An empty range renders as an empty string.
def self.normalize_range(range)
  low, high = range.min, range.max
  low.nil? ? '' : "#{low}-#{high}"
end
122
+
123
+ end
124
+
125
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
126
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
127
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
128
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
129
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
130
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -0,0 +1,233 @@
1
+ [
2
+ {
3
+ "name": "Portable Document Format",
4
+ "extension": "pdf",
5
+ "mediaType": "application/pdf",
6
+ "storePropertiesByFamily": {
7
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
8
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
9
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
10
+ "TEXT": {"FilterName": "writer_pdf_Export"}
11
+ }
12
+ },
13
+ {
14
+ "name": "Macromedia Flash",
15
+ "extension": "swf",
16
+ "mediaType": "application/x-shockwave-flash",
17
+ "storePropertiesByFamily": {
18
+ "DRAWING": {"FilterName": "draw_flash_Export"},
19
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
20
+ }
21
+ },
22
+ {
23
+ "name": "HTML",
24
+ "extension": "html",
25
+ "mediaType": "text/html",
26
+ "inputFamily": "TEXT",
27
+ "storePropertiesByFamily": {
28
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
29
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
30
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
31
+ }
32
+ },
33
+ {
34
+ "name": "OpenDocument Text",
35
+ "extension": "odt",
36
+ "mediaType": "application/vnd.oasis.opendocument.text",
37
+ "inputFamily": "TEXT",
38
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
39
+ },
40
+ {
41
+ "name": "OpenOffice.org 1.0 Text Document",
42
+ "extension": "sxw",
43
+ "mediaType": "application/vnd.sun.xml.writer",
44
+ "inputFamily": "TEXT",
45
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
46
+ },
47
+ {
48
+ "name": "Microsoft Word",
49
+ "extension": "doc",
50
+ "mediaType": "application/msword",
51
+ "inputFamily": "TEXT",
52
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
53
+ },
54
+ {
55
+ "name": "Microsoft Word 2007 XML",
56
+ "extension": "docx",
57
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
58
+ "inputFamily": "TEXT"
59
+ },
60
+ {
61
+ "name": "Rich Text Format",
62
+ "extension": "rtf",
63
+ "mediaType": "text/rtf",
64
+ "inputFamily": "TEXT",
65
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
66
+ },
67
+ {
68
+ "name": "WordPerfect",
69
+ "extension": "wpd",
70
+ "mediaType": "application/wordperfect",
71
+ "inputFamily": "TEXT"
72
+ },
73
+ {
74
+ "name": "Plain Text",
75
+ "extension": "txt",
76
+ "mediaType": "text/plain",
77
+ "inputFamily": "TEXT",
78
+ "loadProperties": {
79
+ "FilterName": "Text (encoded)",
80
+ "FilterOptions": "utf8"
81
+ },
82
+ "storePropertiesByFamily": {"TEXT": {
83
+ "FilterName": "Text (encoded)",
84
+ "FilterOptions": "utf8"
85
+ }}
86
+ },
87
+ {
88
+ "name": "MediaWiki wikitext",
89
+ "extension": "wiki",
90
+ "mediaType": "text/x-wiki",
91
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
92
+ },
93
+ {
94
+ "name": "OpenDocument Spreadsheet",
95
+ "extension": "ods",
96
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
97
+ "inputFamily": "SPREADSHEET",
98
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
99
+ },
100
+ {
101
+ "name": "OpenOffice.org 1.0 Spreadsheet",
102
+ "extension": "sxc",
103
+ "mediaType": "application/vnd.sun.xml.calc",
104
+ "inputFamily": "SPREADSHEET",
105
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106
+ },
107
+ {
108
+ "name": "Microsoft Excel",
109
+ "extension": "xls",
110
+ "mediaType": "application/vnd.ms-excel",
111
+ "inputFamily": "SPREADSHEET",
112
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113
+ },
114
+ {
115
+ "name": "Microsoft Excel 2007 XML",
116
+ "extension": "xlsx",
117
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118
+ "inputFamily": "SPREADSHEET"
119
+ },
120
+ {
121
+ "name": "Comma Separated Values",
122
+ "extension": "csv",
123
+ "mediaType": "text/csv",
124
+ "inputFamily": "SPREADSHEET",
125
+ "loadProperties": {
126
+ "FilterName": "Text - txt - csv (StarCalc)",
127
+ "FilterOptions": "44,34,0"
128
+ },
129
+ "storePropertiesByFamily": {"SPREADSHEET": {
130
+ "FilterName": "Text - txt - csv (StarCalc)",
131
+ "FilterOptions": "44,34,0"
132
+ }}
133
+ },
134
+ {
135
+ "name": "Tab Separated Values",
136
+ "extension": "tsv",
137
+ "mediaType": "text/tab-separated-values",
138
+ "inputFamily": "SPREADSHEET",
139
+ "loadProperties": {
140
+ "FilterName": "Text - txt - csv (StarCalc)",
141
+ "FilterOptions": "9,34,0"
142
+ },
143
+ "storePropertiesByFamily": {"SPREADSHEET": {
144
+ "FilterName": "Text - txt - csv (StarCalc)",
145
+ "FilterOptions": "9,34,0"
146
+ }}
147
+ },
148
+ {
149
+ "name": "OpenDocument Presentation",
150
+ "extension": "odp",
151
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
152
+ "inputFamily": "PRESENTATION",
153
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154
+ },
155
+ {
156
+ "name": "OpenOffice.org 1.0 Presentation",
157
+ "extension": "sxi",
158
+ "mediaType": "application/vnd.sun.xml.impress",
159
+ "inputFamily": "PRESENTATION",
160
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161
+ },
162
+ {
163
+ "name": "Microsoft PowerPoint",
164
+ "extension": "ppt",
165
+ "mediaType": "application/vnd.ms-powerpoint",
166
+ "inputFamily": "PRESENTATION",
167
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168
+ },
169
+ {
170
+ "name": "Microsoft PowerPoint 2007 XML",
171
+ "extension": "pptx",
172
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173
+ "inputFamily": "PRESENTATION"
174
+ },
175
+ {
176
+ "name": "OpenDocument Drawing",
177
+ "extension": "odg",
178
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
179
+ "inputFamily": "DRAWING",
180
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181
+ },
182
+ {
183
+ "name": "Scalable Vector Graphics",
184
+ "extension": "svg",
185
+ "mediaType": "image/svg+xml",
186
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187
+ },
188
+ {
189
+ "name": "Portable Network Graphic",
190
+ "extension": "png",
191
+ "mediaType": "image/png",
192
+ "storePropertiesByFamily": {
193
+ "DRAWING": {"FilterName": "draw_png_Export"},
194
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
195
+ }
196
+ },
197
+ {
198
+ "name": "Graphics Interchange Format",
199
+ "extension": "gif",
200
+ "mediaType": "image/gif",
201
+ "storePropertiesByFamily": {
202
+ "DRAWING": {"FilterName": "draw_gif_Export"},
203
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
204
+ }
205
+ },
206
+ {
207
+ "name": "Joint Photographic Experts Group",
208
+ "extension": "jpg",
209
+ "mediaType": "image/jpeg",
210
+ "storePropertiesByFamily": {
211
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
212
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213
+ }
214
+ },
215
+ {
216
+ "name": "Windows Bitmap",
217
+ "extension": "bmp",
218
+ "mediaType": "image/bmp",
219
+ "storePropertiesByFamily": {
220
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
221
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222
+ }
223
+ },
224
+ {
225
+ "name": "Tagged Image File Format",
226
+ "extension": "tif",
227
+ "mediaType": "image/tiff",
228
+ "storePropertiesByFamily": {
229
+ "DRAWING": {"FilterName": "draw_tif_Export"},
230
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
231
+ }
232
+ }
233
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mateusmaso-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.6.4
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeremy Ashkenas
9
+ - Samuel Clay
10
+ - Ted Han
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2013-02-05 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: jeremy@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/command_line.rb
27
+ - lib/docsplit/image_extractor.rb
28
+ - lib/docsplit/info_extractor.rb
29
+ - lib/docsplit/page_extractor.rb
30
+ - lib/docsplit/text_cleaner.rb
31
+ - lib/docsplit/text_extractor.rb
32
+ - lib/docsplit/transparent_pdfs.rb
33
+ - lib/docsplit.rb
34
+ - bin/docsplit
35
+ - vendor/conf/document-formats.js
36
+ - vendor/jodconverter/commons-cli-1.1.jar
37
+ - vendor/jodconverter/commons-io-1.4.jar
38
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
39
+ - vendor/jodconverter/json-20090211.jar
40
+ - vendor/jodconverter/juh-3.2.1.jar
41
+ - vendor/jodconverter/jurt-3.2.1.jar
42
+ - vendor/jodconverter/ridl-3.2.1.jar
43
+ - vendor/jodconverter/unoil-3.2.1.jar
44
+ - vendor/logging.properties
45
+ - docsplit.gemspec
46
+ - LICENSE
47
+ - README
48
+ homepage: http://github.com/mateusmaso/docsplit
49
+ licenses: []
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubyforge_project: docsplit
68
+ rubygems_version: 1.8.10
69
+ signing_key:
70
+ specification_version: 3
71
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
72
+ test_files: []