concerto_docsplit 0.7.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a54bd6493f131da57298fd581c261a0d25913569
4
+ data.tar.gz: 7c9e99ec164c30d9d9378dea9b82e524601d2bd7
5
+ SHA512:
6
+ metadata.gz: 5e2bfc51c164e989a1295206028d8462d120ce735a7ccdc4dcf9f36835c5509aa6a6947a6e8a59184c444eb05fd927a65f8755d79af2c78ee1fdc276c929e616
7
+ data.tar.gz: 76d58902b9e279203b0ac030cb2e692984176416a397de480d373c15c5c9b57d8db143e66daaad1905e8a7ea6bc4ee3062a1843a9127cf8544aa7a661d7c06fb
data/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
4
+ Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
5
+
6
+ Permission is hereby granted, free of charge, to any person
7
+ obtaining a copy of this software and associated documentation
8
+ files (the "Software"), to deal in the Software without
9
+ restriction, including without limitation the rights to use,
10
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following
13
+ conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point: load the CommandLine class from the lib directory
# that sits beside this script's directory.
require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"

# Instantiating CommandLine parses ARGV and immediately runs the requested command.
Docsplit::CommandLine.new
@@ -0,0 +1,102 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
5
# The Docsplit module is the public Ruby API. Every `extract_*` method hands
# its work to the matching extractor class (PageExtractor, TextExtractor,
# ImageExtractor, PdfExtractor, InfoExtractor).
module Docsplit

  VERSION       = '0.7.5' # Keep in sync with gemspec.

  # Shell-escapes a single command-line argument.
  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

  # Parent directory of the directory holding this file.
  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
  ESCAPED_ROOT  = ESCAPE[ROOT]

  # Metadata fields for which an `extract_<key>` method is generated below.
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # MIME types of image documents.
  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

  # External executables; each flag is flipped to `true` below when the
  # executable is found on the PATH.
  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}

  # Check for all dependencies, and note their absence. An unset PATH is
  # treated as an empty search path rather than raising on `nil.split`.
  dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    found = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
    DEPENDENCIES[dep] = true if found
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst each PDF into single pages. Non-PDF inputs are converted first.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of each PDF. Non-PDF inputs are converted first.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize each PDF into page images. `opts[:pages]` may be a Range, an
  # Array (optionally containing Ranges) or anything responding to `to_s`.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    # Normalize into a copy so the caller's options hash is left untouched.
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  def self.extract_pdf(docs, opts={})
    PdfExtractor.new.extract(docs, opts)
  end

  # Define custom methods for each of the metadata keys that we support
  # (`extract_author`, `extract_title`, `extract_length`, ...). Each returns a
  # single bit of metadata via InfoExtractor.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Return every available piece of metadata as a Hash keyed by METADATA_KEYS.
  def self.extract_info(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    InfoExtractor.new.extract_all(pdfs, opts)
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3. Returns a comma-separated String.
  #
  # The argument is never modified. This method has always been publicly
  # callable (a bare `private` does not apply to `def self.` methods), so it is
  # left public for backward compatibility.
  def self.normalize_value(value)
    case value
    when Range then value.to_a.join(',')
    when Array then value.map {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
    else            value.to_s
    end
  end

end
95
+
96
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
97
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
98
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
99
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
100
+ require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
101
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
102
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -0,0 +1,123 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
docsplit COMMAND [OPTIONS] path/to/doc.pdf
Main commands:
pages, images, text, pdf.
Metadata commands:
author, date, creator, keywords, producer, subject, title, length.

Example:
docsplit images --size 700x --format jpg document.pdf

Dependencies:
Ruby, Java, A working GraphicsMagick (gm) command,
and a headless OpenOffice server for non-PDF documents.

Options:
(size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: options are
    # parsed (and removed) first, the next argument is the command, and the
    # remaining arguments are the documents.
    def initialize
      parse_options
      cmd      = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions. Metadata
    # commands print their value; unknown or missing commands print the usage.
    # An ExtractionFailed error is printed and the process exits with status 1.
    def run
      case @command
      when :images  then Docsplit.extract_images(ARGV, @options)
      when :pages   then Docsplit.extract_pages(ARGV, @options)
      when :text    then Docsplit.extract_text(ARGV, @options)
      when :pdf     then Docsplit.extract_pdf(ARGV, @options)
      else
        if METADATA_KEYS.include?(@command)
          value = Docsplit.send("extract_#{@command}", ARGV, @options)
          puts value unless value.nil?
        else
          usage
        end
      end
    rescue ExtractionFailed => e
      puts e.message.chomp
      exit(1)
    end

    # Print out the usage help message, then exit.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        # SIZE and FORMAT are optional arguments, so the block receives nil
        # when the flag is given bare; skip the split in that case.
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
          @options[:density] = d
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
          @options[:clean] = false
        end
        # Choosing a language also switches off cleaning of the OCR'd text.
        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
          @options[:language] = l
          @options[:clean] = false
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the common superclass of InvalidOption, AmbiguousOption,
        # NeedlessArgument, etc., so every malformed command line gets the
        # same one-line message instead of a backtrace.
        puts e.message
        exit(1)
      end
    end

  end

end
@@ -0,0 +1,103 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images. When the `toolchain` environment variable is
  # "imagemagick" the `gm` prefix is dropped and the bare `convert`/`mogrify`
  # commands are run instead.
  class ImageExtractor

    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options (:output, :pages, :density, :format, :size,
    # :rolling). Every PDF is converted once per size/format combination.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    #
    # Raises ExtractionFailed with the tool's output when a command exits
    # with a non-zero status.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      basename  = File.basename(pdf, File.extname(pdf))
      directory = directory_for(size)
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 #{toolchain_prefix} mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
        raise ExtractionFailed, result if $? != 0
      else
        # The page count is only needed when rendering from the PDF itself.
        pages       = @pages || '1-' + Docsplit.extract_length(pdf).to_s
        escaped_pdf = ESCAPE[pdf]
        page_list(pages).each do |page|
          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
          # Page numbers are 1-based; the [n] selector on the input is 0-based.
          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 #{toolchain_prefix} convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result if $? != 0
        end
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # "gm" for GraphicsMagick, or nil (interpolated as an empty string) when
    # the `toolchain` environment variable selects ImageMagick.
    def toolchain_prefix
      "gm" unless ENV["toolchain"] == "imagemagick"
    end

    # Extract the relevant GraphicsMagick options from the options hash.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the resize argument (empty when no size was requested).
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a spec such
    # as "1-3,7": sorted, with duplicates removed.
    def page_list(pages)
      pages.split(',').map { |part|
        if part.include?('-')
          bounds = part.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          part.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,50 @@
1
module Docsplit

  # Reads document metadata by running **pdfinfo** and scraping its output.
  class InfoExtractor

    # One pattern per metadata key; the first capture group holds the value.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Look up one metadata value (nil when pdfinfo did not report it).
    def extract(key, pdfs, opts)
      extract_all(pdfs, opts)[key]
    end

    # Run pdfinfo against the first PDF given and return a Hash of every
    # field it reported. :length is an Integer, everything else a String.
    # Raises ExtractionFailed with pdfinfo's output on a non-zero exit status.
    def extract_all(pdfs, opts)
      target = [pdfs].flatten.first
      output = `pdfinfo #{ESCAPE[target]} 2>&1`.chomp
      raise ExtractionFailed, output if $? != 0

      # Make sure the output is valid UTF-8 before matching against it:
      # String#encode on ruby 1.9+, iconv on ruby 1.8.
      if String.method_defined?(:encode)
        unless output.valid_encoding?
          output.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "")
        end
      else
        require 'iconv' unless defined?(Iconv)
        output = Iconv.new('UTF-8//IGNORE','UTF-8').iconv(output)
      end

      MATCHERS.inject({}) do |info, (key, pattern)|
        found = output.match(pattern)
        value = found && found[1]
        info[key] = (key == :length ? value.to_i : value) if value
        info
      end
    end

  end

end
@@ -0,0 +1,36 @@
1
module Docsplit

  # Delegates to **pdftailor** (when available) or **pdftk** in order to
  # create bursted single pages from a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # inside `opts[:output]` (default: the current directory).
    # Raises ExtractionFailed with the tool's output on a non-zero exit status.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name  = File.basename(pdf, File.extname(pdf))
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        # File.exist? rather than File.exists?, which Ruby 3.2 removed.
        FileUtils.mkdir_p @output unless File.exist?(@output)

        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
        else
          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
        end
        result = `#{cmd}`.chomp
        # Remove any doc_data.txt left in the working directory, even on failure.
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result if $? != 0
        result
      end
    end


    private

    # Read the output directory from the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,163 @@
1
+ require 'rbconfig'
2
+
3
module Docsplit
  # Converts documents to PDF. Image formats (GM_FORMATS) go through
  # GraphicsMagick/ImageMagick `convert`; everything else goes through
  # LibreOffice directly, or through JODConverter for other office suites.
  class PdfExtractor
    # Cached across instances: the located office executable and the first
    # line of its version output.
    @@executable     = nil
    @@version_string = nil

    # Provide a set of helper functions to determine the OS.
    # `defined?(RbConfig)` tests the constant itself; testing a string
    # literal is always truthy.
    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
    def windows?
      !!HOST_OS.match(/mswin|windows|cygwin/i)
    end
    def osx?
      !!HOST_OS.match(/darwin/i)
    end
    def linux?
      !!HOST_OS.match(/linux/i)
    end

    # The name and version number of the office software to be used for
    # extraction, as a String (empty when the executable printed nothing).
    def version_string
      unless @@version_string
        null         = windows? ? "NUL" : "/dev/null"
        help_line    = `#{office_executable} -h 2>#{null}`.split("\n").first
        # NOTE(review): the original guard here was `match(/[0-9]*/)`, which
        # matches every string, so `--version` has always been consulted. That
        # effective behaviour is kept; the `-h` line is now only a fallback
        # for when `--version` prints nothing, instead of crashing on nil.
        version_line = `#{office_executable} --version`.split("\n").first
        @@version_string = (version_line || help_line).to_s
      end
      @@version_string
    end
    def libre_office?
      !!version_string.match(/^LibreOffice/)
    end
    def open_office?
      !!version_string.match(/^OpenOffice.org/)
    end

    # A set of default locations to search for office software
    # These have been extracted from JODConverter. Each listed
    # path should contain a directory "program" which in turn
    # contains the "soffice" executable.
    # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
    def office_search_paths
      if windows?
        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
        # NOTE(review): this reads CommonProgramFiles, not ProgramFiles --
        # confirm that is the intended install root.
        program_files_path = ENV["CommonProgramFiles"]
        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
      elsif osx?
        search_paths = %w(
          /Applications/LibreOffice.app/Contents
          /Applications/OpenOffice.org.app/Contents
        )
      else # probably linux/unix
        # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
        search_paths = %w(
          /usr/lib/libreoffice
          /usr/lib64/libreoffice
          /opt/libreoffice
          /usr/lib/openoffice
          /usr/lib64/openoffice
          /opt/openoffice.org3
          /app/vendor/libreoffice
        )
      end
      search_paths
    end

    # Identify the path to a working office executable.
    # Raises ArgumentError when OFFICE_PATH points nowhere, and OfficeNotFound
    # when no candidate exists.
    def office_executable
      paths = office_search_paths

      # If an OFFICE_PATH has been specified on the commandline
      # raise an error if that path isn't valid, otherwise, add
      # it to the front of our search paths.
      if ENV['OFFICE_PATH']
        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
        paths.unshift(ENV['OFFICE_PATH'])
      end

      # The location of the office executable is OS dependent
      path_pieces = ["soffice"]
      if windows?
        path_pieces += [["program", "soffice.bin"]]
      elsif osx?
        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
      else
        path_pieces += [["program", "soffice"]]
      end

      # Search for the first suitable office executable
      # and short circuit once an executable is found.
      paths.each do |path|
        if File.exist? path
          # A search path that is itself a file is taken as the executable.
          @@executable ||= path unless File.directory? path
          path_pieces.each do |pieces|
            check_path = File.join(path, pieces)
            @@executable ||= check_path if File.exist? check_path
          end
        end
        break if @@executable
      end
      raise OfficeNotFound, "No office software found" unless @@executable
      @@executable
    end

    # Used to specify the office location for JODConverter
    def office_path
      File.dirname(File.dirname(office_executable))
    end

    # Convert documents to PDF, writing `<basename>.pdf` into `opts[:output]`
    # (default: the current directory).
    def extract(docs, opts)
      out = opts[:output] || '.'
      FileUtils.mkdir_p out unless File.exist?(out)
      [docs].flatten.each do |doc|
        ext = File.extname(doc)
        basename = File.basename(doc, ext)
        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

        if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
          # Run the conversion once, with the configured toolchain. (A second,
          # hard-coded `gm convert` used to run first, which did the work twice
          # and required gm even when the toolchain is imagemagick.)
          `#{"gm" unless ENV["toolchain"] == "imagemagick"} convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
        else
          if libre_office?
            # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
            # Environment values are not parsed by a shell, so the real path
            # is used here, not the shell-escaped one.
            ENV['SYSUSERCONFIG']="file://#{File.expand_path(out)}"

            options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
            cmd = "#{office_executable} #{options} 2>&1"
            result = `#{cmd}`.chomp
            raise ExtractionFailed, result if $? != 0
            true
          else # open office presumably, rely on JODConverter to figure it out.
            options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
            run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
          end
        end
      end
    end

    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

    HEADLESS      = "-Djava.awt.headless=true"

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    # Returns true, or the command output (nil when empty) if `return_output`
    # is set. Raises ExtractionFailed on a non-zero exit status.
    def run_jod(command, pdfs, opts, return_output=false)

      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
      office = osx? ? "-Doffice.home=#{office_path}" : office_path
      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
      return return_output ? (result.empty? ? nil : result) : true
    end

    # Raised by office_executable when no office installation can be located.
    class OfficeNotFound < StandardError; end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Return a cleaned copy of `text` with garbage words dropped.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first (characters that cannot
    # be represented become '?'). The coercion is done on a copy, so the
    # caller's string is never modified and frozen strings are accepted.
    def clean(text)
      if String.method_defined?(:encode)
        text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
      else
        require 'iconv' unless defined?(Iconv)
        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      end

      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          # After a whitespace run has been seen, further runs are kept only
          # when they contain a line break.
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end
@@ -0,0 +1,138 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs into `opts[:output]`.
    # `opts[:pages]` may be 'all', a page spec String such as "1-3,7", or any
    # enumerable of page numbers; when absent the whole document is extracted
    # into a single file.
    def extract(pdfs, opts)
      extract_options opts
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # Start each document with a clean slate, so pages flagged for one PDF
        # are not re-OCR'd against the next one.
        @pages_to_ocr = []
        pages = requested_pages(pdf)
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded? Decided from the output of pdffonts.
    def contains_text?(pdf)
      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR: rasterize to a
    # temporary TIFF, run tesseract on it, then optionally clean the text.
    def extract_from_ocr(pdf, pages)
      tempdir     = Dir.mktmpdir
      base_path   = File.join(@output, @pdf_name)
      escaped_pdf = ESCAPE[pdf]
      language    = ESCAPE[@language.to_s]
      if pages
        pages.each do |page|
          tiff         = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          escaped_tiff = ESCAPE[tiff]
          file         = "#{base_path}_#{page}"
          # Page numbers are 1-based; the [n] selector on the input is 0-based.
          if ENV["toolchain"] == 'graphicsmagick'
            run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
          else
            # Select the single requested page here as well; without the
            # selector every page's TIFF would hold the entire document.
            run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{escaped_pdf}[#{page - 1}] #{escaped_tiff}"
          end
          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{language} 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff         = "#{tempdir}/#{@pdf_name}.tif"
        escaped_tiff = ESCAPE[tiff]
        if ENV["toolchain"] == 'graphicsmagick'
          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
        else
          run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{escaped_pdf} #{escaped_tiff}"
        end
        # The output base is escaped like every other path handed to the shell.
        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{language} 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Resolve the :pages option for one PDF: 'all' becomes a Range over the
    # document, a page spec String ("1-3,7") becomes a sorted Array of page
    # numbers, and anything else (Range, Array, nil) is passed through as is.
    def requested_pages(pdf)
      case @pages
      when 'all'  then 1..Docsplit.extract_length(pdf)
      when String then page_list(@pages)
      else             @pages
      end
    end

    # Expand a page spec String such as "1-3,7" into [1, 2, 3, 7].
    def page_list(spec)
      spec.split(',').map { |part|
        if part.include?('-')
          bounds = part.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          part.to_i
        end
      }.flatten.uniq.sort
    end

    # Rewrite a text file in place with its cleaned contents.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result if $? != 0
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
    end

    # Read the extraction settings from the options hash. :ocr is true to
    # force OCR, false to forbid it, and anything else leaves the heuristic on.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
      @language   = options[:language] || 'eng'
    end

  end

end
@@ -0,0 +1,29 @@
1
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction. Returns an Array of PDF paths: documents that
    # already are PDFs are passed through, the rest are converted into a
    # 'docsplit' directory under the system temp dir.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        if is_pdf?(doc)
          doc
        else
          tempdir = File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => tempdir})
          File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
        end
      end
    end

    # Is the document a PDF? True when it has a `.pdf` extension (any case,
    # the file is not opened), or when its content starts with a `%PDF-n`
    # header.
    def is_pdf?(doc)
      return true if File.extname(doc).downcase == '.pdf'
      # Read a fixed-size prefix instead of a line: `read` returns nil on an
      # empty file, where `readline` would raise EOFError.
      header = File.open(doc, 'rb') {|f| f.read(16) }
      !!(header.to_s =~ /\A\%PDF-\d+(\.\d+)?/)
    end

  end

  extend TransparentPDFs

end
@@ -0,0 +1,233 @@
1
+ [
2
+ {
3
+ "name": "Portable Document Format",
4
+ "extension": "pdf",
5
+ "mediaType": "application/pdf",
6
+ "storePropertiesByFamily": {
7
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
8
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
9
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
10
+ "TEXT": {"FilterName": "writer_pdf_Export"}
11
+ }
12
+ },
13
+ {
14
+ "name": "Macromedia Flash",
15
+ "extension": "swf",
16
+ "mediaType": "application/x-shockwave-flash",
17
+ "storePropertiesByFamily": {
18
+ "DRAWING": {"FilterName": "draw_flash_Export"},
19
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
20
+ }
21
+ },
22
+ {
23
+ "name": "HTML",
24
+ "extension": "html",
25
+ "mediaType": "text/html",
26
+ "inputFamily": "TEXT",
27
+ "storePropertiesByFamily": {
28
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
29
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
30
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
31
+ }
32
+ },
33
+ {
34
+ "name": "OpenDocument Text",
35
+ "extension": "odt",
36
+ "mediaType": "application/vnd.oasis.opendocument.text",
37
+ "inputFamily": "TEXT",
38
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
39
+ },
40
+ {
41
+ "name": "OpenOffice.org 1.0 Text Document",
42
+ "extension": "sxw",
43
+ "mediaType": "application/vnd.sun.xml.writer",
44
+ "inputFamily": "TEXT",
45
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
46
+ },
47
+ {
48
+ "name": "Microsoft Word",
49
+ "extension": "doc",
50
+ "mediaType": "application/msword",
51
+ "inputFamily": "TEXT",
52
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
53
+ },
54
+ {
55
+ "name": "Microsoft Word 2007 XML",
56
+ "extension": "docx",
57
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
58
+ "inputFamily": "TEXT"
59
+ },
60
+ {
61
+ "name": "Rich Text Format",
62
+ "extension": "rtf",
63
+ "mediaType": "text/rtf",
64
+ "inputFamily": "TEXT",
65
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
66
+ },
67
+ {
68
+ "name": "WordPerfect",
69
+ "extension": "wpd",
70
+ "mediaType": "application/wordperfect",
71
+ "inputFamily": "TEXT"
72
+ },
73
+ {
74
+ "name": "Plain Text",
75
+ "extension": "txt",
76
+ "mediaType": "text/plain",
77
+ "inputFamily": "TEXT",
78
+ "loadProperties": {
79
+ "FilterName": "Text (encoded)",
80
+ "FilterOptions": "utf8"
81
+ },
82
+ "storePropertiesByFamily": {"TEXT": {
83
+ "FilterName": "Text (encoded)",
84
+ "FilterOptions": "utf8"
85
+ }}
86
+ },
87
+ {
88
+ "name": "MediaWiki wikitext",
89
+ "extension": "wiki",
90
+ "mediaType": "text/x-wiki",
91
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
92
+ },
93
+ {
94
+ "name": "OpenDocument Spreadsheet",
95
+ "extension": "ods",
96
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
97
+ "inputFamily": "SPREADSHEET",
98
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
99
+ },
100
+ {
101
+ "name": "OpenOffice.org 1.0 Spreadsheet",
102
+ "extension": "sxc",
103
+ "mediaType": "application/vnd.sun.xml.calc",
104
+ "inputFamily": "SPREADSHEET",
105
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106
+ },
107
+ {
108
+ "name": "Microsoft Excel",
109
+ "extension": "xls",
110
+ "mediaType": "application/vnd.ms-excel",
111
+ "inputFamily": "SPREADSHEET",
112
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113
+ },
114
+ {
115
+ "name": "Microsoft Excel 2007 XML",
116
+ "extension": "xlsx",
117
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118
+ "inputFamily": "SPREADSHEET"
119
+ },
120
+ {
121
+ "name": "Comma Separated Values",
122
+ "extension": "csv",
123
+ "mediaType": "text/csv",
124
+ "inputFamily": "SPREADSHEET",
125
+ "loadProperties": {
126
+ "FilterName": "Text - txt - csv (StarCalc)",
127
+ "FilterOptions": "44,34,0"
128
+ },
129
+ "storePropertiesByFamily": {"SPREADSHEET": {
130
+ "FilterName": "Text - txt - csv (StarCalc)",
131
+ "FilterOptions": "44,34,0"
132
+ }}
133
+ },
134
+ {
135
+ "name": "Tab Separated Values",
136
+ "extension": "tsv",
137
+ "mediaType": "text/tab-separated-values",
138
+ "inputFamily": "SPREADSHEET",
139
+ "loadProperties": {
140
+ "FilterName": "Text - txt - csv (StarCalc)",
141
+ "FilterOptions": "9,34,0"
142
+ },
143
+ "storePropertiesByFamily": {"SPREADSHEET": {
144
+ "FilterName": "Text - txt - csv (StarCalc)",
145
+ "FilterOptions": "9,34,0"
146
+ }}
147
+ },
148
+ {
149
+ "name": "OpenDocument Presentation",
150
+ "extension": "odp",
151
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
152
+ "inputFamily": "PRESENTATION",
153
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154
+ },
155
+ {
156
+ "name": "OpenOffice.org 1.0 Presentation",
157
+ "extension": "sxi",
158
+ "mediaType": "application/vnd.sun.xml.impress",
159
+ "inputFamily": "PRESENTATION",
160
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161
+ },
162
+ {
163
+ "name": "Microsoft PowerPoint",
164
+ "extension": "ppt",
165
+ "mediaType": "application/vnd.ms-powerpoint",
166
+ "inputFamily": "PRESENTATION",
167
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168
+ },
169
+ {
170
+ "name": "Microsoft PowerPoint 2007 XML",
171
+ "extension": "pptx",
172
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173
+ "inputFamily": "PRESENTATION"
174
+ },
175
+ {
176
+ "name": "OpenDocument Drawing",
177
+ "extension": "odg",
178
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
179
+ "inputFamily": "DRAWING",
180
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181
+ },
182
+ {
183
+ "name": "Scalable Vector Graphics",
184
+ "extension": "svg",
185
+ "mediaType": "image/svg+xml",
186
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187
+ },
188
+ {
189
+ "name": "Portable Network Graphic",
190
+ "extension": "png",
191
+ "mediaType": "image/png",
192
+ "storePropertiesByFamily": {
193
+ "DRAWING": {"FilterName": "draw_png_Export"},
194
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
195
+ }
196
+ },
197
+ {
198
+ "name": "Graphics Interchange Format",
199
+ "extension": "gif",
200
+ "mediaType": "image/gif",
201
+ "storePropertiesByFamily": {
202
+ "DRAWING": {"FilterName": "draw_gif_Export"},
203
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
204
+ }
205
+ },
206
+ {
207
+ "name": "Joint Photographic Experts Group",
208
+ "extension": "jpg",
209
+ "mediaType": "image/jpeg",
210
+ "storePropertiesByFamily": {
211
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
212
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213
+ }
214
+ },
215
+ {
216
+ "name": "Windows Bitmap",
217
+ "extension": "bmp",
218
+ "mediaType": "image/bmp",
219
+ "storePropertiesByFamily": {
220
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
221
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222
+ }
223
+ },
224
+ {
225
+ "name": "Tagged Image File Format",
226
+ "extension": "tif",
227
+ "mediaType": "image/tiff",
228
+ "storePropertiesByFamily": {
229
+ "DRAWING": {"FilterName": "draw_tif_Export"},
230
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
231
+ }
232
+ }
233
+ ]
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: concerto_docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.5
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Ashkenas
8
+ - Samuel Clay
9
+ - Ted Han
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2014-05-28 00:00:00.000000000 Z
14
+ dependencies: []
15
+ description: |2
16
+ Docsplit is a command-line utility and Ruby library for splitting apart
17
+ documents into their component parts: searchable UTF-8 plain text, page
18
+ images or thumbnails in any format, PDFs, single pages, and document
19
+ metadata (title, author, number of pages...)
20
+ email: opensource@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/command_line.rb
27
+ - lib/docsplit/image_extractor.rb
28
+ - lib/docsplit/info_extractor.rb
29
+ - lib/docsplit/page_extractor.rb
30
+ - lib/docsplit/pdf_extractor.rb
31
+ - lib/docsplit/text_cleaner.rb
32
+ - lib/docsplit/text_extractor.rb
33
+ - lib/docsplit/transparent_pdfs.rb
34
+ - lib/docsplit.rb
35
+ - bin/docsplit
36
+ - vendor/conf/document-formats.js
37
+ - vendor/jodconverter/commons-cli-1.1.jar
38
+ - vendor/jodconverter/commons-io-1.4.jar
39
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
40
+ - vendor/jodconverter/json-20090211.jar
41
+ - vendor/jodconverter/juh-3.2.1.jar
42
+ - vendor/jodconverter/jurt-3.2.1.jar
43
+ - vendor/jodconverter/ridl-3.2.1.jar
44
+ - vendor/jodconverter/unoil-3.2.1.jar
45
+ - vendor/logging.properties
46
+ - LICENSE
47
+ - README
48
+ homepage: http://documentcloud.github.com/docsplit/
49
+ licenses:
50
+ - MIT
51
+ metadata: {}
52
+ post_install_message:
53
+ rdoc_options: []
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubyforge_project: docsplit
68
+ rubygems_version: 2.0.14
69
+ signing_key:
70
+ specification_version: 4
71
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
72
+ test_files: []