burisu-docsplit 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 364308f838aa67a65ff4a478073b9011f6696aec
4
+ data.tar.gz: 949f366310ece3bf924296148ba2e2346e3dba77
5
+ SHA512:
6
+ metadata.gz: 71516f45bf021f608c76989dbd7032de6adcc0eae38e5d07b645f26d8819a2f637252f0e52e350bf4deb80ae9d5e29eb92f7aef193f0f12f5736712181fb28de
7
+ data.tar.gz: 5408cb91169a00ce40294631106dc70ee3f657cbb9a50ef650f1fe8547bf6d19c3cdbe1bb3e51c4512da8ca24682a5b7f6157b0218b4cf5c0ddc7bd2e6dcb935
data/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
4
+ Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
5
+
6
+ Permission is hereby granted, free of charge, to any person
7
+ obtaining a copy of this software and associated documentation
8
+ files (the "Software"), to deal in the Software without
9
+ restriction, including without limitation the rights to use,
10
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following
13
+ conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point: load the CommandLine class and let it run against
# ARGV (instantiating it parses the options and executes the command).
#
# The path is expanded to an absolute one first. A relative path such as
# "bin/../lib/..." is resolved by `require` against the load path rather than
# the current directory, so running `ruby bin/docsplit` would otherwise fail.
require File.expand_path('../lib/docsplit/command_line.rb', File.dirname(__FILE__))

Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,23 @@
1
# Gem packaging metadata for the burisu-docsplit distribution.
Gem::Specification.new do |spec|
  # Identity. The version must match Docsplit::VERSION in lib/docsplit.rb.
  spec.name    = 'burisu-docsplit'
  spec.version = '0.7.5' # Keep version in sync with docsplit.rb

  # Descriptive fields shown on the registry page.
  spec.homepage    = "http://documentcloud.github.com/docsplit/"
  spec.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  spec.description = <<-EOS
Docsplit is a command-line utility and Ruby library for splitting apart
documents into their component parts: searchable UTF-8 plain text, page
images or thumbnails in any format, PDFs, single pages, and document
metadata (title, author, number of pages...)
  EOS

  # Maintainers and licensing.
  spec.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
  spec.email   = 'opensource@documentcloud.org'
  spec.license = 'MIT'

  # Library load path and the executable installed with the gem.
  spec.require_paths = ['lib']
  spec.executables   = ['docsplit']

  # Everything that ships in the gem: compiled classes, library code, the
  # executable, vendored jars/config, and the top-level project files.
  packaged_globs = ['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                    'docsplit.gemspec', 'LICENSE', 'README']
  spec.files = Dir[*packaged_globs]
end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,102 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
5
+ # The Docsplit module delegates to the Java PDF extractors.
6
module Docsplit

  VERSION       = '0.7.5' # Keep in sync with gemspec.

  # Shell-escapes a single argument before it is interpolated into a command.
  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

  # Root directory of the gem (the parent of `lib`), raw and shell-escaped.
  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
  ESCAPED_ROOT  = ESCAPE[ROOT]

  # Metadata fields; each one gets a generated `extract_<key>` method below.
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # MIME types of documents that are converted to PDF with GraphicsMagick.
  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

  # External executables, mapped to whether they were found on the PATH.
  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}

  # Check for all dependencies, and note their absence. PATH may be unset in
  # a scrubbed environment, in which case every dependency stays `false`.
  dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    DEPENDENCIES[dep] = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst each PDF into single pages (non-PDF inputs are converted first).
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of each PDF (non-PDF inputs are converted first).
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize each PDF into page images. `opts[:pages]` may be a Range, an
  # Array (optionally containing Ranges) or a String; it is normalized to the
  # comma-separated string form before being handed to the extractor.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  def self.extract_pdf(docs, opts={})
    PdfExtractor.new.extract(docs, opts)
  end

  # Define custom methods for each of the metadata keys that we support:
  # `extract_author`, `extract_length`, and so on. Each returns a single
  # piece of metadata for the first document in the list.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS, __FILE__, __LINE__ + 1
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Return a hash of all the metadata that could be read from the document.
  def self.extract_info(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    InfoExtractor.new.extract_all(pdfs, opts)
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end

  # Internal helper. A bare `private` does not apply to `def self.` methods,
  # so this has always been callable from outside; it stays public so that
  # existing callers keep working.
  #
  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  # The argument is never modified, so frozen arrays are accepted.
  def self.normalize_value(value)
    case value
    when Range then value.to_a.join(',')
    when Array then value.map {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
    else            value.to_s
    end
  end

end
95
+
96
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
97
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
98
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
99
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
100
+ require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
101
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
102
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -0,0 +1,123 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
docsplit COMMAND [OPTIONS] path/to/doc.pdf
Main commands:
pages, images, text, pdf.
Metadata commands:
author, date, creator, keywords, producer, subject, title, length.

Example:
docsplit images --size 700x --format jpg document.pdf

Dependencies:
Ruby, Java, A working GraphicsMagick (gm) command,
and a headless OpenOffice server for non-PDF documents.

Options:
(size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: the options
    # are parsed out first, the next remaining argument is the command, and
    # everything left over is the list of documents.
    def initialize
      parse_options
      cmd      = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions.
    # Metadata commands print their value; an unknown or missing command
    # prints the usage message. An ExtractionFailed error is reported on
    # stdout and the process exits with status 1.
    def run
      case @command
      when :images  then Docsplit.extract_images(ARGV, @options)
      when :pages   then Docsplit.extract_pages(ARGV, @options)
      when :text    then Docsplit.extract_text(ARGV, @options)
      when :pdf     then Docsplit.extract_pdf(ARGV, @options)
      else
        # Only whitelisted metadata keys are ever turned into a method name.
        if METADATA_KEYS.include?(@command)
          value = Docsplit.send("extract_#{@command}", ARGV, @options)
          puts value unless value.nil?
        else
          usage
        end
      end
    rescue ExtractionFailed => e
      puts e.message.chomp
      exit(1)
    end

    # Print out the usage help message.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    #
    # Every value-taking switch declares its argument as optional, so the
    # block may receive nil (e.g. a trailing `--size`). Such a switch is
    # treated as not given instead of crashing on `nil.split`.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
          @options[:density] = d
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
          @options[:clean] = false
        end
        # Choosing a language also switches off the text cleaner.
        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
          @options[:language] = l
          @options[:clean] = false
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the parent of InvalidOption, AmbiguousOption and the
        # other parser failures, so all of them end as a short message plus
        # exit status 1 rather than a backtrace.
        puts e.message
        exit(1)
      end
    end

  end

end
@@ -0,0 +1,103 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options (:output, :pages, :density, :format, :size,
    # :rolling). Every requested size is rendered in every requested format.
    # Raises ExtractionFailed when a GraphicsMagick command fails.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each_with_index do |size, i|
          @formats.each {|format| convert(pdf, size, format, previous) }
          # With :rolling, the next size is derived from this size's images.
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      basename  = File.basename(pdf, File.extname(pdf))
      directory = directory_for(size)
      # Without an explicit page list, render every page of the document.
      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
      escaped_pdf = ESCAPE[pdf]
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
        raise ExtractionFailed, result if $? != 0
      else
        page_list(pages).each do |page|
          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
          # Page numbers are 1-based; the index passed to gm is 0-based.
          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result if $? != 0
        end
      end
    ensure
      # tempdir is nil when Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil size means "render once, without resizing".
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the resize argument.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,5": sorted, without duplicates.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          range = range.split('-')
          Range.new(range.first.to_i, range.last.to_i).to_a
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,50 @@
1
module Docsplit

  # Reads document metadata by running **pdfinfo** and parsing what it prints.
  class InfoExtractor

    # One pattern per supported field; the first capture group is the value.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Look up one field (a MATCHERS key) for the first document in `pdfs`.
    # Returns nil when pdfinfo did not report that field.
    def extract(key, pdfs, opts)
      everything = extract_all(pdfs, opts)
      everything[key]
    end

    # Run pdfinfo on the first document in `pdfs` and return a hash holding
    # every field that was reported. The :length value is an Integer, the
    # others are Strings. Raises ExtractionFailed, carrying pdfinfo's output,
    # when the command exits with a non-zero status.
    def extract_all(pdfs, opts)
      target = [pdfs].flatten.first
      output = `pdfinfo #{ESCAPE[target]} 2>&1`.chomp
      raise ExtractionFailed, output if $? != 0

      # Drop bytes that are not valid UTF-8.
      # ruby 1.8 (iconv) and 1.9 (String#encode) :
      if String.method_defined?(:encode)
        unless output.valid_encoding?
          output.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "")
        end
      else
        require 'iconv' unless defined?(Iconv)
        output = Iconv.new('UTF-8//IGNORE','UTF-8').iconv(output)
      end

      MATCHERS.inject({}) do |found, (field, pattern)|
        captured = output.match(pattern)
        value    = captured && captured[1]
        if value
          found[field] = (field == :length) ? value.to_i : value
        end
        found
      end
    end

  end

end
@@ -0,0 +1,36 @@
1
module Docsplit

  # Delegates to **pdftailor** (preferred) or **pdftk** in order to create
  # bursted single pages from a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # written to `opts[:output]` (default: the current directory).
    # Raises ExtractionFailed, carrying the tool's output, when the command
    # exits with a non-zero status.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        # "%d" is left in the path for the tool to fill in with the page number.
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        FileUtils.mkdir_p @output unless File.exist?(@output)

        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
        else
          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
        end
        result = `#{cmd}`.chomp
        # Remember the exit status before doing any cleanup.
        status = $?
        # Remove the doc_data.txt report if one was left in the working directory.
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result if status != 0
        result
      end
    end


    private

    # Pull the settings this extractor uses out of the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,162 @@
1
+ require 'rbconfig'
2
+
3
module Docsplit
  # Converts documents to PDF: image formats go through GraphicsMagick,
  # everything else through LibreOffice (directly) or JODConverter.
  class PdfExtractor
    # Cached across all instances: the located office executable and the
    # first line of its version output.
    @@executable     = nil
    @@version_string = nil

    # Provide a set of helper functions to determine the OS.
    # (`defined?` needs the bare constant; a quoted name is a string literal
    # and is always "defined".)
    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
    def windows?
      !!HOST_OS.match(/mswin|windows|cygwin/i)
    end
    def osx?
      !!HOST_OS.match(/darwin/i)
    end
    def linux?
      !!HOST_OS.match(/linux/i)
    end

    # The first line of the help output holds the name and version number
    # of the office software to be used for extraction.
    # May be nil when the executable prints nothing.
    def version_string
      unless @@version_string
        null       = windows? ? "NUL" : "/dev/null"
        executable = ESCAPE[office_executable]
        @@version_string = `#{executable} -h 2>#{null}`.split("\n").first
        # NOTE(review): /[0-9]*/ also matches the empty string, so this
        # condition has always been true and the `--version` output always
        # wins. That behaviour is kept; the nil check only avoids calling
        # `match` on nil when the help output was empty.
        if @@version_string.nil? || @@version_string.match(/[0-9]*/)
          @@version_string = `#{executable} --version`.split("\n").first
        end
      end
      @@version_string
    end
    def libre_office?
      !!version_string.to_s.match(/^LibreOffice/)
    end
    def open_office?
      !!version_string.to_s.match(/^OpenOffice.org/)
    end

    # A set of default locations to search for office software
    # These have been extracted from JODConverter. Each listed
    # path should contain a directory "program" which in turn
    # contains the "soffice" executable.
    # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
    def office_search_paths
      if windows?
        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
        program_files_path = ENV["CommonProgramFiles"]
        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
      elsif osx?
        search_paths = %w(
          /Applications/LibreOffice.app/Contents
          /Applications/OpenOffice.org.app/Contents
        )
      else # probably linux/unix
        # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
        search_paths = %w(
          /usr/lib/libreoffice
          /usr/lib64/libreoffice
          /opt/libreoffice
          /usr/lib/openoffice
          /usr/lib64/openoffice
          /opt/openoffice.org3
          /app/vendor/libreoffice
        )
      end
      search_paths
    end

    # Identify the path to a working office executable.
    # Raises ArgumentError when OFFICE_PATH is set but does not exist, and
    # OfficeNotFound when no candidate location holds an executable.
    def office_executable
      paths = office_search_paths

      # If an OFFICE_PATH has been specified on the commandline
      # raise an error if that path isn't valid, otherwise, add
      # it to the front of our search paths.
      if ENV['OFFICE_PATH']
        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
        paths.unshift(ENV['OFFICE_PATH'])
      end

      # The location of the office executable is OS dependent
      path_pieces = ["soffice"]
      if windows?
        path_pieces += [["program", "soffice.bin"]]
      elsif osx?
        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
      else
        path_pieces += [["program", "soffice"]]
      end

      # Search for the first suitable office executable
      # and short circuit once an executable is found.
      # (File.exist? is used because File.exists? was removed in Ruby 3.2.)
      paths.each do |path|
        if File.exist? path
          # A search path that is itself a file is taken as the executable.
          @@executable ||= path unless File.directory? path
          path_pieces.each do |pieces|
            check_path = File.join(path, pieces)
            @@executable ||= check_path if File.exist? check_path
          end
        end
        break if @@executable
      end
      raise OfficeNotFound, "No office software found" unless @@executable
      @@executable
    end

    # Used to specify the office location for JODConverter
    def office_path
      File.dirname(File.dirname(office_executable))
    end

    # Convert documents to PDF, written as `<basename>.pdf` into
    # `opts[:output]` (default: the current directory).
    # Raises ExtractionFailed when the office conversion fails.
    def extract(docs, opts)
      out = opts[:output] || '.'
      FileUtils.mkdir_p out unless File.exist?(out)
      [docs].flatten.each do |doc|
        ext = File.extname(doc)
        basename = File.basename(doc, ext)
        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

        # Ask `file` for the MIME type to decide which converter to use.
        if GM_FORMATS.include?(`file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0])
          `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
        else
          if libre_office?
            # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
            # This is an environment value, not shell text, so the raw
            # (unescaped) output path is used.
            ENV['SYSUSERCONFIG']="file://#{File.expand_path(out)}"

            options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
            cmd = "#{ESCAPE[office_executable]} #{options} 2>&1"
            result = `#{cmd}`.chomp
            raise ExtractionFailed, result if $? != 0
            true
          else # open office presumably, rely on JODConverter to figure it out.
            options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
            run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
          end
        end
      end
    end

    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

    HEADLESS      = "-Djava.awt.headless=true"

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    # Returns true, or (with `return_output`) the command's output, nil when
    # that output is empty. Raises ExtractionFailed on a non-zero exit status.
    def run_jod(command, pdfs, opts, return_output=false)

      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
      office = osx? ? "-Doffice.home=#{office_path}" : office_path
      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
      return return_output ? (result.empty? ? nil : result) : true
    end

    # Raised by office_executable when no office installation can be located.
    class OfficeNotFound < StandardError; end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Return a cleaned copy of `text`: garbage words are dropped, runs of
    # whitespace left behind by dropped words are collapsed, and long runs of
    # very short tokens (REPEATED) are removed.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first (characters with no
    # ASCII equivalent become "?"). The conversion works on a copy, so the
    # caller's string is left untouched and frozen strings are accepted.
    def clean(text)
      if String.method_defined?(:encode)
        text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
      else
        require 'iconv' unless defined?(Iconv)
        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      end

      scanner = StringScanner.new(text)
      cleaned = []
      # True while the last thing seen was whitespace, including whitespace
      # that followed a dropped word.
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          # After earlier whitespace, only whitespace holding a newline is kept.
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, and a falsy value otherwise.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end
@@ -0,0 +1,130 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    # Matched against the end of the `pdffonts` output: when nothing follows
    # the dashed header row, no fonts were listed.
    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs into `opts[:output]`.
    # Raises ExtractionFailed when one of the external commands fails.
    def extract(pdfs, opts)
      extract_options opts
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # Start every document with an empty list, so that sparse pages of an
        # earlier document are not OCR'd again for this one.
        @pages_to_ocr = []
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded?
    def contains_text?(pdf)
      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly.
    # Without a page list the whole document goes into a single text file.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR: each page (or
    # the whole document) is rasterized to a TIFF with GraphicsMagick and
    # then read by tesseract.
    def extract_from_ocr(pdf, pages)
      tempdir = Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      escaped_pdf = ESCAPE[pdf]
      escaped_language = ESCAPE[@language]
      if pages
        pages.each do |page|
          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          escaped_tiff = ESCAPE[tiff]
          file = "#{base_path}_#{page}"
          # Page numbers are 1-based; the index passed to gm is 0-based.
          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{escaped_language} 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff = "#{tempdir}/#{@pdf_name}.tif"
        escaped_tiff = ESCAPE[tiff]
        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
        # The output path is escaped here just as in the per-page branch.
        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{escaped_language} 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil when Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Rewrite a text file in place with its cleaned contents.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result if $? != 0
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
    end

    # Pull the settings this extractor uses out of the options hash.
    # :ocr is tri-state: true forces OCR, false forbids it, and anything else
    # leaves the decision to the heuristic.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
      @language   = options[:language] || 'eng'
    end

  end

end
@@ -0,0 +1,26 @@
1
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support doc, rtf, ppt, and so on.
  module TransparentPDFs

    # Signature at the very start of a PDF file ("%PDF-1.4", "%PDF-2.0", ...).
    # Deliberately not anchored at the end of the line, so headers terminated
    # by "\r\n" or a bare "\r" are recognised as well.
    PDF_HEADER = /\A%PDF-\d+(\.\d+)?/

    # Number of leading bytes inspected when sniffing for the PDF signature.
    PDF_HEADER_BYTES = 16

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction.
    #
    # docs - a single path or a (possibly nested) array of paths.
    #
    # Returns an array of paths: documents that already are PDFs are passed
    # through untouched; every other document is converted with `extract_pdf`
    # into "<tmpdir>/docsplit" and the path of the resulting PDF is returned
    # in its place.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        if pdf_document?(doc)
          doc
        else
          ext = File.extname(doc)
          tempdir = File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => tempdir})
          File.join(tempdir, File.basename(doc, ext) + '.pdf')
        end
      end
    end

    private

    # True when +doc+ has a ".pdf" extension (any case) or starts with the
    # PDF signature. The file is read in binary mode and only a short prefix
    # is examined, so empty files and binary office documents are simply
    # reported as "not a PDF" instead of raising EOFError or an
    # invalid-byte-sequence error during the regexp match.
    def pdf_document?(doc)
      return true if File.extname(doc).downcase == '.pdf'
      header = File.open(doc, 'rb') { |f| f.read(PDF_HEADER_BYTES) }
      # IO#read(length) returns nil at EOF, hence the to_s.
      !!(header.to_s =~ PDF_HEADER)
    end

  end

  extend TransparentPDFs

end
@@ -0,0 +1,233 @@
1
+ [
2
+ {
3
+ "name": "Portable Document Format",
4
+ "extension": "pdf",
5
+ "mediaType": "application/pdf",
6
+ "storePropertiesByFamily": {
7
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
8
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
9
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
10
+ "TEXT": {"FilterName": "writer_pdf_Export"}
11
+ }
12
+ },
13
+ {
14
+ "name": "Macromedia Flash",
15
+ "extension": "swf",
16
+ "mediaType": "application/x-shockwave-flash",
17
+ "storePropertiesByFamily": {
18
+ "DRAWING": {"FilterName": "draw_flash_Export"},
19
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
20
+ }
21
+ },
22
+ {
23
+ "name": "HTML",
24
+ "extension": "html",
25
+ "mediaType": "text/html",
26
+ "inputFamily": "TEXT",
27
+ "storePropertiesByFamily": {
28
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
29
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
30
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
31
+ }
32
+ },
33
+ {
34
+ "name": "OpenDocument Text",
35
+ "extension": "odt",
36
+ "mediaType": "application/vnd.oasis.opendocument.text",
37
+ "inputFamily": "TEXT",
38
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
39
+ },
40
+ {
41
+ "name": "OpenOffice.org 1.0 Text Document",
42
+ "extension": "sxw",
43
+ "mediaType": "application/vnd.sun.xml.writer",
44
+ "inputFamily": "TEXT",
45
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
46
+ },
47
+ {
48
+ "name": "Microsoft Word",
49
+ "extension": "doc",
50
+ "mediaType": "application/msword",
51
+ "inputFamily": "TEXT",
52
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
53
+ },
54
+ {
55
+ "name": "Microsoft Word 2007 XML",
56
+ "extension": "docx",
57
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
58
+ "inputFamily": "TEXT"
59
+ },
60
+ {
61
+ "name": "Rich Text Format",
62
+ "extension": "rtf",
63
+ "mediaType": "text/rtf",
64
+ "inputFamily": "TEXT",
65
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
66
+ },
67
+ {
68
+ "name": "WordPerfect",
69
+ "extension": "wpd",
70
+ "mediaType": "application/wordperfect",
71
+ "inputFamily": "TEXT"
72
+ },
73
+ {
74
+ "name": "Plain Text",
75
+ "extension": "txt",
76
+ "mediaType": "text/plain",
77
+ "inputFamily": "TEXT",
78
+ "loadProperties": {
79
+ "FilterName": "Text (encoded)",
80
+ "FilterOptions": "utf8"
81
+ },
82
+ "storePropertiesByFamily": {"TEXT": {
83
+ "FilterName": "Text (encoded)",
84
+ "FilterOptions": "utf8"
85
+ }}
86
+ },
87
+ {
88
+ "name": "MediaWiki wikitext",
89
+ "extension": "wiki",
90
+ "mediaType": "text/x-wiki",
91
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
92
+ },
93
+ {
94
+ "name": "OpenDocument Spreadsheet",
95
+ "extension": "ods",
96
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
97
+ "inputFamily": "SPREADSHEET",
98
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
99
+ },
100
+ {
101
+ "name": "OpenOffice.org 1.0 Spreadsheet",
102
+ "extension": "sxc",
103
+ "mediaType": "application/vnd.sun.xml.calc",
104
+ "inputFamily": "SPREADSHEET",
105
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106
+ },
107
+ {
108
+ "name": "Microsoft Excel",
109
+ "extension": "xls",
110
+ "mediaType": "application/vnd.ms-excel",
111
+ "inputFamily": "SPREADSHEET",
112
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113
+ },
114
+ {
115
+ "name": "Microsoft Excel 2007 XML",
116
+ "extension": "xlsx",
117
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118
+ "inputFamily": "SPREADSHEET"
119
+ },
120
+ {
121
+ "name": "Comma Separated Values",
122
+ "extension": "csv",
123
+ "mediaType": "text/csv",
124
+ "inputFamily": "SPREADSHEET",
125
+ "loadProperties": {
126
+ "FilterName": "Text - txt - csv (StarCalc)",
127
+ "FilterOptions": "44,34,0"
128
+ },
129
+ "storePropertiesByFamily": {"SPREADSHEET": {
130
+ "FilterName": "Text - txt - csv (StarCalc)",
131
+ "FilterOptions": "44,34,0"
132
+ }}
133
+ },
134
+ {
135
+ "name": "Tab Separated Values",
136
+ "extension": "tsv",
137
+ "mediaType": "text/tab-separated-values",
138
+ "inputFamily": "SPREADSHEET",
139
+ "loadProperties": {
140
+ "FilterName": "Text - txt - csv (StarCalc)",
141
+ "FilterOptions": "9,34,0"
142
+ },
143
+ "storePropertiesByFamily": {"SPREADSHEET": {
144
+ "FilterName": "Text - txt - csv (StarCalc)",
145
+ "FilterOptions": "9,34,0"
146
+ }}
147
+ },
148
+ {
149
+ "name": "OpenDocument Presentation",
150
+ "extension": "odp",
151
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
152
+ "inputFamily": "PRESENTATION",
153
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154
+ },
155
+ {
156
+ "name": "OpenOffice.org 1.0 Presentation",
157
+ "extension": "sxi",
158
+ "mediaType": "application/vnd.sun.xml.impress",
159
+ "inputFamily": "PRESENTATION",
160
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161
+ },
162
+ {
163
+ "name": "Microsoft PowerPoint",
164
+ "extension": "ppt",
165
+ "mediaType": "application/vnd.ms-powerpoint",
166
+ "inputFamily": "PRESENTATION",
167
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168
+ },
169
+ {
170
+ "name": "Microsoft PowerPoint 2007 XML",
171
+ "extension": "pptx",
172
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173
+ "inputFamily": "PRESENTATION"
174
+ },
175
+ {
176
+ "name": "OpenDocument Drawing",
177
+ "extension": "odg",
178
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
179
+ "inputFamily": "DRAWING",
180
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181
+ },
182
+ {
183
+ "name": "Scalable Vector Graphics",
184
+ "extension": "svg",
185
+ "mediaType": "image/svg+xml",
186
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187
+ },
188
+ {
189
+ "name": "Portable Network Graphic",
190
+ "extension": "png",
191
+ "mediaType": "image/png",
192
+ "storePropertiesByFamily": {
193
+ "DRAWING": {"FilterName": "draw_png_Export"},
194
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
195
+ }
196
+ },
197
+ {
198
+ "name": "Graphics Interchange Format",
199
+ "extension": "gif",
200
+ "mediaType": "image/gif",
201
+ "storePropertiesByFamily": {
202
+ "DRAWING": {"FilterName": "draw_gif_Export"},
203
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
204
+ }
205
+ },
206
+ {
207
+ "name": "Joint Photographic Experts Group",
208
+ "extension": "jpg",
209
+ "mediaType": "image/jpeg",
210
+ "storePropertiesByFamily": {
211
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
212
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213
+ }
214
+ },
215
+ {
216
+ "name": "Windows Bitmap",
217
+ "extension": "bmp",
218
+ "mediaType": "image/bmp",
219
+ "storePropertiesByFamily": {
220
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
221
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222
+ }
223
+ },
224
+ {
225
+ "name": "Tagged Image File Format",
226
+ "extension": "tif",
227
+ "mediaType": "image/tiff",
228
+ "storePropertiesByFamily": {
229
+ "DRAWING": {"FilterName": "draw_tif_Export"},
230
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
231
+ }
232
+ }
233
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: burisu-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.5
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Ashkenas
8
+ - Samuel Clay
9
+ - Ted Han
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2014-03-27 00:00:00.000000000 Z
14
+ dependencies: []
15
+ description: |2
16
+ Docsplit is a command-line utility and Ruby library for splitting apart
17
+ documents into their component parts: searchable UTF-8 plain text, page
18
+ images or thumbnails in any format, PDFs, single pages, and document
19
+ metadata (title, author, number of pages...)
20
+ email: opensource@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/image_extractor.rb
27
+ - lib/docsplit/info_extractor.rb
28
+ - lib/docsplit/transparent_pdfs.rb
29
+ - lib/docsplit/text_extractor.rb
30
+ - lib/docsplit/text_cleaner.rb
31
+ - lib/docsplit/page_extractor.rb
32
+ - lib/docsplit/pdf_extractor.rb
33
+ - lib/docsplit/command_line.rb
34
+ - lib/docsplit.rb
35
+ - bin/docsplit
36
+ - vendor/logging.properties
37
+ - vendor/conf/document-formats.js
38
+ - vendor/jodconverter/jurt-3.2.1.jar
39
+ - vendor/jodconverter/unoil-3.2.1.jar
40
+ - vendor/jodconverter/commons-cli-1.1.jar
41
+ - vendor/jodconverter/json-20090211.jar
42
+ - vendor/jodconverter/ridl-3.2.1.jar
43
+ - vendor/jodconverter/commons-io-1.4.jar
44
+ - vendor/jodconverter/juh-3.2.1.jar
45
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
46
+ - docsplit.gemspec
47
+ - LICENSE
48
+ - README
49
+ homepage: http://documentcloud.github.com/docsplit/
50
+ licenses:
51
+ - MIT
52
+ metadata: {}
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ requirements: []
68
+ rubyforge_project:
69
+ rubygems_version: 2.0.14
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
73
+ test_files: []