talentbox-docsplit 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
4
+
5
+ Permission is hereby granted, free of charge, to any person
6
+ obtaining a copy of this software and associated documentation
7
+ files (the "Software"), to deal in the Software without
8
+ restriction, including without limitation the rights to use,
9
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the
11
+ Software is furnished to do so, subject to the following
12
+ conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point. Loads the CommandLine class relative to this
# script (using a normalised absolute path rather than "bin/../lib/...") and
# instantiates it; CommandLine#initialize parses ARGV and runs the command.
require File.expand_path('../lib/docsplit/command_line.rb', File.dirname(__FILE__))

Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,24 @@
1
# Gem specification for the talentbox fork of docsplit.
Gem::Specification.new do |s|
  s.name      = 'talentbox-docsplit'
  s.version   = '0.5.2'         # Keep version in sync with docsplit.rb
  s.date      = '2011-05-13'

  s.homepage    = "http://documentcloud.github.com/docsplit/"
  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  s.description = <<-EOS
Docsplit is a command-line utility and Ruby library for splitting apart
documents into their component parts: searchable UTF-8 plain text, page
images or thumbnails in any format, PDFs, single pages, and document
metadata (title, author, number of pages...)
  EOS

  s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
  s.email   = 'jeremy@documentcloud.org'
  # Only set the RubyForge project where the running RubyGems still offers
  # the setter, so the spec keeps loading if it is unavailable.
  s.rubyforge_project = 'docsplit' if s.respond_to?(:rubyforge_project=)

  s.require_paths = ['lib']
  s.executables   = ['docsplit']

  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                'docsplit.gemspec', 'LICENSE', 'README']
end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,122 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
module Docsplit

  VERSION       = '0.5.2' # Keep in sync with gemspec.

  # Root directory of the gem (one level above lib/).
  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

  # Classpath handed to `java -cp`. The wildcard is single-quoted because the
  # value is interpolated into a shell command line in `run`.
  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  GM_FORMATS    = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]

  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

  # Check for all dependencies, and warn of their absence. A missing PATH is
  # treated as an empty search path instead of crashing at load time.
  dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    DEPENDENCIES[dep] = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
    warn "Warning: Docsplit dependency #{dep} not found." unless DEPENDENCIES[dep]
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst each document into single pages via PageExtractor. Non-PDF inputs
  # are first converted by `ensure_pdfs`.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of each document via TextExtractor.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize each page of each document via ImageExtractor. A Range or Array
  # given as opts[:pages] is normalized to the "1-10" / "1,2,3" string form;
  # the caller's hash is left untouched.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  # Output goes to opts[:output] (default: the current directory).
  def self.extract_pdf(docs, opts={})
    out = opts[:output] || '.'
    FileUtils.mkdir_p out unless File.exist?(out)
    [docs].flatten.each do |doc|
      ext = File.extname(doc)
      basename = File.basename(doc, ext)
      if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
        `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
      else
        options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
        run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
      end
    end
  end

  # Define custom methods for each of the metadata keys that we support
  # (extract_author, extract_title, ...). Each delegates to InfoExtractor.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end


  # Internal helpers. These are singleton methods, which a bare `private`
  # would not hide, so they remain callable as before.

  # Runs a Java command, with quieted logging, and the classpath set properly.
  # Raises ExtractionFailed (carrying the command output) on a non-zero exit.
  # Returns the output (nil when empty) if return_output is set, else true.
  def self.run(command, pdfs, opts, return_output=false)
    pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
    cmd     = "java #{HEADLESS} #{LOGGING} #{OfficeUtils.new.get_office_path} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
    result  = `#{cmd}`.chomp
    raise ExtractionFailed, result if $? != 0
    return return_output ? (result.empty? ? nil : result) : true
  end

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  # The argument is not modified.
  def self.normalize_value(value)
    case value
    when Range then normalize_range(value)
    when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
    else            value.to_s
    end
  end

  # Render a Range as "first-last" (an exclusive end is honoured, so 1...5
  # becomes "1-4"). An empty range yields an empty string.
  def self.normalize_range(range)
    first = range.min
    first.nil? ? '' : "#{first}-#{range.max}"
  end

end
113
+
114
+ require 'tmpdir'
115
+ require 'fileutils'
116
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
119
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
120
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
121
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
122
+ require "#{Docsplit::ROOT}/lib/docsplit/office_utils"
@@ -0,0 +1,142 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
module Docsplit

  # Merge conflict resolved in favour of 0.5.2, the version the gemspec declares.
  VERSION       = '0.5.2' # Keep in sync with gemspec.

  # Root directory of the gem (one level above lib/).
  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

  # Classpath handed to `java -cp`. The wildcard is single-quoted because the
  # value is interpolated into a shell command line in `run`.
  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  GM_FORMATS    = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]

  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

  # Check for all dependencies, and warn of their absence. A missing PATH is
  # treated as an empty search path instead of crashing at load time.
  dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    DEPENDENCIES[dep] = dirs.any? {|dir| File.executable?(File.join(dir, dep.to_s)) }
    warn "Warning: Docsplit dependency #{dep} not found." unless DEPENDENCIES[dep]
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst each document into single pages via PageExtractor. Non-PDF inputs
  # are first converted by `ensure_pdfs`.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of each document via TextExtractor.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize each page of each document via ImageExtractor. A Range or Array
  # given as opts[:pages] is normalized to the "1-10" / "1,2,3" string form;
  # the caller's hash is left untouched.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  # Output goes to opts[:output] (default: the current directory).
  # (Both sides of the former merge conflict carried this same loop body.)
  def self.extract_pdf(docs, opts={})
    out = opts[:output] || '.'
    FileUtils.mkdir_p out unless File.exist?(out)
    [docs].flatten.each do |doc|
      ext = File.extname(doc)
      basename = File.basename(doc, ext)
      if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
        `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
      else
        options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
        run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
      end
    end
  end

  # Define custom methods for each of the metadata keys that we support
  # (extract_author, extract_title, ...). Each delegates to InfoExtractor.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end


  # Internal helpers. These are singleton methods, which a bare `private`
  # would not hide, so they remain callable as before.

  # Runs a Java command, with quieted logging, and the classpath set properly.
  # Raises ExtractionFailed (carrying the command output) on a non-zero exit.
  # Returns the output (nil when empty) if return_output is set, else true.
  def self.run(command, pdfs, opts, return_output=false)
    pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
    cmd     = "java #{HEADLESS} #{LOGGING} #{OfficeUtils.new.get_office_path} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
    result  = `#{cmd}`.chomp
    raise ExtractionFailed, result if $? != 0
    return return_output ? (result.empty? ? nil : result) : true
  end

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  # The argument is not modified.
  def self.normalize_value(value)
    case value
    when Range then normalize_range(value)
    when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
    else            value.to_s
    end
  end

  # Render a Range as "first-last" (an exclusive end is honoured, so 1...5
  # becomes "1-4"). An empty range yields an empty string.
  def self.normalize_range(range)
    first = range.min
    first.nil? ? '' : "#{first}-#{range.max}"
  end

end
133
+
134
+ require 'tmpdir'
135
+ require 'fileutils'
136
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
137
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
138
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
139
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
140
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
141
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
142
+ require "#{Docsplit::ROOT}/lib/docsplit/office_utils"
@@ -0,0 +1,116 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
docsplit COMMAND [OPTIONS] path/to/doc.pdf
Main commands:
pages, images, text, pdf.
Metadata commands:
author, date, creator, keywords, producer, subject, title, length.

Example:
docsplit images --size 700x --format jpg document.pdf

Dependencies:
Ruby, Java, A working GraphicsMagick (gm) command,
and a headless OpenOffice server for non-PDF documents.

Options:
(size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: options are
    # parsed (and removed) first, the next argument is the command, and the
    # remaining arguments are the documents.
    def initialize
      parse_options
      cmd      = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions.
    # Metadata commands print their value; an unknown or missing command
    # prints the usage text. An ExtractionFailed error is printed and the
    # process exits with status 1.
    def run
      begin
        case @command
        when :images  then Docsplit.extract_images(ARGV, @options)
        when :pages   then Docsplit.extract_pages(ARGV, @options)
        when :text    then Docsplit.extract_text(ARGV, @options)
        when :pdf     then Docsplit.extract_pdf(ARGV, @options)
        else
          if METADATA_KEYS.include?(@command)
            value = Docsplit.send("extract_#{@command}", ARGV, @options)
            puts value unless value.nil?
          else
            usage
          end
        end
      rescue ExtractionFailed => e
        puts e.message.chomp
        exit(1)
      end
    end

    # Print out the usage help message.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    #
    # The value-taking switches are declared with optional arguments, so their
    # blocks can receive nil when the value is omitted; in that case the option
    # is simply left unset instead of crashing on nil.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d if d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p if p
        end
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
          @options[:clean] = false
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the parent of InvalidOption, so malformed or ambiguous
        # switches are reported the same way as unknown ones.
        puts e.message
        exit(1)
      end
    end

  end

end
@@ -0,0 +1,101 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    DENSITY_ARG     = "-density 150"
    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options (:output, :pages, :format, :size, :rolling).
    # Every requested size is rendered in every requested format.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          # With --rolling, later sizes are derived from this size's output.
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    # Raises ExtractionFailed with the tool's output on a non-zero exit.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      basename  = File.basename(pdf, File.extname(pdf))
      directory = directory_for(size)
      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        result = `MAGICK_TMPDIR="#{tempdir}" OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 "#{directory}/*.#{format}" 2>&1`.chomp
        raise ExtractionFailed, result if $? != 0
      else
        page_list(pages).each do |page|
          out_file  = File.join(directory, "#{basename}_#{page}.#{format}")
          # Page numbers are 1-based here; the [n] selector passed to gm is 0-based.
          cmd = "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result if $? != 0
        end
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the resize argument.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,7": sorted, de-duplicated integers.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          range = range.split('-')
          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,32 @@
1
module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # Regex matchers for different bits of information.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Pull out a single datum from a pdf. Only the first entry of `pdfs` is
    # inspected. Returns the matched text (an Integer for :length), or nil
    # when pdfinfo's output has no line for that key.
    #
    # Raises ArgumentError for a key without a matcher, and ExtractionFailed
    # when pdfinfo cannot be started or exits with a non-zero status.
    def extract(key, pdfs, opts)
      matcher = MATCHERS[key]
      raise ArgumentError, "unknown metadata key: #{key.inspect}" unless matcher
      pdf = [pdfs].flatten.first
      begin
        # An argv array bypasses the shell, so quotes or `$` in the file name
        # cannot break or alter the command. stderr is merged into stdout.
        result = IO.popen(['pdfinfo', pdf.to_s, {:err => [:child, :out]}]) {|io| io.read }.chomp
      rescue SystemCallError => e
        raise ExtractionFailed, e.message
      end
      raise ExtractionFailed, result unless $?.success?
      match = result.match(matcher)
      answer = match && match[1]
      answer = answer.to_i if answer && key == :length
      answer
    end

  end

end
@@ -0,0 +1,23 @@
1
module Docsplit
  # Works out which `-Doffice.home=...` JVM argument, if any, to pass when
  # launching the Java converter.
  class OfficeUtils
    # Lets check if the user is using LibreOffice
    # If he is using OpenOffice.org, jodconverter will take care of things
    #
    # Returns the JVM argument as a String:
    # * on macOS: the LibreOffice app path when it exists, otherwise "";
    # * elsewhere: /usr/lib/libreoffice when it exists, otherwise the
    #   /usr/lib/openoffice default.
    def get_office_path
      mac_libre_office_path = "/Applications/LibreOffice.app/Contents"
      linux_libreoffice_path = "/usr/lib/libreoffice"
      if RUBY_PLATFORM.match(/darwin/i)
        if File.exist?(mac_libre_office_path)
          "-Doffice.home=#{mac_libre_office_path}"
        else
          ""
        end
      else
        if File.exist?(linux_libreoffice_path)
          "-Doffice.home=#{linux_libreoffice_path}"
        else
          "-Doffice.home=/usr/lib/openoffice"
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Docsplit

  # Delegates to **pdftk** in order to create bursted single pages from
  # a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # written to opts[:output] (default: the current directory).
    # Raises ExtractionFailed with pdftk's output on a non-zero exit.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        FileUtils.mkdir_p @output unless File.exist?(@output)
        cmd = "pdftk \"#{pdf}\" burst output \"#{page_path}\" 2>&1"
        result = `#{cmd}`.chomp
        # Remove the doc_data.txt file left in the working directory, if any.
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result if $? != 0
        result
      end
    end


    private

    # Read the output directory from the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,94 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first.
    #
    # Returns the text with garbage words dropped, runs of whitespace after a
    # dropped word collapsed (newlines are kept), and long runs of very short
    # tokens (REPEATED) removed.
    def clean(text)
      text    = to_ascii(text)
      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage?
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end


    private

    # Coerce UTF-8 text to ASCII. Iconv is used when it can be loaded, exactly
    # as before; where the library is unavailable, fall back to String#encode,
    # dropping characters that have no ASCII form rather than failing with a
    # LoadError. (The fallback does not transliterate.)
    def to_ascii(text)
      require 'iconv' unless defined?(Iconv)
      Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
    rescue LoadError
      text.encode('ASCII', :invalid => :replace, :undef => :replace, :replace => '')
    end

  end

end
@@ -0,0 +1,126 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 200x200 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs into opts[:output] (default: the
    # current directory). opts[:pages] may be nil (whole document), 'all',
    # a page-spec string such as "1-3,7", or a Range/Array of page numbers.
    def extract(pdfs, opts)
      extract_options opts
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # Start each document with an empty OCR queue, so low-text pages of
        # one PDF are never re-OCR'd against the next.
        @pages_to_ocr = []
        pages = requested_pages(pdf)
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded? Decided from the output of pdffonts.
    def contains_text?(pdf)
      fonts = `pdffonts "#{pdf}" 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR: each page (or
    # the whole document when `pages` is nil) is rendered to a TIFF with gm
    # and then passed to tesseract.
    def extract_from_ocr(pdf, pages)
      tempdir = Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      if pages
        pages.each do |page|
          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          file = "#{base_path}_#{page}"
          run "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} \"#{pdf}[#{page - 1}]\" \"#{tiff}\" 2>&1"
          run "tesseract \"#{tiff}\" \"#{file}\" -l eng 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff = "#{tempdir}/#{@pdf_name}.tif"
        run "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} \"#{pdf}\" \"#{tiff}\" 2>&1"
        run "tesseract \"#{tiff}\" \"#{base_path}\" -l eng 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Rewrite a text file in place with its cleaned contents.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result if $? != 0
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 \"#{pdf}\" \"#{text_path}\" 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} \"#{pdf}\" \"#{text_path}\" 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
    end

    # Resolve the :pages option into something that yields Integer page
    # numbers: 'all' becomes 1..page-count, any other String ("5-10",
    # "1,2,3") is expanded to a sorted list of Integers, and nil, Ranges and
    # Arrays are passed through unchanged.
    def requested_pages(pdf)
      case @pages
      when 'all'  then 1..Docsplit.extract_length(pdf)
      when String then parse_page_list(@pages)
      else             @pages
      end
    end

    # Expand a comma-separated page spec with optional "a-b" ranges.
    def parse_page_list(spec)
      spec.split(',').map { |part|
        if part.include?('-')
          bounds = part.split('-')
          Range.new(bounds.first.to_i, bounds.last.to_i).to_a
        else
          part.to_i
        end
      }.flatten.uniq.sort
    end

    # Read the extraction settings from the options hash. OCR is forced or
    # forbidden only by an explicit true/false; any other value leaves the
    # default heuristic in place.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
    end

  end

end
@@ -0,0 +1,26 @@
1
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction.
    #
    # Accepts a single path or a (nested) list of paths and returns a flat
    # list: paths whose extension is .pdf (case-insensitive) are returned
    # as-is; every other document is converted with `extract_pdf` into the
    # "docsplit" directory under the system temp dir and replaced by the path
    # of the resulting PDF.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        ext = File.extname(doc)
        if ext.downcase == '.pdf'
          doc
        else
          # NOTE(review): the directory is shared by all conversions and is
          # not removed here; documents with the same basename map to the
          # same temporary PDF path.
          tempdir = File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => tempdir})
          File.join(tempdir, File.basename(doc, ext) + '.pdf')
        end
      end
    end

  end

  extend TransparentPDFs

end
@@ -0,0 +1,236 @@
1
+ //
2
+ // JODConverter Document Formats Configuration
3
+ //
4
+ [
5
+ {
6
+ "name": "Portable Document Format",
7
+ "extension": "pdf",
8
+ "mediaType": "application/pdf",
9
+ "storePropertiesByFamily": {
10
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
11
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
12
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
13
+ "TEXT": {"FilterName": "writer_pdf_Export"}
14
+ }
15
+ },
16
+ {
17
+ "name": "Macromedia Flash",
18
+ "extension": "swf",
19
+ "mediaType": "application/x-shockwave-flash",
20
+ "storePropertiesByFamily": {
21
+ "DRAWING": {"FilterName": "draw_flash_Export"},
22
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
23
+ }
24
+ },
25
+ {
26
+ "name": "HTML",
27
+ "extension": "html",
28
+ "mediaType": "text/html",
29
+ "inputFamily": "TEXT",
30
+ "storePropertiesByFamily": {
31
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
32
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
33
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
34
+ }
35
+ },
36
+ {
37
+ "name": "OpenDocument Text",
38
+ "extension": "odt",
39
+ "mediaType": "application/vnd.oasis.opendocument.text",
40
+ "inputFamily": "TEXT",
41
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
42
+ },
43
+ {
44
+ "name": "OpenOffice.org 1.0 Text Document",
45
+ "extension": "sxw",
46
+ "mediaType": "application/vnd.sun.xml.writer",
47
+ "inputFamily": "TEXT",
48
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
49
+ },
50
+ {
51
+ "name": "Microsoft Word",
52
+ "extension": "doc",
53
+ "mediaType": "application/msword",
54
+ "inputFamily": "TEXT",
55
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
56
+ },
57
+ {
58
+ "name": "Microsoft Word 2007 XML",
59
+ "extension": "docx",
60
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
61
+ "inputFamily": "TEXT"
62
+ },
63
+ {
64
+ "name": "Rich Text Format",
65
+ "extension": "rtf",
66
+ "mediaType": "text/rtf",
67
+ "inputFamily": "TEXT",
68
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
69
+ },
70
+ {
71
+ "name": "WordPerfect",
72
+ "extension": "wpd",
73
+ "mediaType": "application/wordperfect",
74
+ "inputFamily": "TEXT"
75
+ },
76
+ {
77
+ "name": "Plain Text",
78
+ "extension": "txt",
79
+ "mediaType": "text/plain",
80
+ "inputFamily": "TEXT",
81
+ "loadProperties": {
82
+ "FilterName": "Text (encoded)",
83
+ "FilterOptions": "utf8"
84
+ },
85
+ "storePropertiesByFamily": {"TEXT": {
86
+ "FilterName": "Text (encoded)",
87
+ "FilterOptions": "utf8"
88
+ }}
89
+ },
90
+ {
91
+ "name": "MediaWiki wikitext",
92
+ "extension": "wiki",
93
+ "mediaType": "text/x-wiki",
94
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
95
+ },
96
+ {
97
+ "name": "OpenDocument Spreadsheet",
98
+ "extension": "ods",
99
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
100
+ "inputFamily": "SPREADSHEET",
101
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
102
+ },
103
+ {
104
+ "name": "OpenOffice.org 1.0 Spreadsheet",
105
+ "extension": "sxc",
106
+ "mediaType": "application/vnd.sun.xml.calc",
107
+ "inputFamily": "SPREADSHEET",
108
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
109
+ },
110
+ {
111
+ "name": "Microsoft Excel",
112
+ "extension": "xls",
113
+ "mediaType": "application/vnd.ms-excel",
114
+ "inputFamily": "SPREADSHEET",
115
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
116
+ },
117
+ {
118
+ "name": "Microsoft Excel 2007 XML",
119
+ "extension": "xlsx",
120
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
121
+ "inputFamily": "SPREADSHEET"
122
+ },
123
+ {
124
+ "name": "Comma Separated Values",
125
+ "extension": "csv",
126
+ "mediaType": "text/csv",
127
+ "inputFamily": "SPREADSHEET",
128
+ "loadProperties": {
129
+ "FilterName": "Text - txt - csv (StarCalc)",
130
+ "FilterOptions": "44,34,0"
131
+ },
132
+ "storePropertiesByFamily": {"SPREADSHEET": {
133
+ "FilterName": "Text - txt - csv (StarCalc)",
134
+ "FilterOptions": "44,34,0"
135
+ }}
136
+ },
137
+ {
138
+ "name": "Tab Separated Values",
139
+ "extension": "tsv",
140
+ "mediaType": "text/tab-separated-values",
141
+ "inputFamily": "SPREADSHEET",
142
+ "loadProperties": {
143
+ "FilterName": "Text - txt - csv (StarCalc)",
144
+ "FilterOptions": "9,34,0"
145
+ },
146
+ "storePropertiesByFamily": {"SPREADSHEET": {
147
+ "FilterName": "Text - txt - csv (StarCalc)",
148
+ "FilterOptions": "9,34,0"
149
+ }}
150
+ },
151
+ {
152
+ "name": "OpenDocument Presentation",
153
+ "extension": "odp",
154
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
155
+ "inputFamily": "PRESENTATION",
156
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
157
+ },
158
+ {
159
+ "name": "OpenOffice.org 1.0 Presentation",
160
+ "extension": "sxi",
161
+ "mediaType": "application/vnd.sun.xml.impress",
162
+ "inputFamily": "PRESENTATION",
163
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
164
+ },
165
+ {
166
+ "name": "Microsoft PowerPoint",
167
+ "extension": "ppt",
168
+ "mediaType": "application/vnd.ms-powerpoint",
169
+ "inputFamily": "PRESENTATION",
170
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
171
+ },
172
+ {
173
+ "name": "Microsoft PowerPoint 2007 XML",
174
+ "extension": "pptx",
175
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
176
+ "inputFamily": "PRESENTATION"
177
+ },
178
+ {
179
+ "name": "OpenDocument Drawing",
180
+ "extension": "odg",
181
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
182
+ "inputFamily": "DRAWING",
183
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
184
+ },
185
+ {
186
+ "name": "Scalable Vector Graphics",
187
+ "extension": "svg",
188
+ "mediaType": "image/svg+xml",
189
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
190
+ },
191
+ {
192
+ "name": "Portable Network Graphic",
193
+ "extension": "png",
194
+ "mediaType": "image/png",
195
+ "storePropertiesByFamily": {
196
+ "DRAWING": {"FilterName": "draw_png_Export"},
197
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
198
+ }
199
+ },
200
+ {
201
+ "name": "Graphics Interchange Format",
202
+ "extension": "gif",
203
+ "mediaType": "image/gif",
204
+ "storePropertiesByFamily": {
205
+ "DRAWING": {"FilterName": "draw_gif_Export"},
206
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
207
+ }
208
+ },
209
+ {
210
+ "name": "Joint Photographic Experts Group",
211
+ "extension": "jpg",
212
+ "mediaType": "image/jpeg",
213
+ "storePropertiesByFamily": {
214
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
215
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
216
+ }
217
+ },
218
+ {
219
+ "name": "Windows Bitmap",
220
+ "extension": "bmp",
221
+ "mediaType": "image/bmp",
222
+ "storePropertiesByFamily": {
223
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
224
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
225
+ }
226
+ },
227
+ {
228
+ "name": "Tagged Image File Format",
229
+ "extension": "tif",
230
+ "mediaType": "image/tiff",
231
+ "storePropertiesByFamily": {
232
+ "DRAWING": {"FilterName": "draw_tif_Export"},
233
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
234
+ }
235
+ }
236
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: talentbox-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeremy Ashkenas
9
+ - Samuel Clay
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2011-05-13 00:00:00.000000000 +02:00
14
+ default_executable:
15
+ dependencies: []
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: jeremy@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/command_line.rb
27
+ - lib/docsplit/image_extractor.rb
28
+ - lib/docsplit/info_extractor.rb
29
+ - lib/docsplit/office_utils.rb
30
+ - lib/docsplit/page_extractor.rb
31
+ - lib/docsplit/text_cleaner.rb
32
+ - lib/docsplit/text_extractor.rb
33
+ - lib/docsplit/transparent_pdfs.rb
34
+ - lib/docsplit.rb
35
+ - lib/docsplit.rb.orig
36
+ - bin/docsplit
37
+ - vendor/conf/document-formats.js
38
+ - vendor/jodconverter/commons-cli-1.1.jar
39
+ - vendor/jodconverter/commons-io-1.4.jar
40
+ - vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
41
+ - vendor/jodconverter/json-20080701.jar
42
+ - vendor/jodconverter/juh-3.1.0.jar
43
+ - vendor/jodconverter/jurt-3.1.0.jar
44
+ - vendor/jodconverter/ridl-3.1.0.jar
45
+ - vendor/jodconverter/unoil-3.1.0.jar
46
+ - vendor/logging.properties
47
+ - docsplit.gemspec
48
+ - LICENSE
49
+ - README
50
+ has_rdoc: true
51
+ homepage: http://documentcloud.github.com/docsplit/
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project: docsplit
71
+ rubygems_version: 1.6.2
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
75
+ test_files: []