talentbox-docsplit 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
4
+
5
+ Permission is hereby granted, free of charge, to any person
6
+ obtaining a copy of this software and associated documentation
7
+ files (the "Software"), to deal in the Software without
8
+ restriction, including without limitation the rights to use,
9
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the
11
+ Software is furnished to do so, subject to the following
12
+ conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Load the command-line front end relative to this script's own location,
# then instantiate it.
require File.join(File.dirname(__FILE__), '..', 'lib', 'docsplit', 'command_line.rb')

Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'talentbox-docsplit'
3
+ s.version = '0.5.2' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-05-13'
5
+
6
+ s.homepage = "http://documentcloud.github.com/docsplit/"
7
+ s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
8
+ s.description = <<-EOS
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+ EOS
14
+
15
+ s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
16
+ s.email = 'jeremy@documentcloud.org'
17
+ s.rubyforge_project = 'docsplit'
18
+
19
+ s.require_paths = ['lib']
20
+ s.executables = ['docsplit']
21
+
22
+ s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
23
+ 'docsplit.gemspec', 'LICENSE', 'README']
24
+ end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,122 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
+ module Docsplit
3
+
4
+ VERSION = '0.5.2' # Keep in sync with gemspec.
5
+
6
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
+
8
+ CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
9
+
10
+ LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
11
+
12
+ HEADLESS = "-Djava.awt.headless=true"
13
+
14
+ OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
15
+
16
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
+
18
+ GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
19
+
20
# Availability of each external program Docsplit shells out to. Every entry
# starts out false and is set to true below when an executable with that name
# is found in one of the PATH directories.
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

# Check for all dependencies, and warn of their absence. An unset PATH is
# treated as an empty search list instead of crashing on nil.
search_dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
DEPENDENCIES.keys.each do |dep|
  DEPENDENCIES[dep] = search_dirs.any? { |dir| File.executable?(File.join(dir, dep.to_s)) }
  warn "Warning: Docsplit dependency #{dep} not found." unless DEPENDENCIES[dep]
end
33
+
34
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
35
+ # broke.
36
+ class ExtractionFailed < StandardError; end
37
+
38
# Burst each document into single-page PDFs. Inputs are first passed through
# `ensure_pdfs`; the splitting itself is delegated to PageExtractor.
def self.extract_pages(pdfs, opts={})
  PageExtractor.new.extract(ensure_pdfs(pdfs), opts)
end
43
+
44
# Write out the text of each document. Inputs are first passed through
# `ensure_pdfs`; the extraction itself is delegated to TextExtractor.
def self.extract_text(pdfs, opts={})
  TextExtractor.new.extract(ensure_pdfs(pdfs), opts)
end
49
+
50
# Rasterize each document into page images via ImageExtractor.
#
# pdfs - a path or list of paths; passed through `ensure_pdfs` first.
# opts - options hash. When `:pages` is present it is converted with
#        `normalize_value` before being handed on.
#
# The caller's hash is never modified: the normalized `:pages` value is
# written to a merged copy.
def self.extract_images(pdfs, opts={})
  pdfs = ensure_pdfs(pdfs)
  opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
  ImageExtractor.new.extract(pdfs, opts)
end
56
+
57
# Convert documents to PDF, writing `<basename>.pdf` into `opts[:output]`
# (the current directory when not given).
#
# Documents whose extension is listed in GM_FORMATS are converted with
# GraphicsMagick (`gm convert`); everything else is handed to JODConverter
# through `run`.
#
# Raises ExtractionFailed when GraphicsMagick exits with a non-zero status,
# matching how the other external commands in this library are checked.
def self.extract_pdf(docs, opts={})
  out = opts[:output] || '.'
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext      = File.extname(doc)
    basename = File.basename(doc, ext)
    target   = "#{out}/#{basename}.pdf"
    if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
      result = `gm convert "#{doc}" "#{target}" 2>&1`.chomp
      raise ExtractionFailed, result if $? != 0
    else
      options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
      run "#{options} \"#{doc}\" \"#{target}\"", [], {}
    end
  end
end
73
+
74
# Generate one `extract_<key>` method per entry in METADATA_KEYS. Each
# generated method passes its inputs through `ensure_pdfs` and asks
# InfoExtractor for that single key.
METADATA_KEYS.each do |key|
  method_source = <<-EOS
    def self.extract_#{key}(pdfs, opts={})
      pdfs = ensure_pdfs(pdfs)
      InfoExtractor.new.extract(:#{key}, pdfs, opts)
    end
  EOS
  instance_eval method_source
end
84
+
85
# Clean OCR'd text of garbage characters by delegating to TextCleaner#clean;
# returns whatever the cleaner returns.
def self.clean_text(text)
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
89
+
90
+
91
+ private
92
+
93
# Run `java` with the headless/logging flags, the office home reported by
# OfficeUtils, and the Docsplit classpath, followed by `command` and the
# double-quoted `pdfs`. stderr is folded into stdout.
#
# Raises ExtractionFailed (carrying the command output) on a non-zero exit.
# Returns true, or - when `return_output` is set - the output, with an empty
# output reported as nil. `opts` is accepted but not used here.
def self.run(command, pdfs, opts, return_output=false)
  quoted = [pdfs].flatten.map { |pdf| %("#{pdf}") }
  pieces = ['java', HEADLESS, LOGGING, OfficeUtils.new.get_office_path,
            '-cp', CLASSPATH, command, quoted.join(' '), '2>&1']
  output = `#{pieces.join(' ')}`.chomp
  raise ExtractionFailed, output if $? != 0
  return true unless return_output
  output.empty? ? nil : output
end
101
+
102
# Normalize a value in an options hash for the command line.
# Ranges look like: 1-10, Arrays like: 1,2,3 (Ranges inside an Array are
# rendered as ranges too, e.g. [1, 3..5] => "1,3-5"). Anything else is
# converted with `to_s`.
#
# The argument is never modified; Arrays are mapped into a new Array.
def self.normalize_value(value)
  case value
  when Range then normalize_range(value)
  when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
  else value.to_s
  end
end

# Render a Range as "first-last". For an end-exclusive range (1...10) the
# last value is reduced by one so the string names only included values.
def self.normalize_range(range)
  last = range.exclude_end? ? range.last - 1 : range.last
  "#{range.first}-#{last}"
end
111
+
112
+ end
113
+
114
+ require 'tmpdir'
115
+ require 'fileutils'
116
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
119
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
120
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
121
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
122
+ require "#{Docsplit::ROOT}/lib/docsplit/office_utils"
@@ -0,0 +1,142 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
+ module Docsplit
3
+
4
# Merge conflict resolved in favour of 0.5.2, the version the gemspec declares.
VERSION = '0.5.2' # Keep in sync with gemspec.
9
+
10
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
11
+
12
+ CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
13
+
14
+ LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
15
+
16
+ HEADLESS = "-Djava.awt.headless=true"
17
+
18
+ OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
19
+
20
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
21
+
22
# Extensions that `extract_pdf` converts with GraphicsMagick. Defined once;
# a second identical assignment would only trigger an "already initialized
# constant" warning.
GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
25
+
26
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
27
+
28
+ # Check for all dependencies, and warn of their absence.
29
+ dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
30
+ DEPENDENCIES.each_key do |dep|
31
+ dirs.each do |dir|
32
+ if File.executable?(File.join(dir, dep.to_s))
33
+ DEPENDENCIES[dep] = true
34
+ break
35
+ end
36
+ end
37
+ warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
38
+ end
39
+
40
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
41
+ # broke.
42
+ class ExtractionFailed < StandardError; end
43
+
44
+ # Use the ExtractPages Java class to burst a PDF into single pages.
45
+ def self.extract_pages(pdfs, opts={})
46
+ pdfs = ensure_pdfs(pdfs)
47
+ PageExtractor.new.extract(pdfs, opts)
48
+ end
49
+
50
+ # Use the ExtractText Java class to write out all embedded text.
51
+ def self.extract_text(pdfs, opts={})
52
+ pdfs = ensure_pdfs(pdfs)
53
+ TextExtractor.new.extract(pdfs, opts)
54
+ end
55
+
56
+ # Use the ExtractImages Java class to rasterize a PDF into each page's image.
57
+ def self.extract_images(pdfs, opts={})
58
+ pdfs = ensure_pdfs(pdfs)
59
+ opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
60
+ ImageExtractor.new.extract(pdfs, opts)
61
+ end
62
+
63
# Convert documents to PDF, writing `<basename>.pdf` into `opts[:output]`
# (the current directory when not given).
#
# Documents whose extension is listed in GM_FORMATS are converted with
# GraphicsMagick (`gm convert`); everything else is handed to JODConverter
# through `run`.
#
# This is the resolution of an unresolved merge conflict: both sides carried
# the same conversion logic, so a single loop over the documents is kept.
#
# Raises ExtractionFailed when GraphicsMagick exits with a non-zero status.
def self.extract_pdf(docs, opts={})
  out = opts[:output] || '.'
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext      = File.extname(doc)
    basename = File.basename(doc, ext)
    target   = "#{out}/#{basename}.pdf"
    if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
      result = `gm convert "#{doc}" "#{target}" 2>&1`.chomp
      raise ExtractionFailed, result if $? != 0
    else
      options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
      run "#{options} \"#{doc}\" \"#{target}\"", [], {}
    end
  end
end
93
+
94
+ # Define custom methods for each of the metadata keys that we support.
95
+ # Use the ExtractInfo Java class to print out a single bit of metadata.
96
+ METADATA_KEYS.each do |key|
97
+ instance_eval <<-EOS
98
+ def self.extract_#{key}(pdfs, opts={})
99
+ pdfs = ensure_pdfs(pdfs)
100
+ InfoExtractor.new.extract(:#{key}, pdfs, opts)
101
+ end
102
+ EOS
103
+ end
104
+
105
+ # Utility method to clean OCR'd text with garbage characters.
106
+ def self.clean_text(text)
107
+ TextCleaner.new.clean(text)
108
+ end
109
+
110
+
111
+ private
112
+
113
+ # Runs a Java command, with quieted logging, and the classpath set properly.
114
+ def self.run(command, pdfs, opts, return_output=false)
115
+ pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
116
+ cmd = "java #{HEADLESS} #{LOGGING} #{OfficeUtils.new.get_office_path} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
117
+ result = `#{cmd}`.chomp
118
+ raise ExtractionFailed, result if $? != 0
119
+ return return_output ? (result.empty? ? nil : result) : true
120
+ end
121
+
122
+ # Normalize a value in an options hash for the command line.
123
+ # Ranges look like: 1-10, Arrays like: 1,2,3.
124
+ def self.normalize_value(value)
125
+ case value
126
+ when Range then normalize_range(value)
127
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
128
+ else value.to_s
129
+ end
130
+ end
131
+
132
+ end
133
+
134
+ require 'tmpdir'
135
+ require 'fileutils'
136
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
137
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
138
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
139
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
140
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
141
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
142
+ require "#{Docsplit::ROOT}/lib/docsplit/office_utils"
@@ -0,0 +1,116 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
+ module Docsplit
5
+
6
+ # A single command-line utility to separate a PDF into all its component parts.
7
+ class CommandLine
8
+
9
+ BANNER = <<-EOS
10
+ docsplit breaks apart documents into images, text, or individual pages.
11
+ It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
+
13
+ Usage:
14
+ docsplit COMMAND [OPTIONS] path/to/doc.pdf
15
+ Main commands:
16
+ pages, images, text, pdf.
17
+ Metadata commands:
18
+ author, date, creator, keywords, producer, subject, title, length.
19
+
20
+ Example:
21
+ docsplit images --size 700x --format jpg document.pdf
22
+
23
+ Dependencies:
24
+ Ruby, Java, A working GraphicsMagick (gm) command,
25
+ and a headless OpenOffice server for non-PDF documents.
26
+
27
+ Options:
28
+ (size, pages and format can take comma-separated values)
29
+
30
+ EOS
31
+
32
# Creating a CommandLine runs off of the contents of ARGV: options are parsed
# first, the next remaining argument (if any) becomes the command name as a
# Symbol, and `run` is called.
def initialize
  parse_options
  name = ARGV.shift
  @command = name ? name.to_sym : name
  run
end
39
+
40
# Dispatch @command to the Docsplit Ruby API, passing the remaining ARGV
# entries and the parsed options.
#
# The four main commands map directly onto `Docsplit.extract_*`. A metadata
# command prints the extracted value unless it is nil. Anything else shows
# the usage message. An ExtractionFailed error is printed and the process
# exits with status 1.
def run
  main_commands = {
    :images => :extract_images,
    :pages  => :extract_pages,
    :text   => :extract_text,
    :pdf    => :extract_pdf
  }
  if (handler = main_commands[@command])
    Docsplit.send(handler, ARGV, @options)
  elsif METADATA_KEYS.include?(@command)
    value = Docsplit.send("extract_#{@command}", ARGV, @options)
    puts value unless value.nil?
  else
    usage
  end
rescue ExtractionFailed => e
  puts e.message.chomp
  exit(1)
end
61
+
62
# Print the option parser's help text, framed by blank lines, then exit.
def usage
  $stdout.puts "\n#{@option_parser}\n"
  Kernel.exit
end
67
+
68
+
69
+ private
70
+
71
# Use the OptionParser library to parse out all supported options from ARGV
# (recognized options are removed from ARGV). The results are stored in
# @options, formatted for the Ruby API; the parser itself is kept in
# @option_parser so `usage` can print it.
#
# `--size` and `--format` take comma-separated lists. Their argument is
# optional, so a bare `-s` / `-f` leaves the option unset instead of failing
# on a nil value. Any option-parsing error (unknown option, bad argument,
# ...) is printed and the process exits with status 1.
def parse_options
  @options = {:ocr => :default, :clean => true}
  @option_parser = OptionParser.new do |opts|
    opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
      @options[:output] = d
    end
    opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
      @options[:pages] = p
    end
    opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
      @options[:size] = s.split(',') if s
    end
    opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
      @options[:format] = t.split(',') if t
    end
    opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
      @options[:ocr] = o
    end
    opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
      @options[:clean] = false
    end
    opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
      @options[:rolling] = true
    end
    opts.on_tail('-v', '--version', 'display docsplit version') do
      puts "Docsplit version #{Docsplit::VERSION}"
      exit
    end
    opts.on_tail('-h', '--help', 'display this help message') do
      usage
    end
  end
  @option_parser.banner = BANNER
  begin
    @option_parser.parse!(ARGV)
  rescue OptionParser::ParseError => e
    # ParseError is the common superclass of InvalidOption and the other
    # option-parsing errors.
    puts e.message
    exit(1)
  end
end
113
+
114
+ end
115
+
116
+ end
@@ -0,0 +1,101 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    DENSITY_ARG    = "-density 150"
    MEMORY_ARGS    = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT = :png

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options. Every requested size is rendered in every
    # requested format; with `:rolling`, each size after the first is derived
    # from the images of the previous size.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    #
    # Raises ExtractionFailed with the command output when `gm` exits with a
    # non-zero status. The temporary directory handed to GraphicsMagick is
    # removed on the way out.
    def convert(pdf, size, format, previous=nil)
      tempdir   = Dir.mktmpdir
      basename  = File.basename(pdf, File.extname(pdf))
      directory = directory_for(size)
      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
      # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        result = `MAGICK_TMPDIR="#{tempdir}" OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 "#{directory}/*.#{format}" 2>&1`.chomp
        raise ExtractionFailed, result if $? != 0
      else
        page_list(pages).each do |page|
          out_file = File.join(directory, "#{basename}_#{page}.#{format}")
          # GraphicsMagick page indexes are zero-based, hence `page - 1`.
          cmd = "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1"
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result if $? != 0
        end
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised; don't mask that error.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    # Defaults: output to '.', format DEFAULT_FORMAT, a single unsized pass.
    def extract_options(options)
      @output  = options[:output] || '.'
      @pages   = options[:pages]
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the resize argument (empty when no size was requested).
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,7": sorted, without duplicates.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          range = range.split('-')
          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,32 @@
1
module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # One regex per metadata key; each captures the rest of the matching
    # `pdfinfo` output line.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Pull out a single datum from a pdf. Only the first of `pdfs` is
    # inspected; `opts` is accepted but not used. Returns the captured text
    # (an Integer for :length), or nil when the key is absent from the output.
    # Raises ExtractionFailed when `pdfinfo` exits with a non-zero status.
    def extract(key, pdfs, opts)
      pdf    = [pdfs].flatten.first
      output = `pdfinfo "#{pdf}" 2>&1`.chomp
      raise ExtractionFailed, output if $? != 0
      found = output.match(MATCHERS[key])
      return nil unless found
      value = found[1]
      key == :length ? value.to_i : value
    end

  end

end
@@ -0,0 +1,23 @@
1
module Docsplit

  # Works out which `-Doffice.home=...` JVM argument to pass to JODConverter.
  class OfficeUtils

    # Check whether a LibreOffice installation is present and, if so, point
    # JODConverter at it.
    #
    # On macOS (RUBY_PLATFORM matches "darwin") the result is the LibreOffice
    # app bundle when it exists, otherwise an empty string. On every other
    # platform it is /usr/lib/libreoffice when that exists, otherwise
    # /usr/lib/openoffice.
    #
    # Returns a String: either a `-Doffice.home=<path>` argument or "".
    def get_office_path
      if RUBY_PLATFORM.match(/darwin/i)
        mac_libreoffice_path = "/Applications/LibreOffice.app/Contents"
        # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
        File.exist?(mac_libreoffice_path) ? "-Doffice.home=#{mac_libreoffice_path}" : ""
      else
        linux_libreoffice_path = "/usr/lib/libreoffice"
        office_home = File.exist?(linux_libreoffice_path) ? linux_libreoffice_path : "/usr/lib/openoffice"
        "-Doffice.home=#{office_home}"
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
module Docsplit

  # Delegates to **pdftk** in order to create bursted single pages from
  # a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # inside the `:output` directory (created when missing). The
    # `doc_data.txt` file found in the working directory after the burst is
    # removed. Raises ExtractionFailed with the command output when `pdftk`
    # exits with a non-zero status.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name  = File.basename(pdf, File.extname(pdf))
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
        FileUtils.mkdir_p @output unless File.exist?(@output)
        cmd    = "pdftk \"#{pdf}\" burst output \"#{page_path}\" 2>&1"
        result = `#{cmd}`.chomp
        # Remember the exit status before doing any cleanup work.
        status = $?
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result if status != 0
        result
      end
    end


    private

    # Read the output directory from the options hash (default: '.').
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,94 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first.
    #
    # The text is walked as alternating runs of whitespace and non-whitespace
    # "words". Garbage words are dropped; a whitespace run that follows a
    # dropped word is dropped too unless it contains a line break. Finally,
    # long runs of very short tokens (REPEATED) are removed.
    def clean(text)
      text    = to_ascii(text)
      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end


    private

    # Coerce UTF-8 text to ASCII. Iconv is used when it can be loaded, exactly
    # as before. Iconv is no longer part of the standard library since Ruby
    # 2.0, so when it is unavailable we fall back to String#encode, which
    # drops characters that have no ASCII equivalent.
    def to_ascii(text)
      require 'iconv' unless defined?(Iconv)
      Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
    rescue LoadError
      text.encode('ASCII', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
    end

  end

end
@@ -0,0 +1,126 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than MIN_TEXT_PER_PAGE of text (a scanned image page, or a page that
  #    just contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 200x200 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs, writing the results into the `:output`
    # directory (created when missing).
    def extract(pdfs, opts)
      extract_options opts
      # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # Start every document with an empty re-OCR list, so pages flagged in
        # one PDF are not OCR'd again against the next one.
        @pages_to_ocr = []
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded? Decided from the output of
    # `pdffonts`: output ending in the bare header rule means no fonts.
    def contains_text?(pdf)
      fonts = `pdffonts "#{pdf}" 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly. Without a page
    # list the whole document is written to a single file.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR: each page (or the
    # whole document when `pages` is nil) is rendered to a TIFF in a temporary
    # directory with GraphicsMagick and then read by tesseract. The resulting
    # text file is cleaned in place unless cleaning was disabled.
    def extract_from_ocr(pdf, pages)
      tempdir = Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      if pages
        pages.each do |page|
          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          file = "#{base_path}_#{page}"
          # GraphicsMagick page indexes are zero-based, hence `page - 1`.
          run "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} \"#{pdf}[#{page - 1}]\" \"#{tiff}\" 2>&1"
          run "tesseract \"#{tiff}\" \"#{file}\" -l eng 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff = "#{tempdir}/#{@pdf_name}.tif"
        run "MAGICK_TMPDIR=\"#{tempdir}\" OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} \"#{pdf}\" \"#{tiff}\" 2>&1"
        run "tesseract \"#{tiff}\" \"#{base_path}\" -l eng 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself raised; don't mask that error.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Rewrite a text file in place with its contents run through
    # Docsplit.clean_text.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    # Returns the command's output.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result if $? != 0
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 \"#{pdf}\" \"#{text_path}\" 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} \"#{pdf}\" \"#{text_path}\" 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
    end

    # Read the options hash. `:ocr` forces OCR when exactly true and forbids
    # it when exactly false; any other value leaves the heuristic in charge.
    # Cleaning is on unless `:clean` is exactly false.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
    end

  end

end
@@ -0,0 +1,26 @@
1
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Map every input document to the path of a PDF. Paths ending in `.pdf`
    # (in any letter case) are returned untouched; anything else is converted
    # with `extract_pdf` into the `docsplit` directory under the system
    # temporary directory, and the path of the converted file is returned.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        extension = File.extname(doc)
        next doc if extension.downcase == '.pdf'
        scratch = File.join(Dir.tmpdir, 'docsplit')
        extract_pdf([doc], {:output => scratch})
        File.join(scratch, File.basename(doc, extension) + '.pdf')
      end
    end

  end

  extend TransparentPDFs

end
@@ -0,0 +1,236 @@
1
+ //
2
+ // JODConverter Document Formats Configuration
3
+ //
4
+ [
5
+ {
6
+ "name": "Portable Document Format",
7
+ "extension": "pdf",
8
+ "mediaType": "application/pdf",
9
+ "storePropertiesByFamily": {
10
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
11
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
12
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
13
+ "TEXT": {"FilterName": "writer_pdf_Export"}
14
+ }
15
+ },
16
+ {
17
+ "name": "Macromedia Flash",
18
+ "extension": "swf",
19
+ "mediaType": "application/x-shockwave-flash",
20
+ "storePropertiesByFamily": {
21
+ "DRAWING": {"FilterName": "draw_flash_Export"},
22
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
23
+ }
24
+ },
25
+ {
26
+ "name": "HTML",
27
+ "extension": "html",
28
+ "mediaType": "text/html",
29
+ "inputFamily": "TEXT",
30
+ "storePropertiesByFamily": {
31
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
32
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
33
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
34
+ }
35
+ },
36
+ {
37
+ "name": "OpenDocument Text",
38
+ "extension": "odt",
39
+ "mediaType": "application/vnd.oasis.opendocument.text",
40
+ "inputFamily": "TEXT",
41
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
42
+ },
43
+ {
44
+ "name": "OpenOffice.org 1.0 Text Document",
45
+ "extension": "sxw",
46
+ "mediaType": "application/vnd.sun.xml.writer",
47
+ "inputFamily": "TEXT",
48
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
49
+ },
50
+ {
51
+ "name": "Microsoft Word",
52
+ "extension": "doc",
53
+ "mediaType": "application/msword",
54
+ "inputFamily": "TEXT",
55
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
56
+ },
57
+ {
58
+ "name": "Microsoft Word 2007 XML",
59
+ "extension": "docx",
60
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
61
+ "inputFamily": "TEXT"
62
+ },
63
+ {
64
+ "name": "Rich Text Format",
65
+ "extension": "rtf",
66
+ "mediaType": "text/rtf",
67
+ "inputFamily": "TEXT",
68
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
69
+ },
70
+ {
71
+ "name": "WordPerfect",
72
+ "extension": "wpd",
73
+ "mediaType": "application/wordperfect",
74
+ "inputFamily": "TEXT"
75
+ },
76
+ {
77
+ "name": "Plain Text",
78
+ "extension": "txt",
79
+ "mediaType": "text/plain",
80
+ "inputFamily": "TEXT",
81
+ "loadProperties": {
82
+ "FilterName": "Text (encoded)",
83
+ "FilterOptions": "utf8"
84
+ },
85
+ "storePropertiesByFamily": {"TEXT": {
86
+ "FilterName": "Text (encoded)",
87
+ "FilterOptions": "utf8"
88
+ }}
89
+ },
90
+ {
91
+ "name": "MediaWiki wikitext",
92
+ "extension": "wiki",
93
+ "mediaType": "text/x-wiki",
94
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
95
+ },
96
+ {
97
+ "name": "OpenDocument Spreadsheet",
98
+ "extension": "ods",
99
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
100
+ "inputFamily": "SPREADSHEET",
101
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
102
+ },
103
+ {
104
+ "name": "OpenOffice.org 1.0 Spreadsheet",
105
+ "extension": "sxc",
106
+ "mediaType": "application/vnd.sun.xml.calc",
107
+ "inputFamily": "SPREADSHEET",
108
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
109
+ },
110
+ {
111
+ "name": "Microsoft Excel",
112
+ "extension": "xls",
113
+ "mediaType": "application/vnd.ms-excel",
114
+ "inputFamily": "SPREADSHEET",
115
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
116
+ },
117
+ {
118
+ "name": "Microsoft Excel 2007 XML",
119
+ "extension": "xlsx",
120
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
121
+ "inputFamily": "SPREADSHEET"
122
+ },
123
+ {
124
+ "name": "Comma Separated Values",
125
+ "extension": "csv",
126
+ "mediaType": "text/csv",
127
+ "inputFamily": "SPREADSHEET",
128
+ "loadProperties": {
129
+ "FilterName": "Text - txt - csv (StarCalc)",
130
+ "FilterOptions": "44,34,0"
131
+ },
132
+ "storePropertiesByFamily": {"SPREADSHEET": {
133
+ "FilterName": "Text - txt - csv (StarCalc)",
134
+ "FilterOptions": "44,34,0"
135
+ }}
136
+ },
137
+ {
138
+ "name": "Tab Separated Values",
139
+ "extension": "tsv",
140
+ "mediaType": "text/tab-separated-values",
141
+ "inputFamily": "SPREADSHEET",
142
+ "loadProperties": {
143
+ "FilterName": "Text - txt - csv (StarCalc)",
144
+ "FilterOptions": "9,34,0"
145
+ },
146
+ "storePropertiesByFamily": {"SPREADSHEET": {
147
+ "FilterName": "Text - txt - csv (StarCalc)",
148
+ "FilterOptions": "9,34,0"
149
+ }}
150
+ },
151
+ {
152
+ "name": "OpenDocument Presentation",
153
+ "extension": "odp",
154
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
155
+ "inputFamily": "PRESENTATION",
156
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
157
+ },
158
+ {
159
+ "name": "OpenOffice.org 1.0 Presentation",
160
+ "extension": "sxi",
161
+ "mediaType": "application/vnd.sun.xml.impress",
162
+ "inputFamily": "PRESENTATION",
163
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
164
+ },
165
+ {
166
+ "name": "Microsoft PowerPoint",
167
+ "extension": "ppt",
168
+ "mediaType": "application/vnd.ms-powerpoint",
169
+ "inputFamily": "PRESENTATION",
170
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
171
+ },
172
+ {
173
+ "name": "Microsoft PowerPoint 2007 XML",
174
+ "extension": "pptx",
175
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
176
+ "inputFamily": "PRESENTATION"
177
+ },
178
+ {
179
+ "name": "OpenDocument Drawing",
180
+ "extension": "odg",
181
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
182
+ "inputFamily": "DRAWING",
183
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
184
+ },
185
+ {
186
+ "name": "Scalable Vector Graphics",
187
+ "extension": "svg",
188
+ "mediaType": "image/svg+xml",
189
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
190
+ },
191
+ {
192
+ "name": "Portable Network Graphic",
193
+ "extension": "png",
194
+ "mediaType": "image/png",
195
+ "storePropertiesByFamily": {
196
+ "DRAWING": {"FilterName": "draw_png_Export"},
197
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
198
+ }
199
+ },
200
+ {
201
+ "name": "Graphics Interchange Format",
202
+ "extension": "gif",
203
+ "mediaType": "image/gif",
204
+ "storePropertiesByFamily": {
205
+ "DRAWING": {"FilterName": "draw_gif_Export"},
206
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
207
+ }
208
+ },
209
+ {
210
+ "name": "Joint Photographic Experts Group",
211
+ "extension": "jpg",
212
+ "mediaType": "image/jpeg",
213
+ "storePropertiesByFamily": {
214
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
215
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
216
+ }
217
+ },
218
+ {
219
+ "name": "Windows Bitmap",
220
+ "extension": "bmp",
221
+ "mediaType": "image/bmp",
222
+ "storePropertiesByFamily": {
223
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
224
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
225
+ }
226
+ },
227
+ {
228
+ "name": "Tagged Image File Format",
229
+ "extension": "tif",
230
+ "mediaType": "image/tiff",
231
+ "storePropertiesByFamily": {
232
+ "DRAWING": {"FilterName": "draw_tif_Export"},
233
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
234
+ }
235
+ }
236
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: talentbox-docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeremy Ashkenas
9
+ - Samuel Clay
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2011-05-13 00:00:00.000000000 +02:00
14
+ default_executable:
15
+ dependencies: []
16
+ description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
+ apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
+ \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
+ (title, author, number of pages...)\n"
20
+ email: jeremy@documentcloud.org
21
+ executables:
22
+ - docsplit
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/docsplit/command_line.rb
27
+ - lib/docsplit/image_extractor.rb
28
+ - lib/docsplit/info_extractor.rb
29
+ - lib/docsplit/office_utils.rb
30
+ - lib/docsplit/page_extractor.rb
31
+ - lib/docsplit/text_cleaner.rb
32
+ - lib/docsplit/text_extractor.rb
33
+ - lib/docsplit/transparent_pdfs.rb
34
+ - lib/docsplit.rb
35
+ - lib/docsplit.rb.orig
36
+ - bin/docsplit
37
+ - vendor/conf/document-formats.js
38
+ - vendor/jodconverter/commons-cli-1.1.jar
39
+ - vendor/jodconverter/commons-io-1.4.jar
40
+ - vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
41
+ - vendor/jodconverter/json-20080701.jar
42
+ - vendor/jodconverter/juh-3.1.0.jar
43
+ - vendor/jodconverter/jurt-3.1.0.jar
44
+ - vendor/jodconverter/ridl-3.1.0.jar
45
+ - vendor/jodconverter/unoil-3.1.0.jar
46
+ - vendor/logging.properties
47
+ - docsplit.gemspec
48
+ - LICENSE
49
+ - README
50
+ has_rdoc: true
51
+ homepage: http://documentcloud.github.com/docsplit/
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project: docsplit
71
+ rubygems_version: 1.6.2
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
75
+ test_files: []