docsplit-ng 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8e86b1030bf8eb1c2fa79e0499bf1a4d4e758adf1f76beffac337bf32b339ea0
4
+ data.tar.gz: 758265547380cba476c2755fb9084aff301a6edafe7bef2b6636547e3772b813
5
+ SHA512:
6
+ metadata.gz: 8a23ad1f03a5f2d2f3bba40837bcffe658d67a73925a6c72e95862d497869322bfe0b7c99483d417d333b009612b98262f288104502b9c6b4141992d860c55f3
7
+ data.tar.gz: 33c207a89959b182a988ad09dc417aff708074054f805b0f4c7576904b15e588e53ace1add4cdea3bf2e2241ff42c5fb9282ec33a10f61cfcdb672c4e32f1c24
data/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+
3
+ Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
4
+ Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
5
+
6
+ Permission is hereby granted, free of charge, to any person
7
+ obtaining a copy of this software and associated documentation
8
+ files (the "Software"), to deal in the Software without
9
+ restriction, including without limitation the rights to use,
10
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following
13
+ conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,25 @@
1
+ ==
2
+
3
+ __ __ _ _
4
+ | ] [ | (_) / |_
5
+ .--.| | .--. .---. .--. _ .--. | | __ `| |-'______ _ .--. .--./)
6
+ / /'`\' |/ .'`\ \/ /'`\]( (`\][ '/'`\ \| | [ | | | |______|[ `.-. | / /'`\;
7
+ | \__/ || \__. || \__. `'.'. | \__/ || | | | | |, | | | | \ \._//
8
+ '.__.;__]'.__.' '.___.'[\__) )| ;.__/[___][___]\__/ [___||__].',__`
9
+ [__| ( ( __))
10
+
11
+
12
+ Docsplit-ng is a command-line utility and Ruby library for splitting apart
13
+ documents into their component parts: searchable UTF-8 plain text, page
14
+ images or thumbnails in any format, PDFs, single pages, and document
15
+ metadata (title, author, number of pages...)
16
+
17
+ Installation:
18
+ gem install docsplit-ng
19
+
20
+ For documentation, usage, and examples, see:
21
+ https://github.com/HLFH/docsplit-ng/
22
+
23
+ To suggest a feature or report a bug:
24
+ https://github.com/HLFH/docsplit-ng/issues/
25
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point. Loads the CommandLine class from the lib
# directory that sits next to this script, then instantiates it; the
# constructor parses ARGV and runs the requested command.
require File.join(File.dirname(__FILE__), '..', 'lib', 'docsplit', 'command_line.rb')

Docsplit::CommandLine.new
@@ -0,0 +1,25 @@
1
# Gem specification for docsplit-ng.
#
# The `rubyforge_project` attribute has been dropped: RubyGems deprecated it
# with no replacement, and it carries no information that is still used.
Gem::Specification.new do |s|
  s.name    = 'docsplit-ng'
  s.version = '0.8.0' # Keep version in sync with docsplit.rb
  s.date    = '2023-05-03'

  s.homepage    = "https://github.com/HLFH/docsplit-ng/"
  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  s.description = <<-EOS
    Docsplit-ng is a command-line utility and Ruby library for splitting apart
    documents into their component parts: searchable UTF-8 plain text, page
    images or thumbnails in any format, PDFs, single pages, and document
    metadata (title, author, number of pages...)
  EOS

  s.authors = ["Gaspard d'Hautefeuille", 'Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
  s.email   = 'contact@hlfh.space'
  s.license = 'MIT'

  s.require_paths = ['lib']
  s.executables   = ['docsplit']

  # Everything that ships in the packaged gem.
  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                'docsplit-ng.gemspec', 'LICENSE', 'README']
end
@@ -0,0 +1,125 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  # Instantiating the class parses ARGV and immediately runs the requested command.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.

Usage:
  docsplit COMMAND [OPTIONS] path/to/doc.pdf
  Main commands:
    pages, images, text, pdf.
  Metadata commands:
    author, date, creator, keywords, producer, subject, title, length.

Example:
  docsplit images --size 700x --format jpg document.pdf

Dependencies:
  Ruby, Java, A working GraphicsMagick (gm) command,
  and a headless OpenOffice server for non-PDF documents.

Options:
  (size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: options are
    # parsed (and removed) first, the next argument is the command, and
    # whatever remains is the list of documents.
    def initialize
      parse_options
      cmd = ARGV.shift
      @command = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions.
    # Metadata commands print their value; unknown commands print the usage.
    # An ExtractionFailed error is reported on stdout and exits with status 1.
    def run
      begin
        case @command
        when :images then Docsplit.extract_images(ARGV, @options)
        when :pages  then Docsplit.extract_pages(ARGV, @options)
        when :text   then Docsplit.extract_text(ARGV, @options)
        when :pdf    then Docsplit.extract_pdf(ARGV, @options)
        else
          if METADATA_KEYS.include?(@command)
            value = Docsplit.send("extract_#{@command}", ARGV, @options)
            puts value unless value.nil?
          else
            usage
          end
        end
      rescue ExtractionFailed => e
        puts e.message.chomp
        exit(1)
      end
    end

    # Print out the usage help message and exit.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    #
    # All value-taking switches declare an optional argument, so their block
    # receives nil when the value is omitted (e.g. `-s -f jpg`). The
    # comma-splitting switches therefore only store a value when one was given.
    def parse_options
      @options = {:ocr => :default, :clean => true}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',') if s
        end
        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',') if t
        end
        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
          @options[:density] = d
        end
        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
          @options[:ocr] = o
        end
        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
          @options[:clean] = false
        end
        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
          @options[:language] = l
        end
        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
          @options[:detect_orientation] = false
        end
        opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
          @options[:rolling] = true
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "Docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # ParseError is the common superclass of InvalidOption,
        # AmbiguousOption, NeedlessArgument and friends; report every one of
        # them the same way instead of letting some escape with a backtrace.
        puts e.message
        exit(1)
      end
    end

  end

end
@@ -0,0 +1,103 @@
1
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options. Every requested size is rendered in every
    # requested format; with `:rolling`, each size after the first is derived
    # from the images of the previous size.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        previous = nil
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format, previous) }
          previous = size if @rolling
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # If `--rolling`, and we have a previous image at a larger size to work with,
    # we simply downsample that image, instead of re-rendering the entire PDF.
    # Now we generate one page at a time, a counterintuitive optimization
    # suggested by the GraphicsMagick list, that seems to work quite well.
    #
    # Raises ExtractionFailed with the command output when `gm` exits non-zero.
    def convert(pdf, size, format, previous=nil)
      tempdir     = Dir.mktmpdir
      basename    = File.basename(pdf, File.extname(pdf))
      directory   = directory_for(size)
      pages       = @pages || '1-' + Docsplit.extract_length(pdf).to_s
      escaped_pdf = ESCAPE[pdf]
      # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
      if previous
        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
        raise ExtractionFailed, result if $? != 0
      else
        page_list(pages).each do |page|
          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
          # GraphicsMagick page indexes are zero-based, hence `page - 1`.
          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
          result = `#{cmd}`.chomp
          raise ExtractionFailed, result if $? != 0
        end
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself failed.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil size means "render at the native size".
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the resize argument.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the expanded list of requested page numbers from a string such
    # as "1-3,7": sorted, de-duplicated integers.
    def page_list(pages)
      pages.split(',').map { |range|
        if range.include?('-')
          range = range.split('-')
          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
        else
          range.to_i
        end
      }.flatten.uniq.sort
    end

  end

end
@@ -0,0 +1,50 @@
1
module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # Regex matchers for different bits of information, keyed by the metadata
    # name exposed through the API. Each captures the rest of the line.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/,
    }

    # Pull out a single datum from a pdf. Returns nil when pdfinfo did not
    # report the requested key.
    def extract(key, pdfs, opts)
      extract_all(pdfs, opts)[key]
    end

    # Run pdfinfo on the first of the given PDFs and return a hash holding
    # every key in MATCHERS that was present in its output. `:length` is
    # converted to an Integer; all other values are strings.
    #
    # Raises ExtractionFailed with the command output when pdfinfo fails.
    def extract_all(pdfs, opts)
      pdf = [pdfs].flatten.first
      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
      result = strip_invalid_bytes(result)
      info = {}
      MATCHERS.each do |key, matcher|
        match = result.match(matcher)
        answer = match && match[1]
        if answer
          answer = answer.to_i if key == :length
          info[key] = answer
        end
      end
      info
    end


    private

    # Return `text` with byte sequences that are not valid in its encoding
    # removed, so that the regex matching above cannot blow up on them.
    #
    # Where String#scrub is available, the text is treated as UTF-8 and only
    # the invalid bytes are dropped -- the same outcome the legacy iconv
    # branch ('UTF-8//IGNORE') aims for. The previous String#encode path
    # re-encoded from 'binary', which discarded every non-ASCII character as
    # soon as one byte was invalid; it is kept only as a fallback for Rubies
    # without String#scrub.
    def strip_invalid_bytes(text)
      if text.respond_to?(:scrub)
        return text if text.valid_encoding?
        text.dup.force_encoding('UTF-8').scrub('')
      elsif text.respond_to?(:encode)
        return text if text.valid_encoding?
        text.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "")
      else
        # ruby 1.8 (iconv)
        require 'iconv' unless defined?(Iconv)
        Iconv.new('UTF-8//IGNORE', 'UTF-8').iconv(text)
      end
    end

  end

end
@@ -0,0 +1,36 @@
1
module Docsplit

  # Delegates to **pdftailor** (preferred) or **pdftk** in order to create
  # bursted single pages from a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`,
    # written to `opts[:output]` (default: the current directory).
    #
    # Raises ExtractionFailed with the command output when the tool fails.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        # The "_%d.pdf" page-number pattern is appended after escaping so it
        # reaches the tool verbatim.
        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
        # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
        FileUtils.mkdir_p @output unless File.exist?(@output)

        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
        else
          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
        end
        result = `#{cmd}`.chomp
        # Capture the exit status right away so the clean-up below can never
        # interfere with the failure check.
        status = $?
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result unless status.success?
        result
      end
    end


    private

    # Read the output directory from the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
@@ -0,0 +1,166 @@
1
+ require 'rbconfig'
2
+
3
module Docsplit
  # Converts non-PDF documents into PDFs. Image formats listed in GM_FORMATS
  # go through GraphicsMagick; everything else goes through LibreOffice
  # directly, or through JODConverter for other office suites.
  class PdfExtractor
    @@executable = nil
    @@version_string = nil

    # Provide a set of helper functions to determine the OS.
    # `defined?` must be given the constant itself: a quoted name is a string
    # literal, which is always "defined".
    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
    def windows?
      !!HOST_OS.match(/mswin|windows|cygwin/i)
    end
    def osx?
      !!HOST_OS.match(/darwin/i)
    end
    def linux?
      !!HOST_OS.match(/linux/i)
    end

    # The first line of the help output holds the name and version number
    # of the office software to be used for extraction. The value is cached
    # for the lifetime of the process.
    def version_string
      unless @@version_string
        null = windows? ? "NUL" : "/dev/null"
        @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
        # NOTE(review): /[0-9]*/ also matches the empty string, so this
        # condition is always true and the `--version` output always wins.
        # Left as-is to preserve behavior -- confirm the intended check.
        if !!@@version_string.to_s.match(/[0-9]*/)
          @@version_string = `#{office_executable} --version`.split("\n").first
        end
      end
      @@version_string
    end
    # `to_s` guards against a nil version string (the office binary printed
    # nothing), which previously raised NoMethodError here.
    def libre_office?
      !!version_string.to_s.match(/^LibreOffice/)
    end
    def open_office?
      !!version_string.to_s.match(/^OpenOffice.org/)
    end

    # A set of default locations to search for office software
    # These have been extracted from JODConverter. Each listed
    # path should contain a directory "program" which in turn
    # contains the "soffice" executable.
    # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
    def office_search_paths
      if windows?
        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
        program_files_path = ENV["CommonProgramFiles"]
        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
      elsif osx?
        search_paths = %w(
          /Applications/LibreOffice.app/Contents
          /Applications/OpenOffice.org.app/Contents
        )
      else # probably linux/unix
        # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
        search_paths = %w(
          /usr/lib/libreoffice
          /usr/lib64/libreoffice
          /opt/libreoffice
          /usr/lib/openoffice
          /usr/lib64/openoffice
          /opt/openoffice.org3
          /app/vendor/libreoffice
          /usr/bin/libreoffice
          /usr/local/bin
          /usr/lib64/openoffice.org3
        )
      end
      search_paths
    end

    # Identify the path to a working office executable. The result is cached
    # for the lifetime of the process.
    #
    # Raises ArgumentError when OFFICE_PATH is set but does not exist, and
    # OfficeNotFound when no executable can be located.
    def office_executable
      paths = office_search_paths

      # If an OFFICE_PATH has been specified on the commandline
      # raise an error if that path isn't valid, otherwise, add
      # it to the front of our search paths.
      if ENV['OFFICE_PATH']
        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
        paths.unshift(ENV['OFFICE_PATH'])
      end

      # The location of the office executable is OS dependent
      path_pieces = ["soffice"]
      if windows?
        path_pieces += [["program", "soffice.bin"]]
      elsif osx?
        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
      else
        path_pieces += [["program", "soffice"]]
      end

      # Search for the first suitable office executable
      # and short circuit once an executable is found.
      paths.each do |path|
        if File.exist? path
          # A search path may point straight at the executable.
          @@executable ||= path unless File.directory? path
          path_pieces.each do |pieces|
            check_path = File.join(path, pieces)
            @@executable ||= check_path if File.exist? check_path
          end
        end
        break if @@executable
      end
      raise OfficeNotFound, "No office software found" unless @@executable
      @@executable
    end

    # Used to specify the office location for JODConverter
    def office_path
      File.dirname(File.dirname(office_executable))
    end

    # Convert documents to PDF, writing `<basename>.pdf` into `opts[:output]`
    # (default: the current directory).
    #
    # Raises ExtractionFailed when LibreOffice or JODConverter fails.
    def extract(docs, opts)
      out = opts[:output] || '.'
      FileUtils.mkdir_p out unless File.exist?(out)
      [docs].flatten.each do |doc|
        ext = File.extname(doc)
        basename = File.basename(doc, ext)
        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

        if GM_FORMATS.include?(`file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0])
          `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
        else
          if libre_office?
            # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
            # The environment is not parsed by a shell, so the raw (unescaped)
            # path is what belongs here.
            ENV['SYSUSERCONFIG']="file://#{File.expand_path(out)}"

            options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
            cmd = "#{office_executable} #{options} 2>&1"
            result = `#{cmd}`.chomp
            raise ExtractionFailed, result if $? != 0
            true
          else # open office presumably, rely on JODConverter to figure it out.
            options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
            run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
          end
        end
      end
    end

    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

    HEADLESS      = "-Djava.awt.headless=true"

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    # Returns true, or the command output (nil when empty) if `return_output`
    # is set. Raises ExtractionFailed when java exits non-zero.
    def run_jod(command, pdfs, opts, return_output=false)

      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
      # NOTE(review): outside OS X the bare office path is passed to java as
      # an argument rather than as -Doffice.home -- confirm this is intended.
      office = osx? ? "-Doffice.home=#{office_path}" : office_path
      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result if $? != 0
      return return_output ? (result.empty? ? nil : result) : true
    end

    # Raised when no LibreOffice/OpenOffice executable can be located.
    class OfficeNotFound < StandardError; end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'strscan'
2
+
3
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Return a cleaned copy of `text` with garbage words removed.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first (characters that cannot
    # be represented become '?').
    #
    # The coercion works on a copy: the caller's string is left untouched, so
    # frozen strings are accepted too.
    def clean(text)
      if String.method_defined?(:encode)
        text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
      else
        require 'iconv' unless defined?(Iconv)
        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      end

      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          # Collapse whitespace left behind by a dropped word, but always
          # keep whitespace that contains a line break.
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below flags the word.
    def garbage(w)
      acronym = w =~ ACRONYM

      # More than 30 characters in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end
@@ -0,0 +1,143 @@
1
module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    NO_TEXT_DETECTED = /---------\n\Z/

    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs into `opts[:output]` (default: the
    # current directory). Raises ExtractionFailed when an external tool fails.
    def extract(pdfs, opts)
      extract_options opts
      # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        # Start every document with an empty list, so pages queued for one
        # PDF are never re-OCR'd against the next one.
        @pages_to_ocr = []
        @pdf_name = File.basename(pdf, File.extname(pdf))
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
    end

    # Does a PDF have any text embedded? Judged from the **pdffonts** output:
    # when it ends right after the header separator, no fonts were listed.
    def contains_text?(pdf)
      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly. Without a page
    # list the whole document is written to a single file.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR. Each page is
    # rasterized to a temporary TIFF with GraphicsMagick and then handed to
    # tesseract; without a page list the whole document goes through at once.
    def extract_from_ocr(pdf, pages)
      tempdir = Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      escaped_pdf = ESCAPE[pdf]
      psm = @detect_orientation ? "-psm 1" : ""
      if pages
        pages.each do |page|
          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
          escaped_tiff = ESCAPE[tiff]
          file = "#{base_path}_#{page}"
          # GraphicsMagick page indexes are zero-based, hence `page - 1`.
          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
          clean_text(file + '.txt') if @clean_ocr
          FileUtils.remove_entry_secure tiff
        end
      else
        tiff = "#{tempdir}/#{@pdf_name}.tif"
        escaped_tiff = ESCAPE[tiff]
        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
        # If the user disabled orientation detection, or the osd data is not
        # installed, no page segmentation mode is passed (psm is empty).
        # The output base is shell-escaped like in the per-page branch above.
        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
        clean_text(base_path + '.txt') if @clean_ocr
      end
    ensure
      # tempdir is nil if Dir.mktmpdir itself failed.
      FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
    end


    private

    # Rewrite a text file in place with its cleaned-up contents.
    def clean_text(file)
      File.open(file, 'r+') do |f|
        text = f.read
        f.truncate(0)
        f.rewind
        f.write(Docsplit.clean_text(text))
      end
    end

    # Run an external process and raise an exception if it fails.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result if $? != 0
      result
    end

    # Run pdftotext command
    def run_pdftotext(pdf, text_path, options=[])
      options << '-enc UTF-8'
      options << '-layout' if @keep_layout

      run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run_pdftotext pdf, text_path
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]

      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
    end

    # Read the extraction settings from the options hash. `:ocr` is
    # tri-state: true forces OCR, false forbids it, anything else leaves the
    # decision to the heuristic. Cleaning is only applied to English text.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
      @language   = options[:language] || 'eng'
      @clean_ocr  = (!(options[:clean] == false) && @language == 'eng')
      @detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
      @keep_layout = options.fetch(:layout, false)
    end

  end

end
@@ -0,0 +1,29 @@
1
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction. Returns the list of PDF paths, in the same
    # order as the input; converted documents live in a "docsplit" directory
    # under the system temp dir.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        if is_pdf?(doc)
          doc
        else
          tempdir = File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => tempdir})
          File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
        end
      end
    end

    # Is the document a PDF? True when the extension is `.pdf` (any case) or
    # when the file starts with a `%PDF-<version>` header.
    #
    # Only the first few bytes are read, so an empty file simply answers
    # false (it used to raise EOFError). Always returns true or false.
    def is_pdf?(doc)
      return true if File.extname(doc).downcase == '.pdf'
      header = File.open(doc, 'rb') { |f| f.read(16) }
      !!(header.to_s =~ /\A\%PDF-\d+(\.\d+)?/)
    end

  end

  extend TransparentPDFs

end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,109 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
5
+ # The Docsplit module delegates to the Java PDF extractors.
6
module Docsplit

  VERSION       = '0.8.0' # Keep in sync with gemspec.

  # Shell-escape a single command-line argument.
  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
  ESCAPED_ROOT  = ESCAPE[ROOT]

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # MIME types that are converted to PDF with GraphicsMagick.
  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

  # External tools, flagged true below when found.
  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}

  # Check for all dependencies, and note their absence. Every key is looked
  # up as an executable on the PATH; an unset PATH is treated as empty.
  dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    dirs.each do |dir|
      if File.executable?(File.join(dir, dep.to_s))
        DEPENDENCIES[dep] = true
        break
      end
    end
  end

  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
  if DEPENDENCIES[:tesseract]
    # osd will be listed in tesseract --list-langs. Depending on the tesseract
    # version the list goes to stdout or to stderr, so capture both.
    val = `tesseract --list-langs 2>&1`
    DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
  end

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Burst the given documents into single-page PDFs (see PageExtractor).
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of the given documents (see TextExtractor).
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize the given documents into one image per page (see
  # ImageExtractor). `opts[:pages]` may be a Range, an Array or a String and
  # is normalized to the "1,2,3" string form before being passed on.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs.
  # If the document is in an image format, use GraphicsMagick to extract the PDF.
  def self.extract_pdf(docs, opts={})
    PdfExtractor.new.extract(docs, opts)
  end

  # Define custom methods for each of the metadata keys that we support:
  # Docsplit.extract_title, Docsplit.extract_length, and so on. Each returns
  # a single bit of metadata (see InfoExtractor).
  METADATA_KEYS.each do |key|
    define_singleton_method(:"extract_#{key}") do |pdfs, opts={}|
      pdfs = ensure_pdfs(pdfs)
      InfoExtractor.new.extract(key, pdfs, opts)
    end
  end

  # Return a hash with all available metadata of the first given document.
  def self.extract_info(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    InfoExtractor.new.extract_all(pdfs, opts)
  end

  # Utility method to clean OCR'd text with garbage characters.
  def self.clean_text(text)
    TextCleaner.new.clean(text)
  end

  # Normalize a value in an options hash for the command line.
  # Ranges look like: 1-10, Arrays like: 1,2,3.
  #
  # Always returns a String and never modifies the value passed in. The
  # method remains publicly callable, as it always was: the bare `private`
  # that used to precede it had no effect on `def self.` methods.
  def self.normalize_value(value)
    case value
    when Range then value.to_a.join(',')
    when Array then value.map {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
    else            value.to_s
    end
  end

end
102
+
103
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
104
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
105
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
106
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
107
+ require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
108
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
109
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -0,0 +1,233 @@
1
+ [
2
+ {
3
+ "name": "Portable Document Format",
4
+ "extension": "pdf",
5
+ "mediaType": "application/pdf",
6
+ "storePropertiesByFamily": {
7
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
8
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
9
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
10
+ "TEXT": {"FilterName": "writer_pdf_Export"}
11
+ }
12
+ },
13
+ {
14
+ "name": "Macromedia Flash",
15
+ "extension": "swf",
16
+ "mediaType": "application/x-shockwave-flash",
17
+ "storePropertiesByFamily": {
18
+ "DRAWING": {"FilterName": "draw_flash_Export"},
19
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
20
+ }
21
+ },
22
+ {
23
+ "name": "HTML",
24
+ "extension": "html",
25
+ "mediaType": "text/html",
26
+ "inputFamily": "TEXT",
27
+ "storePropertiesByFamily": {
28
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
29
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
30
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
31
+ }
32
+ },
33
+ {
34
+ "name": "OpenDocument Text",
35
+ "extension": "odt",
36
+ "mediaType": "application/vnd.oasis.opendocument.text",
37
+ "inputFamily": "TEXT",
38
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
39
+ },
40
+ {
41
+ "name": "OpenOffice.org 1.0 Text Document",
42
+ "extension": "sxw",
43
+ "mediaType": "application/vnd.sun.xml.writer",
44
+ "inputFamily": "TEXT",
45
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
46
+ },
47
+ {
48
+ "name": "Microsoft Word",
49
+ "extension": "doc",
50
+ "mediaType": "application/msword",
51
+ "inputFamily": "TEXT",
52
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
53
+ },
54
+ {
55
+ "name": "Microsoft Word 2007 XML",
56
+ "extension": "docx",
57
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
58
+ "inputFamily": "TEXT"
59
+ },
60
+ {
61
+ "name": "Rich Text Format",
62
+ "extension": "rtf",
63
+ "mediaType": "text/rtf",
64
+ "inputFamily": "TEXT",
65
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
66
+ },
67
+ {
68
+ "name": "WordPerfect",
69
+ "extension": "wpd",
70
+ "mediaType": "application/wordperfect",
71
+ "inputFamily": "TEXT"
72
+ },
73
+ {
74
+ "name": "Plain Text",
75
+ "extension": "txt",
76
+ "mediaType": "text/plain",
77
+ "inputFamily": "TEXT",
78
+ "loadProperties": {
79
+ "FilterName": "Text (encoded)",
80
+ "FilterOptions": "utf8"
81
+ },
82
+ "storePropertiesByFamily": {"TEXT": {
83
+ "FilterName": "Text (encoded)",
84
+ "FilterOptions": "utf8"
85
+ }}
86
+ },
87
+ {
88
+ "name": "MediaWiki wikitext",
89
+ "extension": "wiki",
90
+ "mediaType": "text/x-wiki",
91
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
92
+ },
93
+ {
94
+ "name": "OpenDocument Spreadsheet",
95
+ "extension": "ods",
96
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
97
+ "inputFamily": "SPREADSHEET",
98
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
99
+ },
100
+ {
101
+ "name": "OpenOffice.org 1.0 Spreadsheet",
102
+ "extension": "sxc",
103
+ "mediaType": "application/vnd.sun.xml.calc",
104
+ "inputFamily": "SPREADSHEET",
105
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106
+ },
107
+ {
108
+ "name": "Microsoft Excel",
109
+ "extension": "xls",
110
+ "mediaType": "application/vnd.ms-excel",
111
+ "inputFamily": "SPREADSHEET",
112
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113
+ },
114
+ {
115
+ "name": "Microsoft Excel 2007 XML",
116
+ "extension": "xlsx",
117
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118
+ "inputFamily": "SPREADSHEET"
119
+ },
120
+ {
121
+ "name": "Comma Separated Values",
122
+ "extension": "csv",
123
+ "mediaType": "text/csv",
124
+ "inputFamily": "SPREADSHEET",
125
+ "loadProperties": {
126
+ "FilterName": "Text - txt - csv (StarCalc)",
127
+ "FilterOptions": "44,34,0"
128
+ },
129
+ "storePropertiesByFamily": {"SPREADSHEET": {
130
+ "FilterName": "Text - txt - csv (StarCalc)",
131
+ "FilterOptions": "44,34,0"
132
+ }}
133
+ },
134
+ {
135
+ "name": "Tab Separated Values",
136
+ "extension": "tsv",
137
+ "mediaType": "text/tab-separated-values",
138
+ "inputFamily": "SPREADSHEET",
139
+ "loadProperties": {
140
+ "FilterName": "Text - txt - csv (StarCalc)",
141
+ "FilterOptions": "9,34,0"
142
+ },
143
+ "storePropertiesByFamily": {"SPREADSHEET": {
144
+ "FilterName": "Text - txt - csv (StarCalc)",
145
+ "FilterOptions": "9,34,0"
146
+ }}
147
+ },
148
+ {
149
+ "name": "OpenDocument Presentation",
150
+ "extension": "odp",
151
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
152
+ "inputFamily": "PRESENTATION",
153
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154
+ },
155
+ {
156
+ "name": "OpenOffice.org 1.0 Presentation",
157
+ "extension": "sxi",
158
+ "mediaType": "application/vnd.sun.xml.impress",
159
+ "inputFamily": "PRESENTATION",
160
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161
+ },
162
+ {
163
+ "name": "Microsoft PowerPoint",
164
+ "extension": "ppt",
165
+ "mediaType": "application/vnd.ms-powerpoint",
166
+ "inputFamily": "PRESENTATION",
167
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168
+ },
169
+ {
170
+ "name": "Microsoft PowerPoint 2007 XML",
171
+ "extension": "pptx",
172
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173
+ "inputFamily": "PRESENTATION"
174
+ },
175
+ {
176
+ "name": "OpenDocument Drawing",
177
+ "extension": "odg",
178
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
179
+ "inputFamily": "DRAWING",
180
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181
+ },
182
+ {
183
+ "name": "Scalable Vector Graphics",
184
+ "extension": "svg",
185
+ "mediaType": "image/svg+xml",
186
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187
+ },
188
+ {
189
+ "name": "Portable Network Graphic",
190
+ "extension": "png",
191
+ "mediaType": "image/png",
192
+ "storePropertiesByFamily": {
193
+ "DRAWING": {"FilterName": "draw_png_Export"},
194
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
195
+ }
196
+ },
197
+ {
198
+ "name": "Graphics Interchange Format",
199
+ "extension": "gif",
200
+ "mediaType": "image/gif",
201
+ "storePropertiesByFamily": {
202
+ "DRAWING": {"FilterName": "draw_gif_Export"},
203
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
204
+ }
205
+ },
206
+ {
207
+ "name": "Joint Photographic Experts Group",
208
+ "extension": "jpg",
209
+ "mediaType": "image/jpeg",
210
+ "storePropertiesByFamily": {
211
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
212
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213
+ }
214
+ },
215
+ {
216
+ "name": "Windows Bitmap",
217
+ "extension": "bmp",
218
+ "mediaType": "image/bmp",
219
+ "storePropertiesByFamily": {
220
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
221
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222
+ }
223
+ },
224
+ {
225
+ "name": "Tagged Image File Format",
226
+ "extension": "tif",
227
+ "mediaType": "image/tiff",
228
+ "storePropertiesByFamily": {
229
+ "DRAWING": {"FilterName": "draw_tif_Export"},
230
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
231
+ }
232
+ }
233
+ ]
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docsplit-ng
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.8.0
5
+ platform: ruby
6
+ authors:
7
+ - Gaspard d'Hautefeuille
8
+ - Jeremy Ashkenas
9
+ - Samuel Clay
10
+ - Ted Han
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2023-05-03 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description: |2
17
+ Docsplit-ng is a command-line utility and Ruby library for splitting apart
18
+ documents into their component parts: searchable UTF-8 plain text, page
19
+ images or thumbnails in any format, PDFs, single pages, and document
20
+ metadata (title, author, number of pages...)
21
+ email: contact@hlfh.space
22
+ executables:
23
+ - docsplit
24
+ extensions: []
25
+ extra_rdoc_files: []
26
+ files:
27
+ - LICENSE
28
+ - README
29
+ - bin/docsplit
30
+ - docsplit-ng.gemspec
31
+ - lib/docsplit.rb
32
+ - lib/docsplit/command_line.rb
33
+ - lib/docsplit/image_extractor.rb
34
+ - lib/docsplit/info_extractor.rb
35
+ - lib/docsplit/page_extractor.rb
36
+ - lib/docsplit/pdf_extractor.rb
37
+ - lib/docsplit/text_cleaner.rb
38
+ - lib/docsplit/text_extractor.rb
39
+ - lib/docsplit/transparent_pdfs.rb
40
+ - vendor/conf/document-formats.js
41
+ - vendor/jodconverter/commons-cli-1.1.jar
42
+ - vendor/jodconverter/commons-io-1.4.jar
43
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
44
+ - vendor/jodconverter/json-20090211.jar
45
+ - vendor/jodconverter/juh-3.2.1.jar
46
+ - vendor/jodconverter/jurt-3.2.1.jar
47
+ - vendor/jodconverter/ridl-3.2.1.jar
48
+ - vendor/jodconverter/unoil-3.2.1.jar
49
+ - vendor/logging.properties
50
+ homepage: https://github.com/HLFH/docsplit-ng/
51
+ licenses:
52
+ - MIT
53
+ metadata: {}
54
+ post_install_message:
55
+ rdoc_options: []
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubygems_version: 3.0.3
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
73
+ test_files: []