docsplit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +0 -1
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +30 -7
- data/lib/docsplit/command_line.rb +9 -3
- data/lib/docsplit/image_extractor.rb +27 -8
- data/lib/docsplit/info_extractor.rb +32 -0
- data/lib/docsplit/page_extractor.rb +31 -0
- data/lib/docsplit/text_extractor.rb +93 -35
- metadata +7 -26
- data/build/org/documentcloud/ExtractInfo$1.class +0 -0
- data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
- data/build/org/documentcloud/ExtractInfo.class +0 -0
- data/build/org/documentcloud/ExtractPages.class +0 -0
- data/build/org/documentcloud/ExtractText.class +0 -0
- data/build/org/documentcloud/Extractor.class +0 -0
- data/lib/docsplit/ExtractInfo.java +0 -63
- data/lib/docsplit/ExtractPages.java +0 -54
- data/lib/docsplit/ExtractText.java +0 -80
- data/lib/docsplit/Extractor.java +0 -91
- data/lib/docsplit/argument_parser.rb +0 -31
- data/vendor/bcmail.jar +0 -0
- data/vendor/bcprov.jar +0 -0
- data/vendor/commons-logging.jar +0 -0
- data/vendor/fontbox.jar +0 -0
- data/vendor/pdfbox.jar +0 -0
data/LICENSE
CHANGED
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.
|
4
|
-
s.date = '2010-
|
3
|
+
s.version = '0.3.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-5'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.
|
4
|
+
VERSION = '0.3.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -13,6 +13,20 @@ module Docsplit
|
|
13
13
|
|
14
14
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
15
15
|
|
16
|
+
# Availability of each external command-line tool, keyed by executable name.
# Every entry starts out false and is filled in by the PATH scan below.
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

# Check for all dependencies, and warn of their absence.
# A missing PATH is treated as an empty search path instead of crashing at
# load time, and only regular files count as a hit (File.executable? is
# also true for searchable directories).
dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
DEPENDENCIES.each_key do |dep|
  DEPENDENCIES[dep] = dirs.any? do |dir|
    candidate = File.join(dir, dep.to_s)
    File.file?(candidate) && File.executable?(candidate)
  end
  warn "Warning: Docsplit dependency #{dep} not found." unless DEPENDENCIES[dep]
end
|
29
|
+
|
16
30
|
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
17
31
|
# broke.
|
18
32
|
class ExtractionFailed < StandardError; end
|
@@ -20,7 +34,7 @@ module Docsplit
|
|
20
34
|
# Use the ExtractPages Java class to burst a PDF into single pages.
|
21
35
|
def self.extract_pages(pdfs, opts={})
|
22
36
|
pdfs = ensure_pdfs(pdfs)
|
23
|
-
|
37
|
+
PageExtractor.new.extract(pdfs, opts)
|
24
38
|
end
|
25
39
|
|
26
40
|
# Use the ExtractText Java class to write out all embedded text.
|
@@ -50,8 +64,7 @@ module Docsplit
|
|
50
64
|
instance_eval <<-EOS
|
51
65
|
def self.extract_#{key}(pdfs, opts={})
|
52
66
|
pdfs = ensure_pdfs(pdfs)
|
53
|
-
|
54
|
-
:#{key} == :length ? result.to_i : result
|
67
|
+
InfoExtractor.new.extract(:#{key}, pdfs, opts)
|
55
68
|
end
|
56
69
|
EOS
|
57
70
|
end
|
@@ -62,18 +75,28 @@ module Docsplit
|
|
62
75
|
# Runs a Java command, with quieted logging, and the classpath set properly.
# Each PDF path is wrapped in double quotes before being appended to the
# command line. Raises ExtractionFailed (carrying the combined stdout/stderr)
# when the process exits non-zero. Returns true, or -- when return_output is
# set -- the captured output (nil if it was empty).
def self.run(command, pdfs, opts, return_output=false)
  quoted = [pdfs].flatten.map {|path| "\"#{path}\"" }
  cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{quoted.join(' ')} 2>&1"
  output = `#{cmd}`.chomp
  raise ExtractionFailed, output if $? != 0
  return true unless return_output
  output.empty? ? nil : output
end
|
71
83
|
|
84
|
+
# Normalize a value in an options hash for the command line.
# Ranges look like: 1-10, Arrays like: 1,2,3.
#
# Returns a String. The argument itself is never modified: arrays are
# mapped into a new array, so a caller's page list keeps its Range entries.
def self.normalize_value(value)
  case value
  when Range then normalize_range(value)
  when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
  else value.to_s
  end
end

# Serialize a Ruby range into its command-line equivalent ("first-last").
# An empty range collapses to just its first element. Helper for
# normalize_value.
def self.normalize_range(range)
  arr = range.to_a
  arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
end
|
93
|
+
|
72
94
|
end
|
73
95
|
|
74
96
|
require 'tmpdir'
|
75
97
|
require 'fileutils'
|
76
98
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
77
|
-
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
|
78
99
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
79
100
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
101
|
+
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
102
|
+
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
@@ -8,7 +8,7 @@ module Docsplit
|
|
8
8
|
|
9
9
|
BANNER = <<-EOS
|
10
10
|
docsplit breaks apart documents into images, text, or individual pages.
|
11
|
-
It wraps
|
11
|
+
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
|
12
12
|
|
13
13
|
Usage:
|
14
14
|
docsplit COMMAND [OPTIONS] path/to/doc.pdf
|
@@ -71,7 +71,7 @@ Options:
|
|
71
71
|
# Use the OptionParser library to parse out all supported options. Return
|
72
72
|
# options formatted for the Ruby API.
|
73
73
|
def parse_options
|
74
|
-
@options = {}
|
74
|
+
@options = {:ocr => :default}
|
75
75
|
@option_parser = OptionParser.new do |opts|
|
76
76
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
77
77
|
@options[:output] = d
|
@@ -85,8 +85,14 @@ Options:
|
|
85
85
|
opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
|
86
86
|
@options[:format] = t.split(',')
|
87
87
|
end
|
88
|
+
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
89
|
+
@options[:ocr] = o
|
90
|
+
end
|
91
|
+
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
92
|
+
@options[:rolling] = true
|
93
|
+
end
|
88
94
|
opts.on_tail('-v', '--version', 'display docsplit version') do
|
89
|
-
puts "
|
95
|
+
puts "Docsplit version #{Docsplit::VERSION}"
|
90
96
|
exit
|
91
97
|
end
|
92
98
|
opts.on_tail('-h', '--help', 'display this help message') do
|
@@ -4,26 +4,37 @@ module Docsplit
|
|
4
4
|
# nicely sized images.
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
|
-
DENSITY_ARG
|
8
|
-
MEMORY_ARGS
|
9
|
-
DEFAULT_FORMAT
|
7
|
+
DENSITY_ARG = "-density 150"
|
8
|
+
MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
|
9
|
+
DEFAULT_FORMAT = :png
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
# configuration in options. Every PDF is converted once per requested size
# and format; in rolling mode each size after the first is handed the
# previously generated size so it can be derived from it.
def extract(pdfs, options)
  @pdfs = [pdfs].flatten
  extract_options(options)
  @pdfs.each do |pdf|
    previous = nil
    @sizes.each do |size|
      @formats.each do |format|
        convert(pdf, size, format, previous)
      end
      previous = size if @rolling
    end
  end
end
|
18
24
|
|
19
25
|
# Convert a single PDF into page images at the specified size and format.
|
20
|
-
def convert(pdf, size, format)
|
26
|
+
def convert(pdf, size, format, previous=nil)
|
21
27
|
basename = File.basename(pdf, File.extname(pdf))
|
22
|
-
|
23
|
-
directory = File.join(@output, subfolder)
|
28
|
+
directory = directory_for(size)
|
24
29
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
25
30
|
out_file = File.join(directory, "#{basename}_%05d.#{format}")
|
26
|
-
|
31
|
+
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
32
|
+
if previous
|
33
|
+
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
34
|
+
cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
|
+
else
|
36
|
+
cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
|
+
end
|
27
38
|
result = `#{cmd}`.chomp
|
28
39
|
raise ExtractionFailed, result if $? != 0
|
29
40
|
renumber_images(out_file, format)
|
@@ -39,6 +50,14 @@ module Docsplit
|
|
39
50
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
40
51
|
@sizes = [options[:size]].flatten.compact
|
41
52
|
@sizes = [nil] if @sizes.empty?
|
53
|
+
@rolling = !!options[:rolling]
|
54
|
+
end
|
55
|
+
|
56
|
+
# With a single requested size, images are written straight into the output
# directory; with several sizes, each one gets a subdirectory named after
# the size. Returns the absolute path.
def directory_for(size)
  return File.expand_path(@output) if @sizes.length == 1
  File.expand_path(File.join(@output, size))
end
|
43
62
|
|
44
63
|
# Generate the resize argument.
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'shellwords'

module Docsplit

  # Delegates to **pdfinfo** in order to extract information about a PDF file.
  class InfoExtractor

    # Regex matchers for different bits of information. Each one captures the
    # rest of the line that follows the field label in pdfinfo's output.
    MATCHERS = {
      :author   => /^Author:\s+([^\n]+)/,
      :date     => /^CreationDate:\s+([^\n]+)/,
      :creator  => /^Creator:\s+([^\n]+)/,
      :keywords => /^Keywords:\s+([^\n]+)/,
      :producer => /^Producer:\s+([^\n]+)/,
      :subject  => /^Subject:\s+([^\n]+)/,
      :title    => /^Title:\s+([^\n]+)/,
      :length   => /^Pages:\s+([^\n]+)/
    }.freeze

    # Pull out a single datum from a pdf.
    #
    # key  - one of the MATCHERS keys (:author, :date, ..., :length).
    # pdfs - a path or a list of paths; only the first one is inspected.
    # opts - accepted for interface parity with the other extractors; unused.
    #
    # Returns the matched String (an Integer for :length), or nil when
    # pdfinfo's output has no line for that field. Raises ArgumentError for
    # an unknown key and ExtractionFailed (carrying pdfinfo's output) when
    # pdfinfo exits non-zero.
    def extract(key, pdfs, opts)
      matcher = MATCHERS[key]
      raise ArgumentError, "unknown metadata key: #{key.inspect}" unless matcher
      pdf = [pdfs].flatten.first
      # Escape the path so names with spaces or shell metacharacters reach
      # pdfinfo as a single, literal argument.
      cmd = "pdfinfo #{Shellwords.escape(pdf)} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      match = result.match(matcher)
      answer = match && match[1]
      answer = answer.to_i if answer && key == :length
      answer
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'shellwords'

module Docsplit

  # Delegates to **pdftk** in order to create bursted single pages from
  # a PDF document.
  class PageExtractor

    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
    #
    # pdfs - a path or a list of paths.
    # opts - options hash; :output names the destination directory
    #        (defaults to the current directory).
    #
    # Raises ExtractionFailed (carrying pdftk's output) as soon as one
    # document fails to burst.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
        FileUtils.mkdir_p @output unless File.exist?(@output)
        # Escape both paths so spaces and shell metacharacters survive the
        # trip through the shell as single arguments.
        cmd = "pdftk #{Shellwords.escape(pdf)} burst output #{Shellwords.escape(page_path)} 2>&1"
        result = `#{cmd}`.chomp
        # Capture the exit status before doing any cleanup work.
        succeeded = $?.success?
        # Remove the doc_data.txt report from the working directory, if present.
        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
        raise ExtractionFailed, result unless succeeded
        result
      end
    end


    private

    # Read the options this extractor understands out of the options hash.
    def extract_options(options)
      @output = options[:output] || '.'
    end

  end

end
|
@@ -1,53 +1,111 @@
|
|
1
1
|
require 'shellwords'

module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**, if the page has less
  #    than 100 bytes of text (a scanned image page, or a page that just
  #    contains a filename and a page number), then add it to the list of
  #    `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  class TextExtractor

    # Matched against the end of pdffonts' output: when the output ends with
    # the dashed line, no fonts were listed after it.
    NO_TEXT_DETECTED = /---------\n\Z/

    # GraphicsMagick flags used when rasterizing pages for OCR.
    OCR_FLAGS   = '-density 200x200 -colorspace GRAY'

    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @tiffs_generated = false
      @pages_to_ocr = []
    end

    # Extract text from a list of PDFs.
    #
    # pdfs - a path or a list of paths.
    # opts - options hash: :output (destination directory, default '.'),
    #        :pages ('all', or a list of page numbers; nil extracts one text
    #        file for the whole document), :ocr (true forces OCR, false
    #        forbids it, anything else uses the heuristic described above).
    #
    # Raises ExtractionFailed when any external command exits non-zero.
    def extract(pdfs, opts)
      extract_options opts
      FileUtils.mkdir_p @output unless File.exist?(@output)
      [pdfs].flatten.each do |pdf|
        @pdf_name = File.basename(pdf, File.extname(pdf))
        # OCR bookkeeping is per document: TIFFs rendered for (and pages
        # flagged in) one PDF must not leak into the next one.
        @tiffs_generated = false
        @pages_to_ocr = []
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
      nil
    ensure
      # Clean up the scratch directory even when an extraction failed, and
      # forget it so a later call creates a fresh one.
      if @tempdir
        FileUtils.remove_entry_secure @tempdir
        @tempdir = nil
      end
    end

    # Does a PDF have any text embedded?
    def contains_text?(pdf)
      fonts = `pdffonts #{esc(pdf)} 2>&1`
      !fonts.match(NO_TEXT_DETECTED)
    end

    # Extract a page range worth of text from a PDF, directly.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR.
    def extract_from_ocr(pdf, pages)
      @tempdir ||= Dir.mktmpdir
      base_path = File.join(@output, @pdf_name)
      if pages
        unless @tiffs_generated
          tiff_pattern = "#{@tempdir}/#{@pdf_name}_%d.tif"
          run "gm convert +adjoin #{OCR_FLAGS} #{esc(pdf)} #{esc(tiff_pattern)} 2>&1"
          @tiffs_generated = true
        end
        pages.each do |page|
          # The TIFFs are numbered from zero, pages from one.
          tiff = "#{@tempdir}/#{@pdf_name}_#{page - 1}.tif"
          run "tesseract #{esc(tiff)} #{esc("#{base_path}_#{page}")} 2>&1"
        end
      else
        tiff = "#{@tempdir}/#{@pdf_name}.tif"
        run "gm convert #{OCR_FLAGS} #{esc(pdf)} #{esc(tiff)} 2>&1"
        run "tesseract #{esc(tiff)} #{esc(base_path)} -l eng 2>&1"
      end
    end


    private

    # Escape a path for safe interpolation into a shell command line.
    def esc(path)
      Shellwords.escape(path)
    end

    # Run an external process and raise an exception if it fails.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result unless $?.success?
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 #{esc(pdf)} #{esc(text_path)} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{esc(pdf)} #{esc(text_path)} 2>&1"
      unless @forbid_ocr
        # The threshold is in bytes, so measure the file rather than the
        # (possibly multi-byte) character count.
        @pages_to_ocr.push(page) if File.size(text_path) < MIN_TEXT_PER_PAGE
      end
    end

    # Read the options this extractor understands out of the options hash.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
    end

  end

end
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 23
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
-
|
7
|
+
- 3
|
9
8
|
- 0
|
10
|
-
version: 0.
|
9
|
+
version: 0.3.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jeremy Ashkenas
|
@@ -16,7 +15,7 @@ autorequire:
|
|
16
15
|
bindir: bin
|
17
16
|
cert_chain: []
|
18
17
|
|
19
|
-
date: 2010-
|
18
|
+
date: 2010-08-05 00:00:00 -04:00
|
20
19
|
default_executable:
|
21
20
|
dependencies: []
|
22
21
|
|
@@ -29,27 +28,14 @@ extensions: []
|
|
29
28
|
extra_rdoc_files: []
|
30
29
|
|
31
30
|
files:
|
32
|
-
- build/org/documentcloud/ExtractInfo$1.class
|
33
|
-
- build/org/documentcloud/ExtractInfo$Keys.class
|
34
|
-
- build/org/documentcloud/ExtractInfo.class
|
35
|
-
- build/org/documentcloud/Extractor.class
|
36
|
-
- build/org/documentcloud/ExtractPages.class
|
37
|
-
- build/org/documentcloud/ExtractText.class
|
38
|
-
- lib/docsplit/argument_parser.rb
|
39
31
|
- lib/docsplit/command_line.rb
|
40
|
-
- lib/docsplit/ExtractInfo.java
|
41
|
-
- lib/docsplit/Extractor.java
|
42
|
-
- lib/docsplit/ExtractPages.java
|
43
|
-
- lib/docsplit/ExtractText.java
|
44
32
|
- lib/docsplit/image_extractor.rb
|
33
|
+
- lib/docsplit/info_extractor.rb
|
34
|
+
- lib/docsplit/page_extractor.rb
|
45
35
|
- lib/docsplit/text_extractor.rb
|
46
36
|
- lib/docsplit/transparent_pdfs.rb
|
47
37
|
- lib/docsplit.rb
|
48
38
|
- bin/docsplit
|
49
|
-
- vendor/bcmail.jar
|
50
|
-
- vendor/bcprov.jar
|
51
|
-
- vendor/commons-logging.jar
|
52
|
-
- vendor/fontbox.jar
|
53
39
|
- vendor/jodconverter/commons-cli-1.2.jar
|
54
40
|
- vendor/jodconverter/commons-io-1.4.jar
|
55
41
|
- vendor/jodconverter/jodconverter-2.2.2.jar
|
@@ -61,11 +47,10 @@ files:
|
|
61
47
|
- vendor/jodconverter/slf4j-jdk14-1.5.6.jar
|
62
48
|
- vendor/jodconverter/unoil-3.0.1.jar
|
63
49
|
- vendor/logging.properties
|
64
|
-
- vendor/pdfbox.jar
|
65
50
|
- docsplit.gemspec
|
66
51
|
- LICENSE
|
67
52
|
- README
|
68
|
-
has_rdoc:
|
53
|
+
has_rdoc: false
|
69
54
|
homepage: http://documentcloud.github.com/docsplit/
|
70
55
|
licenses: []
|
71
56
|
|
@@ -75,27 +60,23 @@ rdoc_options: []
|
|
75
60
|
require_paths:
|
76
61
|
- lib
|
77
62
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
-
none: false
|
79
63
|
requirements:
|
80
64
|
- - ">="
|
81
65
|
- !ruby/object:Gem::Version
|
82
|
-
hash: 3
|
83
66
|
segments:
|
84
67
|
- 0
|
85
68
|
version: "0"
|
86
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
-
none: false
|
88
70
|
requirements:
|
89
71
|
- - ">="
|
90
72
|
- !ruby/object:Gem::Version
|
91
|
-
hash: 3
|
92
73
|
segments:
|
93
74
|
- 0
|
94
75
|
version: "0"
|
95
76
|
requirements: []
|
96
77
|
|
97
78
|
rubyforge_project: docsplit
|
98
|
-
rubygems_version: 1.3.
|
79
|
+
rubygems_version: 1.3.6
|
99
80
|
signing_key:
|
100
81
|
specification_version: 3
|
101
82
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,63 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.IOException;
|
5
|
-
import java.text.SimpleDateFormat;
|
6
|
-
|
7
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
8
|
-
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
9
|
-
|
10
|
-
// Extracts metadata from a PDF file.
|
11
|
-
public class ExtractInfo extends Extractor {
|
12
|
-
|
13
|
-
private PDDocument doc;
|
14
|
-
private PDDocumentInformation info;
|
15
|
-
private String key;
|
16
|
-
|
17
|
-
// The list of metadata keys we know how to extract.
|
18
|
-
private enum Keys {
|
19
|
-
AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
|
20
|
-
}
|
21
|
-
|
22
|
-
// The mainline.
|
23
|
-
public static void main(String[] args) {
|
24
|
-
(new ExtractInfo()).run(args);
|
25
|
-
}
|
26
|
-
|
27
|
-
// The first argument is always the name of the metadata key.
|
28
|
-
protected void parseArguments(List<String> args) {
|
29
|
-
super.parseArguments(args);
|
30
|
-
key = args.remove(0).toUpperCase();
|
31
|
-
}
|
32
|
-
|
33
|
-
// Extract the configured bit of metadata from a PDF, decrypting if necessary.
|
34
|
-
public void extract(String pdfPath) {
|
35
|
-
try {
|
36
|
-
doc = PDDocument.load(pdfPath, false);
|
37
|
-
decrypt(doc);
|
38
|
-
info = doc.getDocumentInformation();
|
39
|
-
String val = extractInfo();
|
40
|
-
if (val != null) System.out.println(val);
|
41
|
-
doc.close();
|
42
|
-
} catch(IOException e) {
|
43
|
-
System.out.println(e.getMessage());
|
44
|
-
System.exit(1);
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
// Use the PDDocumentInformation object to fetch metadata values as strings.
|
49
|
-
public String extractInfo() throws IOException {
|
50
|
-
switch(Keys.valueOf(key)) {
|
51
|
-
case AUTHOR: return info.getAuthor();
|
52
|
-
case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
|
53
|
-
case CREATOR: return info.getCreator();
|
54
|
-
case KEYWORDS: return info.getKeywords();
|
55
|
-
case PRODUCER: return info.getProducer();
|
56
|
-
case SUBJECT: return info.getSubject();
|
57
|
-
case TITLE: return info.getTitle();
|
58
|
-
case LENGTH: return String.valueOf(doc.getNumberOfPages());
|
59
|
-
default: return null;
|
60
|
-
}
|
61
|
-
}
|
62
|
-
|
63
|
-
}
|
@@ -1,54 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.File;
|
5
|
-
import java.io.FileOutputStream;
|
6
|
-
import java.io.IOException;
|
7
|
-
|
8
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
9
|
-
import org.apache.pdfbox.util.Splitter;
|
10
|
-
import org.apache.pdfbox.pdfwriter.COSWriter;
|
11
|
-
import org.apache.pdfbox.exceptions.COSVisitorException;
|
12
|
-
|
13
|
-
// Use PDFBox's Splitter to break apart a large PDF into individual pages.
|
14
|
-
public class ExtractPages extends Extractor {
|
15
|
-
|
16
|
-
private PDDocument doc;
|
17
|
-
private String basename;
|
18
|
-
|
19
|
-
// The mainline.
|
20
|
-
public static void main(String[] args) {
|
21
|
-
(new ExtractPages()).run(args);
|
22
|
-
}
|
23
|
-
|
24
|
-
// Extract each page of the given PDF.
|
25
|
-
public void extract(String pdfPath) {
|
26
|
-
try {
|
27
|
-
basename = getBasename(pdfPath);
|
28
|
-
doc = PDDocument.load(pdfPath);
|
29
|
-
decrypt(doc);
|
30
|
-
List pages = (new Splitter()).split(doc);
|
31
|
-
if (pageNumbers != null) {
|
32
|
-
for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
|
33
|
-
} else {
|
34
|
-
for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
|
35
|
-
}
|
36
|
-
doc.close();
|
37
|
-
} catch(Exception e) {
|
38
|
-
System.out.println(e.getMessage());
|
39
|
-
System.exit(1);
|
40
|
-
}
|
41
|
-
}
|
42
|
-
|
43
|
-
// Writes out a page as a single-page PDF.
|
44
|
-
private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
|
45
|
-
String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
|
46
|
-
FileOutputStream out = new FileOutputStream(outputFile(pageName));
|
47
|
-
COSWriter writer = new COSWriter(out);
|
48
|
-
writer.write(page);
|
49
|
-
out.close();
|
50
|
-
writer.close();
|
51
|
-
page.close();
|
52
|
-
}
|
53
|
-
|
54
|
-
}
|
@@ -1,80 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.File;
|
5
|
-
import java.io.FileOutputStream;
|
6
|
-
import java.io.IOException;
|
7
|
-
import java.io.OutputStreamWriter;
|
8
|
-
|
9
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
-
import org.apache.pdfbox.util.PDFTextStripper;
|
11
|
-
|
12
|
-
// Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
|
13
|
-
// PDF document. Pass --pages to write out the plain text for each individual
|
14
|
-
// page; --pages-only to omit the text for the entire document.
|
15
|
-
public class ExtractText extends Extractor {
|
16
|
-
|
17
|
-
private PDDocument doc;
|
18
|
-
private String basename;
|
19
|
-
|
20
|
-
// The mainline.
|
21
|
-
public static void main(String[] args) {
|
22
|
-
(new ExtractText()).run(args);
|
23
|
-
}
|
24
|
-
|
25
|
-
// Extract the plain text for a PDF, and write it into the requested output
|
26
|
-
// sizes.
|
27
|
-
public void extract(String pdfPath) {
|
28
|
-
try {
|
29
|
-
basename = getBasename(pdfPath);
|
30
|
-
doc = PDDocument.load(pdfPath, false);
|
31
|
-
decrypt(doc);
|
32
|
-
if (allPages || (pageNumbers != null)) {
|
33
|
-
writePageText();
|
34
|
-
} else {
|
35
|
-
writeFullText();
|
36
|
-
}
|
37
|
-
doc.close();
|
38
|
-
} catch(IOException e) {
|
39
|
-
System.out.println(e.getMessage());
|
40
|
-
System.exit(1);
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
// Write out the extracted full text for the entire PDF.
|
45
|
-
public void writeFullText() throws IOException {
|
46
|
-
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
|
47
|
-
extractTextForPageRange(output, 1, Integer.MAX_VALUE);
|
48
|
-
output.close();
|
49
|
-
}
|
50
|
-
|
51
|
-
// Write out the full text for each specified page.
|
52
|
-
public void writePageText() throws IOException {
|
53
|
-
if (pageNumbers != null) {
|
54
|
-
for (Integer num : pageNumbers) writePageText(num.intValue());
|
55
|
-
} else {
|
56
|
-
int pages = doc.getNumberOfPages();
|
57
|
-
for (int i=1; i<=pages; i++) writePageText(i);
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
// Write out the full text for a single page.
|
62
|
-
public void writePageText(int pageNumber) throws IOException {
|
63
|
-
File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
|
64
|
-
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
|
65
|
-
extractTextForPageRange(output, pageNumber, pageNumber);
|
66
|
-
output.close();
|
67
|
-
}
|
68
|
-
|
69
|
-
// Internal method to writes out text from the PDF for a given page range
|
70
|
-
// to a provided output stream.
|
71
|
-
private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
|
72
|
-
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
|
73
|
-
stripper.setSortByPosition(false);
|
74
|
-
stripper.setShouldSeparateByBeads(true);
|
75
|
-
stripper.setStartPage(startPage);
|
76
|
-
stripper.setEndPage(endPage);
|
77
|
-
stripper.writeText(doc, output);
|
78
|
-
}
|
79
|
-
|
80
|
-
}
|
data/lib/docsplit/Extractor.java
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.io.File;
|
4
|
-
import java.util.List;
|
5
|
-
import java.util.Arrays;
|
6
|
-
import java.util.ArrayList;
|
7
|
-
import java.util.Iterator;
|
8
|
-
|
9
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
-
|
11
|
-
// The base Extractor class contains the common functionality needed to run
|
12
|
-
// command-line extractors.
|
13
|
-
public abstract class Extractor {
|
14
|
-
|
15
|
-
protected File output;
|
16
|
-
protected boolean allPages = false;
|
17
|
-
protected ArrayList<Integer> pageNumbers;
|
18
|
-
|
19
|
-
// Running an extractor consists of converting the arguments array into a
|
20
|
-
// more manageable List, parsing arguments, and extracting pdfs.
|
21
|
-
public void run(String[] arguments) {
|
22
|
-
List<String> args = new ArrayList<String>(Arrays.asList(arguments));
|
23
|
-
parseArguments(args);
|
24
|
-
Iterator<String> iter = args.iterator();
|
25
|
-
while(iter.hasNext()) extract(iter.next());
|
26
|
-
}
|
27
|
-
|
28
|
-
// Subclasses must override "extract" to perform their specific extraction.
|
29
|
-
public abstract void extract(String pdfPath);
|
30
|
-
|
31
|
-
// The default "parseArguments" method handles common arguments.
|
32
|
-
protected void parseArguments(List<String> args) {
|
33
|
-
int dirLoc = args.indexOf("--output");
|
34
|
-
if (dirLoc >= 0) {
|
35
|
-
output = new File(args.remove(dirLoc + 1));
|
36
|
-
args.remove(dirLoc);
|
37
|
-
}
|
38
|
-
int pagesLoc = args.indexOf("--pages");
|
39
|
-
if (pagesLoc >= 0) {
|
40
|
-
parsePages(args.remove(pagesLoc + 1));
|
41
|
-
args.remove(pagesLoc);
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
// Utility function to get the basename of a file path.
|
46
|
-
// After File.basename in Ruby.
|
47
|
-
public String getBasename(String pdfPath) {
|
48
|
-
String basename = new File(pdfPath).getName();
|
49
|
-
return basename.substring(0, basename.lastIndexOf('.'));
|
50
|
-
}
|
51
|
-
|
52
|
-
// Get a reference to an output file, placed inside any configured directories,
|
53
|
-
// while ensuring that parent directories exist.
|
54
|
-
public File outputFile(String path) {
|
55
|
-
File file = output != null ? new File(output, path) : new File(path);
|
56
|
-
File parent = file.getParentFile();
|
57
|
-
if (parent != null) parent.mkdirs();
|
58
|
-
return file;
|
59
|
-
}
|
60
|
-
|
61
|
-
// Decrypt a non-passworded but still encrypted document.
|
62
|
-
public void decrypt(PDDocument doc) {
|
63
|
-
if (!doc.isEncrypted()) return;
|
64
|
-
try {
|
65
|
-
doc.decrypt("");
|
66
|
-
} catch (Exception e) {
|
67
|
-
System.out.println("Error decrypting document, details: " + e.getMessage());
|
68
|
-
System.exit(1);
|
69
|
-
}
|
70
|
-
}
|
71
|
-
|
72
|
-
private void parsePages(String pageList) {
|
73
|
-
if (pageList.equals("all")) {
|
74
|
-
allPages = true;
|
75
|
-
return;
|
76
|
-
}
|
77
|
-
pageNumbers = new ArrayList<Integer>();
|
78
|
-
String[] groups = pageList.split(",");
|
79
|
-
for (String group : groups) {
|
80
|
-
if (group.contains("-")) {
|
81
|
-
String[] range = group.split("-");
|
82
|
-
int start = Integer.parseInt(range[0]);
|
83
|
-
int end = Integer.parseInt(range[1]);
|
84
|
-
for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
|
85
|
-
} else {
|
86
|
-
pageNumbers.add(new Integer(Integer.parseInt(group)));
|
87
|
-
}
|
88
|
-
}
|
89
|
-
}
|
90
|
-
|
91
|
-
}
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module Docsplit
|
2
|
-
|
3
|
-
module ArgumentParser
|
4
|
-
|
5
|
-
# Flatten an options hash into an arguments string suitable for the command
|
6
|
-
# line.
|
7
|
-
def parse_options(opts)
|
8
|
-
opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
|
9
|
-
end
|
10
|
-
|
11
|
-
# Normalize a value in an options hash for the command line.
|
12
|
-
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
13
|
-
def normalize_value(value)
|
14
|
-
case value
|
15
|
-
when Range then normalize_range(value)
|
16
|
-
when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
|
17
|
-
else value.to_s
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
# Serialize a Ruby range into it's command-line equivalent.
|
22
|
-
def normalize_range(range)
|
23
|
-
arr = range.to_a
|
24
|
-
arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
extend ArgumentParser
|
30
|
-
|
31
|
-
end
|
data/vendor/bcmail.jar
DELETED
Binary file
|
data/vendor/bcprov.jar
DELETED
Binary file
|
data/vendor/commons-logging.jar
DELETED
Binary file
|
data/vendor/fontbox.jar
DELETED
Binary file
|
data/vendor/pdfbox.jar
DELETED
Binary file
|