docsplit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +0 -1
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +30 -7
- data/lib/docsplit/command_line.rb +9 -3
- data/lib/docsplit/image_extractor.rb +27 -8
- data/lib/docsplit/info_extractor.rb +32 -0
- data/lib/docsplit/page_extractor.rb +31 -0
- data/lib/docsplit/text_extractor.rb +93 -35
- metadata +7 -26
- data/build/org/documentcloud/ExtractInfo$1.class +0 -0
- data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
- data/build/org/documentcloud/ExtractInfo.class +0 -0
- data/build/org/documentcloud/ExtractPages.class +0 -0
- data/build/org/documentcloud/ExtractText.class +0 -0
- data/build/org/documentcloud/Extractor.class +0 -0
- data/lib/docsplit/ExtractInfo.java +0 -63
- data/lib/docsplit/ExtractPages.java +0 -54
- data/lib/docsplit/ExtractText.java +0 -80
- data/lib/docsplit/Extractor.java +0 -91
- data/lib/docsplit/argument_parser.rb +0 -31
- data/vendor/bcmail.jar +0 -0
- data/vendor/bcprov.jar +0 -0
- data/vendor/commons-logging.jar +0 -0
- data/vendor/fontbox.jar +0 -0
- data/vendor/pdfbox.jar +0 -0
data/LICENSE
CHANGED
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.2.0'
|
4
|
-
s.date = '2010-
|
3
|
+
s.version = '0.3.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-5'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.2.0'
|
4
|
+
VERSION = '0.3.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -13,6 +13,20 @@ module Docsplit
|
|
13
13
|
|
14
14
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
15
15
|
|
16
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
17
|
+
|
18
|
+
# Check for all dependencies, and warn of their absence.
|
19
|
+
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
20
|
+
DEPENDENCIES.each_key do |dep|
|
21
|
+
dirs.each do |dir|
|
22
|
+
if File.executable?(File.join(dir, dep.to_s))
|
23
|
+
DEPENDENCIES[dep] = true
|
24
|
+
break
|
25
|
+
end
|
26
|
+
end
|
27
|
+
warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
|
28
|
+
end
|
29
|
+
|
16
30
|
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
17
31
|
# broke.
|
18
32
|
class ExtractionFailed < StandardError; end
|
@@ -20,7 +34,7 @@ module Docsplit
|
|
20
34
|
# Use the ExtractPages Java class to burst a PDF into single pages.
|
21
35
|
def self.extract_pages(pdfs, opts={})
|
22
36
|
pdfs = ensure_pdfs(pdfs)
|
23
|
-
|
37
|
+
PageExtractor.new.extract(pdfs, opts)
|
24
38
|
end
|
25
39
|
|
26
40
|
# Use the ExtractText Java class to write out all embedded text.
|
@@ -50,8 +64,7 @@ module Docsplit
|
|
50
64
|
instance_eval <<-EOS
|
51
65
|
def self.extract_#{key}(pdfs, opts={})
|
52
66
|
pdfs = ensure_pdfs(pdfs)
|
53
|
-
|
54
|
-
:#{key} == :length ? result.to_i : result
|
67
|
+
InfoExtractor.new.extract(:#{key}, pdfs, opts)
|
55
68
|
end
|
56
69
|
EOS
|
57
70
|
end
|
@@ -62,18 +75,28 @@ module Docsplit
|
|
62
75
|
# Runs a Java command, with quieted logging, and the classpath set properly.
|
63
76
|
def self.run(command, pdfs, opts, return_output=false)
|
64
77
|
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
|
65
|
-
|
66
|
-
cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
|
78
|
+
cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
67
79
|
result = `#{cmd}`.chomp
|
68
80
|
raise ExtractionFailed, result if $? != 0
|
69
81
|
return return_output ? (result.empty? ? nil : result) : true
|
70
82
|
end
|
71
83
|
|
84
|
+
# Normalize a value in an options hash for the command line.
|
85
|
+
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
86
|
+
def self.normalize_value(value)
|
87
|
+
case value
|
88
|
+
when Range then normalize_range(value)
|
89
|
+
when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
|
90
|
+
else value.to_s
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
72
94
|
end
|
73
95
|
|
74
96
|
require 'tmpdir'
|
75
97
|
require 'fileutils'
|
76
98
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
77
|
-
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
|
78
99
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
79
100
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
101
|
+
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
102
|
+
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
@@ -8,7 +8,7 @@ module Docsplit
|
|
8
8
|
|
9
9
|
BANNER = <<-EOS
|
10
10
|
docsplit breaks apart documents into images, text, or individual pages.
|
11
|
-
It wraps
|
11
|
+
It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
|
12
12
|
|
13
13
|
Usage:
|
14
14
|
docsplit COMMAND [OPTIONS] path/to/doc.pdf
|
@@ -71,7 +71,7 @@ Options:
|
|
71
71
|
# Use the OptionParser library to parse out all supported options. Return
|
72
72
|
# options formatted for the Ruby API.
|
73
73
|
def parse_options
|
74
|
-
@options = {}
|
74
|
+
@options = {:ocr => :default}
|
75
75
|
@option_parser = OptionParser.new do |opts|
|
76
76
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
77
77
|
@options[:output] = d
|
@@ -85,8 +85,14 @@ Options:
|
|
85
85
|
opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
|
86
86
|
@options[:format] = t.split(',')
|
87
87
|
end
|
88
|
+
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
89
|
+
@options[:ocr] = o
|
90
|
+
end
|
91
|
+
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
92
|
+
@options[:rolling] = true
|
93
|
+
end
|
88
94
|
opts.on_tail('-v', '--version', 'display docsplit version') do
|
89
|
-
puts "
|
95
|
+
puts "Docsplit version #{Docsplit::VERSION}"
|
90
96
|
exit
|
91
97
|
end
|
92
98
|
opts.on_tail('-h', '--help', 'display this help message') do
|
@@ -4,26 +4,37 @@ module Docsplit
|
|
4
4
|
# nicely sized images.
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
|
-
DENSITY_ARG
|
8
|
-
MEMORY_ARGS
|
9
|
-
DEFAULT_FORMAT
|
7
|
+
DENSITY_ARG = "-density 150"
|
8
|
+
MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
|
9
|
+
DEFAULT_FORMAT = :png
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
|
12
12
|
# configuration in options.
|
13
13
|
def extract(pdfs, options)
|
14
14
|
@pdfs = [pdfs].flatten
|
15
15
|
extract_options(options)
|
16
|
-
@pdfs.each
|
16
|
+
@pdfs.each do |pdf|
|
17
|
+
previous = nil
|
18
|
+
@sizes.each_with_index do |size, i|
|
19
|
+
@formats.each {|format| convert(pdf, size, format, previous) }
|
20
|
+
previous = size if @rolling
|
21
|
+
end
|
22
|
+
end
|
17
23
|
end
|
18
24
|
|
19
25
|
# Convert a single PDF into page images at the specified size and format.
|
20
|
-
def convert(pdf, size, format)
|
26
|
+
def convert(pdf, size, format, previous=nil)
|
21
27
|
basename = File.basename(pdf, File.extname(pdf))
|
22
|
-
|
23
|
-
directory = File.join(@output, subfolder)
|
28
|
+
directory = directory_for(size)
|
24
29
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
25
30
|
out_file = File.join(directory, "#{basename}_%05d.#{format}")
|
26
|
-
|
31
|
+
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
32
|
+
if previous
|
33
|
+
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
34
|
+
cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
|
+
else
|
36
|
+
cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
|
+
end
|
27
38
|
result = `#{cmd}`.chomp
|
28
39
|
raise ExtractionFailed, result if $? != 0
|
29
40
|
renumber_images(out_file, format)
|
@@ -39,6 +50,14 @@ module Docsplit
|
|
39
50
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
40
51
|
@sizes = [options[:size]].flatten.compact
|
41
52
|
@sizes = [nil] if @sizes.empty?
|
53
|
+
@rolling = !!options[:rolling]
|
54
|
+
end
|
55
|
+
|
56
|
+
# If there's only one size requested, generate the images directly into
|
57
|
+
# the output directory. Multiple sizes each get a directory of their own.
|
58
|
+
def directory_for(size)
|
59
|
+
path = @sizes.length == 1 ? @output : File.join(@output, size)
|
60
|
+
File.expand_path(path)
|
42
61
|
end
|
43
62
|
|
44
63
|
# Generate the resize argument.
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdfinfo** in order to extract information about a PDF file.
|
4
|
+
class InfoExtractor
|
5
|
+
|
6
|
+
# Regex matchers for different bits of information.
|
7
|
+
MATCHERS = {
|
8
|
+
:author => /^Author:\s+([^\n]+)/,
|
9
|
+
:date => /^CreationDate:\s+([^\n]+)/,
|
10
|
+
:creator => /^Creator:\s+([^\n]+)/,
|
11
|
+
:keywords => /^Keywords:\s+([^\n]+)/,
|
12
|
+
:producer => /^Producer:\s+([^\n]+)/,
|
13
|
+
:subject => /^Subject:\s+([^\n]+)/,
|
14
|
+
:title => /^Title:\s+([^\n]+)/,
|
15
|
+
:length => /^Pages:\s+([^\n]+)/,
|
16
|
+
}
|
17
|
+
|
18
|
+
# Pull out a single datum from a pdf.
|
19
|
+
def extract(key, pdfs, opts)
|
20
|
+
pdf = [pdfs].flatten.first
|
21
|
+
cmd = "pdfinfo #{pdf} 2>&1"
|
22
|
+
result = `#{cmd}`.chomp
|
23
|
+
raise ExtractionFailed, result if $? != 0
|
24
|
+
match = result.match(MATCHERS[key])
|
25
|
+
answer = match && match[1]
|
26
|
+
answer = answer.to_i if answer && key == :length
|
27
|
+
answer
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Docsplit
|
2
|
+
|
3
|
+
# Delegates to **pdftk** in order to create bursted single pages from
|
4
|
+
# a PDF document.
|
5
|
+
class PageExtractor
|
6
|
+
|
7
|
+
# Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
|
8
|
+
def extract(pdfs, opts)
|
9
|
+
extract_options opts
|
10
|
+
[pdfs].flatten.each do |pdf|
|
11
|
+
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
+
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
|
13
|
+
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
|
+
cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
|
15
|
+
result = `#{cmd}`.chomp
|
16
|
+
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
17
|
+
raise ExtractionFailed, result if $? != 0
|
18
|
+
result
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def extract_options(options)
|
26
|
+
@output = options[:output] || '.'
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
@@ -1,53 +1,111 @@
|
|
1
1
|
module Docsplit
|
2
|
-
|
2
|
+
|
3
|
+
# Delegates to **pdftotext** and **tesseract** in order to extract text from
|
4
|
+
# PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
|
5
|
+
# forbid OCR extraction, but by default the heuristic works like this:
|
6
|
+
#
|
7
|
+
# * Check for the presence of fonts in the PDF. If no fonts are detected,
|
8
|
+
# OCR is used automatically.
|
9
|
+
# * Extract the text of each page with **pdftotext**, if the page has less
|
10
|
+
# than 100 bytes of text (a scanned image page, or a page that just
|
11
|
+
# contains a filename and a page number), then add it to the list of
|
12
|
+
# `@pages_to_ocr`.
|
13
|
+
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
|
14
|
+
#
|
3
15
|
class TextExtractor
|
4
|
-
|
5
|
-
|
6
|
-
|
16
|
+
|
17
|
+
NO_TEXT_DETECTED = /---------\n\Z/
|
18
|
+
|
19
|
+
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
|
20
|
+
|
21
|
+
MIN_TEXT_PER_PAGE = 100 # in bytes
|
22
|
+
|
23
|
+
def initialize
|
24
|
+
@tiffs_generated = false
|
25
|
+
@pages_to_ocr = []
|
26
|
+
end
|
27
|
+
|
28
|
+
# Extract text from a list of PDFs.
|
7
29
|
def extract(pdfs, opts)
|
8
30
|
extract_options opts
|
9
|
-
|
10
|
-
pdfs.each do |pdf|
|
11
|
-
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
if @pages
|
16
|
-
pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
|
17
|
-
pages.each do |page|
|
18
|
-
extract_page pdf, page, pdf_name
|
19
|
-
end
|
31
|
+
FileUtils.mkdir_p @output unless File.exists?(@output)
|
32
|
+
[pdfs].flatten.each do |pdf|
|
33
|
+
@pdf_name = File.basename(pdf, File.extname(pdf))
|
34
|
+
pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
|
35
|
+
if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
|
36
|
+
extract_from_ocr(pdf, pages)
|
20
37
|
else
|
21
|
-
|
22
|
-
|
23
|
-
|
38
|
+
extract_from_pdf(pdf, pages)
|
39
|
+
if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
|
40
|
+
extract_from_ocr(pdf, @pages_to_ocr)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
FileUtils.remove_entry_secure @tempdir if @tempdir
|
45
|
+
end
|
46
|
+
|
47
|
+
# Does a PDF have any text embedded?
|
48
|
+
def contains_text?(pdf)
|
49
|
+
fonts = `pdffonts #{pdf} 2>&1`
|
50
|
+
!fonts.match(NO_TEXT_DETECTED)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Extract a page range worth of text from a PDF, directly.
|
54
|
+
def extract_from_pdf(pdf, pages)
|
55
|
+
return extract_full(pdf) unless pages
|
56
|
+
pages.each {|page| extract_page(pdf, page) }
|
57
|
+
end
|
58
|
+
|
59
|
+
# Extract a page range worth of text from a PDF via OCR.
|
60
|
+
def extract_from_ocr(pdf, pages)
|
61
|
+
@tempdir ||= Dir.mktmpdir
|
62
|
+
base_path = File.join(@output, @pdf_name)
|
63
|
+
if pages
|
64
|
+
run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
|
+
@tiffs_generated = true
|
66
|
+
pages.each do |page|
|
67
|
+
run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
|
24
68
|
end
|
69
|
+
else
|
70
|
+
tiff = "#{@tempdir}/#{@pdf_name}.tif"
|
71
|
+
run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
|
+
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
25
73
|
end
|
26
74
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
75
|
+
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
# Run an external process and raise an exception if it fails.
|
80
|
+
def run(command)
|
81
|
+
result = `#{command}`
|
32
82
|
raise ExtractionFailed, result if $? != 0
|
33
83
|
result
|
34
84
|
end
|
35
85
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
raise ExtractionFailed if match.nil?
|
41
|
-
match[1].to_i
|
86
|
+
# Extract the full contents of a pdf as a single file, directly.
|
87
|
+
def extract_full(pdf)
|
88
|
+
text_path = File.join(@output, "#{@pdf_name}.txt")
|
89
|
+
run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
|
42
90
|
end
|
43
|
-
|
44
|
-
|
45
|
-
|
91
|
+
|
92
|
+
# Extract the contents of a single page of text, directly, adding it to
|
93
|
+
# the `@pages_to_ocr` list if the text length is inadequate.
|
94
|
+
def extract_page(pdf, page)
|
95
|
+
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
|
96
|
+
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
|
97
|
+
unless @forbid_ocr
|
98
|
+
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
46
102
|
def extract_options(options)
|
47
|
-
@output
|
48
|
-
@pages
|
103
|
+
@output = options[:output] || '.'
|
104
|
+
@pages = options[:pages]
|
105
|
+
@force_ocr = options[:ocr] == true
|
106
|
+
@forbid_ocr = options[:ocr] == false
|
49
107
|
end
|
50
108
|
|
51
109
|
end
|
52
|
-
|
110
|
+
|
53
111
|
end
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 23
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
- 2
|
7
|
+
- 3
|
9
8
|
- 0
|
10
|
-
version: 0.2.0
|
9
|
+
version: 0.3.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jeremy Ashkenas
|
@@ -16,7 +15,7 @@ autorequire:
|
|
16
15
|
bindir: bin
|
17
16
|
cert_chain: []
|
18
17
|
|
19
|
-
date: 2010-
|
18
|
+
date: 2010-08-05 00:00:00 -04:00
|
20
19
|
default_executable:
|
21
20
|
dependencies: []
|
22
21
|
|
@@ -29,27 +28,14 @@ extensions: []
|
|
29
28
|
extra_rdoc_files: []
|
30
29
|
|
31
30
|
files:
|
32
|
-
- build/org/documentcloud/ExtractInfo$1.class
|
33
|
-
- build/org/documentcloud/ExtractInfo$Keys.class
|
34
|
-
- build/org/documentcloud/ExtractInfo.class
|
35
|
-
- build/org/documentcloud/Extractor.class
|
36
|
-
- build/org/documentcloud/ExtractPages.class
|
37
|
-
- build/org/documentcloud/ExtractText.class
|
38
|
-
- lib/docsplit/argument_parser.rb
|
39
31
|
- lib/docsplit/command_line.rb
|
40
|
-
- lib/docsplit/ExtractInfo.java
|
41
|
-
- lib/docsplit/Extractor.java
|
42
|
-
- lib/docsplit/ExtractPages.java
|
43
|
-
- lib/docsplit/ExtractText.java
|
44
32
|
- lib/docsplit/image_extractor.rb
|
33
|
+
- lib/docsplit/info_extractor.rb
|
34
|
+
- lib/docsplit/page_extractor.rb
|
45
35
|
- lib/docsplit/text_extractor.rb
|
46
36
|
- lib/docsplit/transparent_pdfs.rb
|
47
37
|
- lib/docsplit.rb
|
48
38
|
- bin/docsplit
|
49
|
-
- vendor/bcmail.jar
|
50
|
-
- vendor/bcprov.jar
|
51
|
-
- vendor/commons-logging.jar
|
52
|
-
- vendor/fontbox.jar
|
53
39
|
- vendor/jodconverter/commons-cli-1.2.jar
|
54
40
|
- vendor/jodconverter/commons-io-1.4.jar
|
55
41
|
- vendor/jodconverter/jodconverter-2.2.2.jar
|
@@ -61,11 +47,10 @@ files:
|
|
61
47
|
- vendor/jodconverter/slf4j-jdk14-1.5.6.jar
|
62
48
|
- vendor/jodconverter/unoil-3.0.1.jar
|
63
49
|
- vendor/logging.properties
|
64
|
-
- vendor/pdfbox.jar
|
65
50
|
- docsplit.gemspec
|
66
51
|
- LICENSE
|
67
52
|
- README
|
68
|
-
has_rdoc:
|
53
|
+
has_rdoc: false
|
69
54
|
homepage: http://documentcloud.github.com/docsplit/
|
70
55
|
licenses: []
|
71
56
|
|
@@ -75,27 +60,23 @@ rdoc_options: []
|
|
75
60
|
require_paths:
|
76
61
|
- lib
|
77
62
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
-
none: false
|
79
63
|
requirements:
|
80
64
|
- - ">="
|
81
65
|
- !ruby/object:Gem::Version
|
82
|
-
hash: 3
|
83
66
|
segments:
|
84
67
|
- 0
|
85
68
|
version: "0"
|
86
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
-
none: false
|
88
70
|
requirements:
|
89
71
|
- - ">="
|
90
72
|
- !ruby/object:Gem::Version
|
91
|
-
hash: 3
|
92
73
|
segments:
|
93
74
|
- 0
|
94
75
|
version: "0"
|
95
76
|
requirements: []
|
96
77
|
|
97
78
|
rubyforge_project: docsplit
|
98
|
-
rubygems_version: 1.3.
|
79
|
+
rubygems_version: 1.3.6
|
99
80
|
signing_key:
|
100
81
|
specification_version: 3
|
101
82
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,63 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.IOException;
|
5
|
-
import java.text.SimpleDateFormat;
|
6
|
-
|
7
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
8
|
-
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
9
|
-
|
10
|
-
// Extracts metadata from a PDF file.
|
11
|
-
public class ExtractInfo extends Extractor {
|
12
|
-
|
13
|
-
private PDDocument doc;
|
14
|
-
private PDDocumentInformation info;
|
15
|
-
private String key;
|
16
|
-
|
17
|
-
// The list of metadata keys we know how to extract.
|
18
|
-
private enum Keys {
|
19
|
-
AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
|
20
|
-
}
|
21
|
-
|
22
|
-
// The mainline.
|
23
|
-
public static void main(String[] args) {
|
24
|
-
(new ExtractInfo()).run(args);
|
25
|
-
}
|
26
|
-
|
27
|
-
// The first argument is always the name of the metadata key.
|
28
|
-
protected void parseArguments(List<String> args) {
|
29
|
-
super.parseArguments(args);
|
30
|
-
key = args.remove(0).toUpperCase();
|
31
|
-
}
|
32
|
-
|
33
|
-
// Extract the configured bit of metadata from a PDF, decrypting if necessary.
|
34
|
-
public void extract(String pdfPath) {
|
35
|
-
try {
|
36
|
-
doc = PDDocument.load(pdfPath, false);
|
37
|
-
decrypt(doc);
|
38
|
-
info = doc.getDocumentInformation();
|
39
|
-
String val = extractInfo();
|
40
|
-
if (val != null) System.out.println(val);
|
41
|
-
doc.close();
|
42
|
-
} catch(IOException e) {
|
43
|
-
System.out.println(e.getMessage());
|
44
|
-
System.exit(1);
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
// Use the PDDocumentInformation object to fetch metadata values as strings.
|
49
|
-
public String extractInfo() throws IOException {
|
50
|
-
switch(Keys.valueOf(key)) {
|
51
|
-
case AUTHOR: return info.getAuthor();
|
52
|
-
case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
|
53
|
-
case CREATOR: return info.getCreator();
|
54
|
-
case KEYWORDS: return info.getKeywords();
|
55
|
-
case PRODUCER: return info.getProducer();
|
56
|
-
case SUBJECT: return info.getSubject();
|
57
|
-
case TITLE: return info.getTitle();
|
58
|
-
case LENGTH: return String.valueOf(doc.getNumberOfPages());
|
59
|
-
default: return null;
|
60
|
-
}
|
61
|
-
}
|
62
|
-
|
63
|
-
}
|
@@ -1,54 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.File;
|
5
|
-
import java.io.FileOutputStream;
|
6
|
-
import java.io.IOException;
|
7
|
-
|
8
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
9
|
-
import org.apache.pdfbox.util.Splitter;
|
10
|
-
import org.apache.pdfbox.pdfwriter.COSWriter;
|
11
|
-
import org.apache.pdfbox.exceptions.COSVisitorException;
|
12
|
-
|
13
|
-
// Use PDFBox's Splitter to break apart a large PDF into individual pages.
|
14
|
-
public class ExtractPages extends Extractor {
|
15
|
-
|
16
|
-
private PDDocument doc;
|
17
|
-
private String basename;
|
18
|
-
|
19
|
-
// The mainline.
|
20
|
-
public static void main(String[] args) {
|
21
|
-
(new ExtractPages()).run(args);
|
22
|
-
}
|
23
|
-
|
24
|
-
// Extract each page of the given PDF.
|
25
|
-
public void extract(String pdfPath) {
|
26
|
-
try {
|
27
|
-
basename = getBasename(pdfPath);
|
28
|
-
doc = PDDocument.load(pdfPath);
|
29
|
-
decrypt(doc);
|
30
|
-
List pages = (new Splitter()).split(doc);
|
31
|
-
if (pageNumbers != null) {
|
32
|
-
for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
|
33
|
-
} else {
|
34
|
-
for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
|
35
|
-
}
|
36
|
-
doc.close();
|
37
|
-
} catch(Exception e) {
|
38
|
-
System.out.println(e.getMessage());
|
39
|
-
System.exit(1);
|
40
|
-
}
|
41
|
-
}
|
42
|
-
|
43
|
-
// Writes out a page as a single-page PDF.
|
44
|
-
private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
|
45
|
-
String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
|
46
|
-
FileOutputStream out = new FileOutputStream(outputFile(pageName));
|
47
|
-
COSWriter writer = new COSWriter(out);
|
48
|
-
writer.write(page);
|
49
|
-
out.close();
|
50
|
-
writer.close();
|
51
|
-
page.close();
|
52
|
-
}
|
53
|
-
|
54
|
-
}
|
@@ -1,80 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.util.List;
|
4
|
-
import java.io.File;
|
5
|
-
import java.io.FileOutputStream;
|
6
|
-
import java.io.IOException;
|
7
|
-
import java.io.OutputStreamWriter;
|
8
|
-
|
9
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
-
import org.apache.pdfbox.util.PDFTextStripper;
|
11
|
-
|
12
|
-
// Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
|
13
|
-
// PDF document. Pass --pages to write out the plain text for each individual
|
14
|
-
// page; --pages-only to omit the text for the entire document.
|
15
|
-
public class ExtractText extends Extractor {
|
16
|
-
|
17
|
-
private PDDocument doc;
|
18
|
-
private String basename;
|
19
|
-
|
20
|
-
// The mainline.
|
21
|
-
public static void main(String[] args) {
|
22
|
-
(new ExtractText()).run(args);
|
23
|
-
}
|
24
|
-
|
25
|
-
// Extract the plain text for a PDF, and write it into the requested output
|
26
|
-
// sizes.
|
27
|
-
public void extract(String pdfPath) {
|
28
|
-
try {
|
29
|
-
basename = getBasename(pdfPath);
|
30
|
-
doc = PDDocument.load(pdfPath, false);
|
31
|
-
decrypt(doc);
|
32
|
-
if (allPages || (pageNumbers != null)) {
|
33
|
-
writePageText();
|
34
|
-
} else {
|
35
|
-
writeFullText();
|
36
|
-
}
|
37
|
-
doc.close();
|
38
|
-
} catch(IOException e) {
|
39
|
-
System.out.println(e.getMessage());
|
40
|
-
System.exit(1);
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
// Write out the extracted full text for the entire PDF.
|
45
|
-
public void writeFullText() throws IOException {
|
46
|
-
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
|
47
|
-
extractTextForPageRange(output, 1, Integer.MAX_VALUE);
|
48
|
-
output.close();
|
49
|
-
}
|
50
|
-
|
51
|
-
// Write out the full text for each specified page.
|
52
|
-
public void writePageText() throws IOException {
|
53
|
-
if (pageNumbers != null) {
|
54
|
-
for (Integer num : pageNumbers) writePageText(num.intValue());
|
55
|
-
} else {
|
56
|
-
int pages = doc.getNumberOfPages();
|
57
|
-
for (int i=1; i<=pages; i++) writePageText(i);
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
// Write out the full text for a single page.
|
62
|
-
public void writePageText(int pageNumber) throws IOException {
|
63
|
-
File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
|
64
|
-
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
|
65
|
-
extractTextForPageRange(output, pageNumber, pageNumber);
|
66
|
-
output.close();
|
67
|
-
}
|
68
|
-
|
69
|
-
// Internal method to writes out text from the PDF for a given page range
|
70
|
-
// to a provided output stream.
|
71
|
-
private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
|
72
|
-
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
|
73
|
-
stripper.setSortByPosition(false);
|
74
|
-
stripper.setShouldSeparateByBeads(true);
|
75
|
-
stripper.setStartPage(startPage);
|
76
|
-
stripper.setEndPage(endPage);
|
77
|
-
stripper.writeText(doc, output);
|
78
|
-
}
|
79
|
-
|
80
|
-
}
|
data/lib/docsplit/Extractor.java
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
package org.documentcloud;
|
2
|
-
|
3
|
-
import java.io.File;
|
4
|
-
import java.util.List;
|
5
|
-
import java.util.Arrays;
|
6
|
-
import java.util.ArrayList;
|
7
|
-
import java.util.Iterator;
|
8
|
-
|
9
|
-
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
-
|
11
|
-
// The base Extractor class contains the common functionality needed to run
|
12
|
-
// command-line extractors.
|
13
|
-
public abstract class Extractor {
|
14
|
-
|
15
|
-
protected File output;
|
16
|
-
protected boolean allPages = false;
|
17
|
-
protected ArrayList<Integer> pageNumbers;
|
18
|
-
|
19
|
-
// Running an extractor consists of converting the arguments array into a
|
20
|
-
// more manageable List, parsing arguments, and extracting pdfs.
|
21
|
-
public void run(String[] arguments) {
|
22
|
-
List<String> args = new ArrayList<String>(Arrays.asList(arguments));
|
23
|
-
parseArguments(args);
|
24
|
-
Iterator<String> iter = args.iterator();
|
25
|
-
while(iter.hasNext()) extract(iter.next());
|
26
|
-
}
|
27
|
-
|
28
|
-
// Subclasses must override "extract" to perform their specific extraction.
|
29
|
-
public abstract void extract(String pdfPath);
|
30
|
-
|
31
|
-
// The default "parseArguments" method handles common arguments.
|
32
|
-
protected void parseArguments(List<String> args) {
|
33
|
-
int dirLoc = args.indexOf("--output");
|
34
|
-
if (dirLoc >= 0) {
|
35
|
-
output = new File(args.remove(dirLoc + 1));
|
36
|
-
args.remove(dirLoc);
|
37
|
-
}
|
38
|
-
int pagesLoc = args.indexOf("--pages");
|
39
|
-
if (pagesLoc >= 0) {
|
40
|
-
parsePages(args.remove(pagesLoc + 1));
|
41
|
-
args.remove(pagesLoc);
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
// Utility function to get the basename of a file path.
|
46
|
-
// After File.basename in Ruby.
|
47
|
-
public String getBasename(String pdfPath) {
|
48
|
-
String basename = new File(pdfPath).getName();
|
49
|
-
return basename.substring(0, basename.lastIndexOf('.'));
|
50
|
-
}
|
51
|
-
|
52
|
-
// Get a reference to an output file, placed inside any configured directories,
|
53
|
-
// while ensuring that parent directories exist.
|
54
|
-
public File outputFile(String path) {
|
55
|
-
File file = output != null ? new File(output, path) : new File(path);
|
56
|
-
File parent = file.getParentFile();
|
57
|
-
if (parent != null) parent.mkdirs();
|
58
|
-
return file;
|
59
|
-
}
|
60
|
-
|
61
|
-
// Decrypt a non-passworded but still encrypted document.
|
62
|
-
public void decrypt(PDDocument doc) {
|
63
|
-
if (!doc.isEncrypted()) return;
|
64
|
-
try {
|
65
|
-
doc.decrypt("");
|
66
|
-
} catch (Exception e) {
|
67
|
-
System.out.println("Error decrypting document, details: " + e.getMessage());
|
68
|
-
System.exit(1);
|
69
|
-
}
|
70
|
-
}
|
71
|
-
|
72
|
-
private void parsePages(String pageList) {
|
73
|
-
if (pageList.equals("all")) {
|
74
|
-
allPages = true;
|
75
|
-
return;
|
76
|
-
}
|
77
|
-
pageNumbers = new ArrayList<Integer>();
|
78
|
-
String[] groups = pageList.split(",");
|
79
|
-
for (String group : groups) {
|
80
|
-
if (group.contains("-")) {
|
81
|
-
String[] range = group.split("-");
|
82
|
-
int start = Integer.parseInt(range[0]);
|
83
|
-
int end = Integer.parseInt(range[1]);
|
84
|
-
for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
|
85
|
-
} else {
|
86
|
-
pageNumbers.add(new Integer(Integer.parseInt(group)));
|
87
|
-
}
|
88
|
-
}
|
89
|
-
}
|
90
|
-
|
91
|
-
}
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module Docsplit
|
2
|
-
|
3
|
-
module ArgumentParser
|
4
|
-
|
5
|
-
# Flatten an options hash into an arguments string suitable for the command
|
6
|
-
# line.
|
7
|
-
def parse_options(opts)
|
8
|
-
opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
|
9
|
-
end
|
10
|
-
|
11
|
-
# Normalize a value in an options hash for the command line.
|
12
|
-
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
13
|
-
def normalize_value(value)
|
14
|
-
case value
|
15
|
-
when Range then normalize_range(value)
|
16
|
-
when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
|
17
|
-
else value.to_s
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
# Serialize a Ruby range into it's command-line equivalent.
|
22
|
-
def normalize_range(range)
|
23
|
-
arr = range.to_a
|
24
|
-
arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
extend ArgumentParser
|
30
|
-
|
31
|
-
end
|
data/vendor/bcmail.jar
DELETED
Binary file
|
data/vendor/bcprov.jar
DELETED
Binary file
|
data/vendor/commons-logging.jar
DELETED
Binary file
|
data/vendor/fontbox.jar
DELETED
Binary file
|
data/vendor/pdfbox.jar
DELETED
Binary file
|