docsplit 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +10 -6
- data/lib/docsplit/command_line.rb +3 -0
- data/lib/docsplit/image_extractor.rb +6 -4
- data/lib/docsplit/info_extractor.rb +1 -1
- data/lib/docsplit/page_extractor.rb +1 -1
- data/lib/docsplit/text_extractor.rb +11 -8
- metadata +5 -5
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.5.2'
|
4
|
-
s.date = '2011-
|
3
|
+
s.version = '0.6.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2011-09-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.5.2'
|
4
|
+
VERSION = '0.6.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -19,7 +19,9 @@ module Docsplit
|
|
19
19
|
|
20
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
21
21
|
|
22
|
-
|
22
|
+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
23
|
+
|
24
|
+
# Check for all dependencies, and note their absence.
|
23
25
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
24
26
|
DEPENDENCIES.each_key do |dep|
|
25
27
|
dirs.each do |dir|
|
@@ -28,7 +30,6 @@ module Docsplit
|
|
28
30
|
break
|
29
31
|
end
|
30
32
|
end
|
31
|
-
warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
|
32
33
|
end
|
33
34
|
|
34
35
|
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
@@ -62,11 +63,13 @@ module Docsplit
|
|
62
63
|
[docs].flatten.each do |doc|
|
63
64
|
ext = File.extname(doc)
|
64
65
|
basename = File.basename(doc, ext)
|
65
|
-
|
66
|
-
|
66
|
+
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
67
|
+
|
68
|
+
if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
|
69
|
+
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
67
70
|
else
|
68
71
|
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
69
|
-
run "#{options}
|
72
|
+
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
70
73
|
end
|
71
74
|
end
|
72
75
|
end
|
@@ -113,6 +116,7 @@ end
|
|
113
116
|
|
114
117
|
require 'tmpdir'
|
115
118
|
require 'fileutils'
|
119
|
+
require 'shellwords'
|
116
120
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
117
121
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
118
122
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
@@ -85,6 +85,9 @@ Options:
|
|
85
85
|
opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
|
86
86
|
@options[:format] = t.split(',')
|
87
87
|
end
|
88
|
+
opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
|
89
|
+
@options[:density] = d
|
90
|
+
end
|
88
91
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
89
92
|
@options[:ocr] = o
|
90
93
|
end
|
@@ -4,9 +4,9 @@ module Docsplit
|
|
4
4
|
# nicely sized images.
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
|
-
DENSITY_ARG = "-density 150"
|
8
7
|
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
9
8
|
DEFAULT_FORMAT = :png
|
9
|
+
DEFAULT_DENSITY = '150'
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
|
12
12
|
# configuration in options.
|
@@ -32,16 +32,17 @@ module Docsplit
|
|
32
32
|
basename = File.basename(pdf, File.extname(pdf))
|
33
33
|
directory = directory_for(size)
|
34
34
|
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
35
|
+
escaped_pdf = ESCAPE[pdf]
|
35
36
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
36
|
-
common = "#{MEMORY_ARGS} #{
|
37
|
+
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
|
37
38
|
if previous
|
38
39
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
39
40
|
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
40
41
|
raise ExtractionFailed, result if $? != 0
|
41
42
|
else
|
42
43
|
page_list(pages).each do |page|
|
43
|
-
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
|
44
|
-
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common}
|
44
|
+
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
45
46
|
result = `#{cmd}`.chomp
|
46
47
|
raise ExtractionFailed, result if $? != 0
|
47
48
|
end
|
@@ -57,6 +58,7 @@ module Docsplit
|
|
57
58
|
def extract_options(options)
|
58
59
|
@output = options[:output] || '.'
|
59
60
|
@pages = options[:pages]
|
61
|
+
@density = options[:density] || DEFAULT_DENSITY
|
60
62
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
61
63
|
@sizes = [options[:size]].flatten.compact
|
62
64
|
@sizes = [nil] if @sizes.empty?
|
@@ -18,7 +18,7 @@ module Docsplit
|
|
18
18
|
# Pull out a single datum from a pdf.
|
19
19
|
def extract(key, pdfs, opts)
|
20
20
|
pdf = [pdfs].flatten.first
|
21
|
-
cmd = "pdfinfo #{pdf} 2>&1"
|
21
|
+
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
22
22
|
result = `#{cmd}`.chomp
|
23
23
|
raise ExtractionFailed, result if $? != 0
|
24
24
|
match = result.match(MATCHERS[key])
|
@@ -11,7 +11,7 @@ module Docsplit
|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
12
|
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
|
-
cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
|
14
|
+
cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
|
15
15
|
result = `#{cmd}`.chomp
|
16
16
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
17
17
|
raise ExtractionFailed, result if $? != 0
|
@@ -16,7 +16,7 @@ module Docsplit
|
|
16
16
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
|
-
OCR_FLAGS = '-density
|
19
|
+
OCR_FLAGS = '-density 400x400 -colorspace GRAY'
|
20
20
|
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
21
21
|
|
22
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
@@ -45,7 +45,7 @@ module Docsplit
|
|
45
45
|
|
46
46
|
# Does a PDF have any text embedded?
|
47
47
|
def contains_text?(pdf)
|
48
|
-
fonts = `pdffonts #{pdf} 2>&1`
|
48
|
+
fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
|
49
49
|
!fonts.match(NO_TEXT_DETECTED)
|
50
50
|
end
|
51
51
|
|
@@ -59,19 +59,22 @@ module Docsplit
|
|
59
59
|
def extract_from_ocr(pdf, pages)
|
60
60
|
tempdir = Dir.mktmpdir
|
61
61
|
base_path = File.join(@output, @pdf_name)
|
62
|
+
escaped_pdf = ESCAPE[pdf]
|
62
63
|
if pages
|
63
64
|
pages.each do |page|
|
64
65
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
66
|
+
escaped_tiff = ESCAPE[tiff]
|
65
67
|
file = "#{base_path}_#{page}"
|
66
|
-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{
|
67
|
-
run "tesseract #{
|
68
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
69
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
|
68
70
|
clean_text(file + '.txt') if @clean_ocr
|
69
71
|
FileUtils.remove_entry_secure tiff
|
70
72
|
end
|
71
73
|
else
|
72
74
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
73
|
-
|
74
|
-
run "
|
75
|
+
escaped_tiff = ESCAPE[tiff]
|
76
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
77
|
+
run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
|
75
78
|
clean_text(base_path + '.txt') if @clean_ocr
|
76
79
|
end
|
77
80
|
ensure
|
@@ -100,14 +103,14 @@ module Docsplit
|
|
100
103
|
# Extract the full contents of a pdf as a single file, directly.
|
101
104
|
def extract_full(pdf)
|
102
105
|
text_path = File.join(@output, "#{@pdf_name}.txt")
|
103
|
-
run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
|
106
|
+
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
104
107
|
end
|
105
108
|
|
106
109
|
# Extract the contents of a single page of text, directly, adding it to
|
107
110
|
# the `@pages_to_ocr` list if the text length is inadequate.
|
108
111
|
def extract_page(pdf, page)
|
109
112
|
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
|
110
|
-
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
|
113
|
+
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
111
114
|
unless @forbid_ocr
|
112
115
|
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
|
113
116
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 5
|
9
|
-
- 2
|
10
|
-
version: 0.5.2
|
8
|
+
- 6
|
9
|
+
- 0
|
10
|
+
version: 0.6.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-09-13 00:00:00 Z
|
20
20
|
dependencies: []
|
21
21
|
|
22
22
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|