docsplit 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +10 -6
- data/lib/docsplit/command_line.rb +3 -0
- data/lib/docsplit/image_extractor.rb +6 -4
- data/lib/docsplit/info_extractor.rb +1 -1
- data/lib/docsplit/page_extractor.rb +1 -1
- data/lib/docsplit/text_extractor.rb +11 -8
- metadata +5 -5
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.5.2' # Keep version in sync with docsplit.rb
|
4
|
-
s.date = '2011-
|
3
|
+
s.version = '0.6.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2011-09-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.5.2' # Keep in sync with gemspec.
|
4
|
+
VERSION = '0.6.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -19,7 +19,9 @@ module Docsplit
|
|
19
19
|
|
20
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
21
21
|
|
22
|
-
|
22
|
+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
23
|
+
|
24
|
+
# Check for all dependencies, and note their absence.
|
23
25
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
24
26
|
DEPENDENCIES.each_key do |dep|
|
25
27
|
dirs.each do |dir|
|
@@ -28,7 +30,6 @@ module Docsplit
|
|
28
30
|
break
|
29
31
|
end
|
30
32
|
end
|
31
|
-
warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
|
32
33
|
end
|
33
34
|
|
34
35
|
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
@@ -62,11 +63,13 @@ module Docsplit
|
|
62
63
|
[docs].flatten.each do |doc|
|
63
64
|
ext = File.extname(doc)
|
64
65
|
basename = File.basename(doc, ext)
|
65
|
-
|
66
|
-
|
66
|
+
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
67
|
+
|
68
|
+
if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
|
69
|
+
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
67
70
|
else
|
68
71
|
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
69
|
-
run "#{options}
|
72
|
+
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
70
73
|
end
|
71
74
|
end
|
72
75
|
end
|
@@ -113,6 +116,7 @@ end
|
|
113
116
|
|
114
117
|
require 'tmpdir'
|
115
118
|
require 'fileutils'
|
119
|
+
require 'shellwords'
|
116
120
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
117
121
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
118
122
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
@@ -85,6 +85,9 @@ Options:
|
|
85
85
|
opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
|
86
86
|
@options[:format] = t.split(',')
|
87
87
|
end
|
88
|
+
opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
|
89
|
+
@options[:density] = d
|
90
|
+
end
|
88
91
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
89
92
|
@options[:ocr] = o
|
90
93
|
end
|
@@ -4,9 +4,9 @@ module Docsplit
|
|
4
4
|
# nicely sized images.
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
|
-
DENSITY_ARG = "-density 150"
|
8
7
|
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
9
8
|
DEFAULT_FORMAT = :png
|
9
|
+
DEFAULT_DENSITY = '150'
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
|
12
12
|
# configuration in options.
|
@@ -32,16 +32,17 @@ module Docsplit
|
|
32
32
|
basename = File.basename(pdf, File.extname(pdf))
|
33
33
|
directory = directory_for(size)
|
34
34
|
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
35
|
+
escaped_pdf = ESCAPE[pdf]
|
35
36
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
36
|
-
common = "#{MEMORY_ARGS} #{
|
37
|
+
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
|
37
38
|
if previous
|
38
39
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
39
40
|
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
40
41
|
raise ExtractionFailed, result if $? != 0
|
41
42
|
else
|
42
43
|
page_list(pages).each do |page|
|
43
|
-
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
|
44
|
-
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common}
|
44
|
+
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
45
46
|
result = `#{cmd}`.chomp
|
46
47
|
raise ExtractionFailed, result if $? != 0
|
47
48
|
end
|
@@ -57,6 +58,7 @@ module Docsplit
|
|
57
58
|
def extract_options(options)
|
58
59
|
@output = options[:output] || '.'
|
59
60
|
@pages = options[:pages]
|
61
|
+
@density = options[:density] || DEFAULT_DENSITY
|
60
62
|
@formats = [options[:format] || DEFAULT_FORMAT].flatten
|
61
63
|
@sizes = [options[:size]].flatten.compact
|
62
64
|
@sizes = [nil] if @sizes.empty?
|
@@ -18,7 +18,7 @@ module Docsplit
|
|
18
18
|
# Pull out a single datum from a pdf.
|
19
19
|
def extract(key, pdfs, opts)
|
20
20
|
pdf = [pdfs].flatten.first
|
21
|
-
cmd = "pdfinfo #{pdf} 2>&1"
|
21
|
+
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
22
22
|
result = `#{cmd}`.chomp
|
23
23
|
raise ExtractionFailed, result if $? != 0
|
24
24
|
match = result.match(MATCHERS[key])
|
@@ -11,7 +11,7 @@ module Docsplit
|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
12
|
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
|
-
cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
|
14
|
+
cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
|
15
15
|
result = `#{cmd}`.chomp
|
16
16
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
17
17
|
raise ExtractionFailed, result if $? != 0
|
@@ -16,7 +16,7 @@ module Docsplit
|
|
16
16
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
|
-
OCR_FLAGS = '-density
|
19
|
+
OCR_FLAGS = '-density 400x400 -colorspace GRAY'
|
20
20
|
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
21
21
|
|
22
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
@@ -45,7 +45,7 @@ module Docsplit
|
|
45
45
|
|
46
46
|
# Does a PDF have any text embedded?
|
47
47
|
def contains_text?(pdf)
|
48
|
-
fonts = `pdffonts #{pdf} 2>&1`
|
48
|
+
fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
|
49
49
|
!fonts.match(NO_TEXT_DETECTED)
|
50
50
|
end
|
51
51
|
|
@@ -59,19 +59,22 @@ module Docsplit
|
|
59
59
|
def extract_from_ocr(pdf, pages)
|
60
60
|
tempdir = Dir.mktmpdir
|
61
61
|
base_path = File.join(@output, @pdf_name)
|
62
|
+
escaped_pdf = ESCAPE[pdf]
|
62
63
|
if pages
|
63
64
|
pages.each do |page|
|
64
65
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
66
|
+
escaped_tiff = ESCAPE[tiff]
|
65
67
|
file = "#{base_path}_#{page}"
|
66
|
-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{
|
67
|
-
run "tesseract #{
|
68
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
69
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
|
68
70
|
clean_text(file + '.txt') if @clean_ocr
|
69
71
|
FileUtils.remove_entry_secure tiff
|
70
72
|
end
|
71
73
|
else
|
72
74
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
73
|
-
|
74
|
-
run "
|
75
|
+
escaped_tiff = ESCAPE[tiff]
|
76
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
77
|
+
run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
|
75
78
|
clean_text(base_path + '.txt') if @clean_ocr
|
76
79
|
end
|
77
80
|
ensure
|
@@ -100,14 +103,14 @@ module Docsplit
|
|
100
103
|
# Extract the full contents of a pdf as a single file, directly.
|
101
104
|
def extract_full(pdf)
|
102
105
|
text_path = File.join(@output, "#{@pdf_name}.txt")
|
103
|
-
run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
|
106
|
+
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
104
107
|
end
|
105
108
|
|
106
109
|
# Extract the contents of a single page of text, directly, adding it to
|
107
110
|
# the `@pages_to_ocr` list if the text length is inadequate.
|
108
111
|
def extract_page(pdf, page)
|
109
112
|
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
|
110
|
-
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
|
113
|
+
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
|
111
114
|
unless @forbid_ocr
|
112
115
|
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
|
113
116
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 5
|
9
|
-
- 2
|
10
|
-
version: 0.5.2
|
8
|
+
- 6
|
9
|
+
- 0
|
10
|
+
version: 0.6.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-09-13 00:00:00 Z
|
20
20
|
dependencies: []
|
21
21
|
|
22
22
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|