docsplit 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/image_extractor.rb +17 -33
- data/lib/docsplit/text_extractor.rb +6 -6
- metadata +4 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.3'
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.4' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-20'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ module Docsplit
|
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
7
|
DENSITY_ARG = "-density 150"
|
8
|
-
MEMORY_ARGS = "-limit memory
|
8
|
+
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
9
9
|
DEFAULT_FORMAT = :png
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
|
@@ -23,22 +23,29 @@ module Docsplit
|
|
23
23
|
end
|
24
24
|
|
25
25
|
# Convert a single PDF into page images at the specified size and format.
|
26
|
+
# If `--rolling`, and we have a previous image at a larger size to work with,
|
27
|
+
# we simply downsample that image, instead of re-rendering the entire PDF.
|
28
|
+
# Now we generate one page at a time, a counterintuitive optimization
|
29
|
+
# suggested by the GraphicsMagick list, that seems to work quite well.
|
26
30
|
def convert(pdf, size, format, previous=nil)
|
27
31
|
tempdir = Dir.mktmpdir
|
28
32
|
basename = File.basename(pdf, File.extname(pdf))
|
29
33
|
directory = directory_for(size)
|
34
|
+
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
30
35
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
31
|
-
out_file = File.join(directory, "#{basename}_%05d.#{format}")
|
32
36
|
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
33
37
|
if previous
|
34
38
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
35
|
-
|
39
|
+
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
40
|
+
raise ExtractionFailed, result if $? != 0
|
36
41
|
else
|
37
|
-
|
42
|
+
page_list(pages).each do |page|
|
43
|
+
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
|
44
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
|
45
|
+
result = `#{cmd}`.chomp
|
46
|
+
raise ExtractionFailed, result if $? != 0
|
47
|
+
end
|
38
48
|
end
|
39
|
-
result = `#{cmd}`.chomp
|
40
|
-
raise ExtractionFailed, result if $? != 0
|
41
|
-
renumber_images(out_file, format)
|
42
49
|
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
43
50
|
end
|
44
51
|
|
@@ -76,39 +83,16 @@ module Docsplit
|
|
76
83
|
end
|
77
84
|
end
|
78
85
|
|
79
|
-
# Generate the requested page index into the document.
|
80
|
-
def pages_arg
|
81
|
-
return '' if @pages.nil?
|
82
|
-
pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
|
83
|
-
"[#{pages}]"
|
84
|
-
end
|
85
|
-
|
86
86
|
# Generate the expanded list of requested page numbers.
|
87
|
-
def page_list
|
88
|
-
|
87
|
+
def page_list(pages)
|
88
|
+
pages.split(',').map { |range|
|
89
89
|
if range.include?('-')
|
90
90
|
range = range.split('-')
|
91
91
|
Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
|
92
92
|
else
|
93
93
|
range.to_i
|
94
94
|
end
|
95
|
-
}.flatten.sort
|
96
|
-
end
|
97
|
-
|
98
|
-
# When GraphicsMagick is through, it will have generated a number of
|
99
|
-
# incrementing page images, starting at 0. Renumber them with their correct
|
100
|
-
# page numbers.
|
101
|
-
def renumber_images(template, format)
|
102
|
-
suffixer = /_0+(\d+)\.#{format}\Z/
|
103
|
-
images = Dir[template.sub('%05d', '0*')].map do |path|
|
104
|
-
index = path[suffixer, 1].to_i
|
105
|
-
{:path => path, :index => index, :page_number => index + 1}
|
106
|
-
end
|
107
|
-
numbers = @pages ? page_list.reverse : nil
|
108
|
-
images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
|
109
|
-
number = numbers ? numbers[i] : image[:page_number]
|
110
|
-
FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
|
111
|
-
end
|
95
|
+
}.flatten.uniq.sort
|
112
96
|
end
|
113
97
|
|
114
98
|
end
|
@@ -17,13 +17,12 @@ module Docsplit
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
19
|
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
|
20
|
-
MEMORY_ARGS = '-limit memory
|
20
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
21
21
|
|
22
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
23
23
|
|
24
24
|
def initialize
|
25
|
-
@
|
26
|
-
@pages_to_ocr = []
|
25
|
+
@pages_to_ocr = []
|
27
26
|
end
|
28
27
|
|
29
28
|
# Extract text from a list of PDFs.
|
@@ -61,10 +60,11 @@ module Docsplit
|
|
61
60
|
tempdir = Dir.mktmpdir
|
62
61
|
base_path = File.join(@output, @pdf_name)
|
63
62
|
if pages
|
64
|
-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
|
-
@tiffs_generated = true
|
66
63
|
pages.each do |page|
|
67
|
-
|
64
|
+
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
65
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
|
66
|
+
run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
|
67
|
+
FileUtils.remove_entry_secure tiff
|
68
68
|
end
|
69
69
|
else
|
70
70
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 3
|
10
|
-
version: 0.3.3
|
9
|
+
- 4
|
10
|
+
version: 0.3.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-08-
|
19
|
+
date: 2010-08-20 00:00:00 -04:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|