docsplit 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/image_extractor.rb +17 -33
- data/lib/docsplit/text_extractor.rb +6 -6
- metadata +4 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.3' # Keep version in sync with docsplit.rb
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.4' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-20'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ module Docsplit
|
|
5
5
|
class ImageExtractor
|
6
6
|
|
7
7
|
DENSITY_ARG = "-density 150"
|
8
|
-
MEMORY_ARGS = "-limit memory
|
8
|
+
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
|
9
9
|
DEFAULT_FORMAT = :png
|
10
10
|
|
11
11
|
# Extract a list of PDFs as rasterized page images, according to the
|
@@ -23,22 +23,29 @@ module Docsplit
|
|
23
23
|
end
|
24
24
|
|
25
25
|
# Convert a single PDF into page images at the specified size and format.
|
26
|
+
# If `--rolling`, and we have a previous image at a larger size to work with,
|
27
|
+
# we simply downsample that image, instead of re-rendering the entire PDF.
|
28
|
+
# Now we generate one page at a time, a counterintuitive optimization
|
29
|
+
# suggested by the GraphicsMagick list, that seems to work quite well.
|
26
30
|
def convert(pdf, size, format, previous=nil)
|
27
31
|
tempdir = Dir.mktmpdir
|
28
32
|
basename = File.basename(pdf, File.extname(pdf))
|
29
33
|
directory = directory_for(size)
|
34
|
+
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
|
30
35
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
31
|
-
out_file = File.join(directory, "#{basename}_%05d.#{format}")
|
32
36
|
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
33
37
|
if previous
|
34
38
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
35
|
-
|
39
|
+
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
|
40
|
+
raise ExtractionFailed, result if $? != 0
|
36
41
|
else
|
37
|
-
|
42
|
+
page_list(pages).each do |page|
|
43
|
+
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
|
44
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
|
45
|
+
result = `#{cmd}`.chomp
|
46
|
+
raise ExtractionFailed, result if $? != 0
|
47
|
+
end
|
38
48
|
end
|
39
|
-
result = `#{cmd}`.chomp
|
40
|
-
raise ExtractionFailed, result if $? != 0
|
41
|
-
renumber_images(out_file, format)
|
42
49
|
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
43
50
|
end
|
44
51
|
|
@@ -76,39 +83,16 @@ module Docsplit
|
|
76
83
|
end
|
77
84
|
end
|
78
85
|
|
79
|
-
# Generate the requested page index into the document.
|
80
|
-
def pages_arg
|
81
|
-
return '' if @pages.nil?
|
82
|
-
pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
|
83
|
-
"[#{pages}]"
|
84
|
-
end
|
85
|
-
|
86
86
|
# Generate the expanded list of requested page numbers.
|
87
|
-
def page_list
|
88
|
-
|
87
|
+
def page_list(pages)
|
88
|
+
pages.split(',').map { |range|
|
89
89
|
if range.include?('-')
|
90
90
|
range = range.split('-')
|
91
91
|
Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
|
92
92
|
else
|
93
93
|
range.to_i
|
94
94
|
end
|
95
|
-
}.flatten.sort
|
96
|
-
end
|
97
|
-
|
98
|
-
# When GraphicsMagick is through, it will have generated a number of
|
99
|
-
# incrementing page images, starting at 0. Renumber them with their correct
|
100
|
-
# page numbers.
|
101
|
-
def renumber_images(template, format)
|
102
|
-
suffixer = /_0+(\d+)\.#{format}\Z/
|
103
|
-
images = Dir[template.sub('%05d', '0*')].map do |path|
|
104
|
-
index = path[suffixer, 1].to_i
|
105
|
-
{:path => path, :index => index, :page_number => index + 1}
|
106
|
-
end
|
107
|
-
numbers = @pages ? page_list.reverse : nil
|
108
|
-
images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
|
109
|
-
number = numbers ? numbers[i] : image[:page_number]
|
110
|
-
FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
|
111
|
-
end
|
95
|
+
}.flatten.uniq.sort
|
112
96
|
end
|
113
97
|
|
114
98
|
end
|
@@ -17,13 +17,12 @@ module Docsplit
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
19
|
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
|
20
|
-
MEMORY_ARGS = '-limit memory
|
20
|
+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
|
21
21
|
|
22
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
23
23
|
|
24
24
|
def initialize
|
25
|
-
@
|
26
|
-
@pages_to_ocr = []
|
25
|
+
@pages_to_ocr = []
|
27
26
|
end
|
28
27
|
|
29
28
|
# Extract text from a list of PDFs.
|
@@ -61,10 +60,11 @@ module Docsplit
|
|
61
60
|
tempdir = Dir.mktmpdir
|
62
61
|
base_path = File.join(@output, @pdf_name)
|
63
62
|
if pages
|
64
|
-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
|
-
@tiffs_generated = true
|
66
63
|
pages.each do |page|
|
67
|
-
|
64
|
+
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
65
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
|
66
|
+
run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
|
67
|
+
FileUtils.remove_entry_secure tiff
|
68
68
|
end
|
69
69
|
else
|
70
70
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 3
|
10
|
-
version: 0.3.3
|
9
|
+
- 4
|
10
|
+
version: 0.3.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-08-
|
19
|
+
date: 2010-08-20 00:00:00 -04:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|