docsplit 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/text_extractor.rb +4 -3
- metadata +3 -3
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.0' # Keep version in sync with docsplit.rb
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.1' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-10'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit/text_extractor.rb
CHANGED
@@ -16,7 +16,8 @@ module Docsplit
|
|
16
16
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
|
-
OCR_FLAGS
|
19
|
+
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
|
20
|
+
MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
|
20
21
|
|
21
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
22
23
|
|
@@ -61,14 +62,14 @@ module Docsplit
|
|
61
62
|
@tempdir ||= Dir.mktmpdir
|
62
63
|
base_path = File.join(@output, @pdf_name)
|
63
64
|
if pages
|
64
|
-
run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
|
+
run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
66
|
@tiffs_generated = true
|
66
67
|
pages.each do |page|
|
67
68
|
run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
|
68
69
|
end
|
69
70
|
else
|
70
71
|
tiff = "#{@tempdir}/#{@pdf_name}.tif"
|
71
|
-
run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
|
+
run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
73
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
73
74
|
end
|
74
75
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
- 0
|
9
|
-
version: 0.3.0
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jeremy Ashkenas
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-10 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|