docsplit 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/text_extractor.rb +4 -3
- metadata +3 -3
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.0'
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.1' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-10'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit/text_extractor.rb
CHANGED
@@ -16,7 +16,8 @@ module Docsplit
|
|
16
16
|
|
17
17
|
NO_TEXT_DETECTED = /---------\n\Z/
|
18
18
|
|
19
|
-
OCR_FLAGS
|
19
|
+
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
|
20
|
+
MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
|
20
21
|
|
21
22
|
MIN_TEXT_PER_PAGE = 100 # in bytes
|
22
23
|
|
@@ -61,14 +62,14 @@ module Docsplit
|
|
61
62
|
@tempdir ||= Dir.mktmpdir
|
62
63
|
base_path = File.join(@output, @pdf_name)
|
63
64
|
if pages
|
64
|
-
run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
|
+
run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
65
66
|
@tiffs_generated = true
|
66
67
|
pages.each do |page|
|
67
68
|
run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
|
68
69
|
end
|
69
70
|
else
|
70
71
|
tiff = "#{@tempdir}/#{@pdf_name}.tif"
|
71
|
-
run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
|
+
run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
73
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
73
74
|
end
|
74
75
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
- 0
|
9
|
-
version: 0.3.0
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jeremy Ashkenas
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-10 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|