docsplit 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-5'
3
+ s.version = '0.3.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-10'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.0' # Keep in sync with gemspec.
4
+ VERSION = '0.3.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -16,7 +16,8 @@ module Docsplit
16
16
 
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
- OCR_FLAGS = '-density 200x200 -colorspace GRAY'
19
+ OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
+ MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
20
21
 
21
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
22
23
 
@@ -61,14 +62,14 @@ module Docsplit
61
62
  @tempdir ||= Dir.mktmpdir
62
63
  base_path = File.join(@output, @pdf_name)
63
64
  if pages
64
- run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
+ run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
66
  @tiffs_generated = true
66
67
  pages.each do |page|
67
68
  run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
68
69
  end
69
70
  else
70
71
  tiff = "#{@tempdir}/#{@pdf_name}.tif"
71
- run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
+ run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
73
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
73
74
  end
74
75
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 0
9
- version: 0.3.0
8
+ - 1
9
+ version: 0.3.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jeremy Ashkenas
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-05 00:00:00 -04:00
18
+ date: 2010-08-10 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies: []
21
21