docsplit 0.3.0 → 0.3.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-5'
3
+ s.version = '0.3.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-10'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.0' # Keep in sync with gemspec.
4
+ VERSION = '0.3.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -16,7 +16,8 @@ module Docsplit
16
16
 
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
- OCR_FLAGS = '-density 200x200 -colorspace GRAY'
19
+ OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
+ MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
20
21
 
21
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
22
23
 
@@ -61,14 +62,14 @@ module Docsplit
61
62
  @tempdir ||= Dir.mktmpdir
62
63
  base_path = File.join(@output, @pdf_name)
63
64
  if pages
64
- run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
+ run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
66
  @tiffs_generated = true
66
67
  pages.each do |page|
67
68
  run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
68
69
  end
69
70
  else
70
71
  tiff = "#{@tempdir}/#{@pdf_name}.tif"
71
- run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
+ run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
73
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
73
74
  end
74
75
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 0
9
- version: 0.3.0
8
+ - 1
9
+ version: 0.3.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jeremy Ashkenas
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-05 00:00:00 -04:00
18
+ date: 2010-08-10 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies: []
21
21