docsplit 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.3' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-17'
3
+ s.version = '0.3.4' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-20'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.3' # Keep in sync with gemspec.
4
+ VERSION = '0.3.4' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -5,7 +5,7 @@ module Docsplit
5
5
  class ImageExtractor
6
6
 
7
7
  DENSITY_ARG = "-density 150"
8
- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
8
+ MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
9
9
  DEFAULT_FORMAT = :png
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
@@ -23,22 +23,29 @@ module Docsplit
23
23
  end
24
24
 
25
25
  # Convert a single PDF into page images at the specified size and format.
26
+ # If `--rolling`, and we have a previous image at a larger size to work with,
27
+ # we simply downsample that image, instead of re-rendering the entire PDF.
28
+ # Now we generate one page at a time, a counterintuitive optimization
29
+ # suggested by the GraphicsMagick list, that seems to work quite well.
26
30
  def convert(pdf, size, format, previous=nil)
27
31
  tempdir = Dir.mktmpdir
28
32
  basename = File.basename(pdf, File.extname(pdf))
29
33
  directory = directory_for(size)
34
+ pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
30
35
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
31
- out_file = File.join(directory, "#{basename}_%05d.#{format}")
32
36
  common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
33
37
  if previous
34
38
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
35
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
39
+ result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
40
+ raise ExtractionFailed, result if $? != 0
36
41
  else
37
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
42
+ page_list(pages).each do |page|
43
+ out_file = File.join(directory, "#{basename}_#{page}.#{format}")
44
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
45
+ result = `#{cmd}`.chomp
46
+ raise ExtractionFailed, result if $? != 0
47
+ end
38
48
  end
39
- result = `#{cmd}`.chomp
40
- raise ExtractionFailed, result if $? != 0
41
- renumber_images(out_file, format)
42
49
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
43
50
  end
44
51
 
@@ -76,39 +83,16 @@ module Docsplit
76
83
  end
77
84
  end
78
85
 
79
- # Generate the requested page index into the document.
80
- def pages_arg
81
- return '' if @pages.nil?
82
- pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
83
- "[#{pages}]"
84
- end
85
-
86
86
  # Generate the expanded list of requested page numbers.
87
- def page_list
88
- @pages.split(',').map { |range|
87
+ def page_list(pages)
88
+ pages.split(',').map { |range|
89
89
  if range.include?('-')
90
90
  range = range.split('-')
91
91
  Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
92
92
  else
93
93
  range.to_i
94
94
  end
95
- }.flatten.sort
96
- end
97
-
98
- # When GraphicsMagick is through, it will have generated a number of
99
- # incrementing page images, starting at 0. Renumber them with their correct
100
- # page numbers.
101
- def renumber_images(template, format)
102
- suffixer = /_0+(\d+)\.#{format}\Z/
103
- images = Dir[template.sub('%05d', '0*')].map do |path|
104
- index = path[suffixer, 1].to_i
105
- {:path => path, :index => index, :page_number => index + 1}
106
- end
107
- numbers = @pages ? page_list.reverse : nil
108
- images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
109
- number = numbers ? numbers[i] : image[:page_number]
110
- FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
111
- end
95
+ }.flatten.uniq.sort
112
96
  end
113
97
 
114
98
  end
@@ -17,13 +17,12 @@ module Docsplit
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
19
  OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
- MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
20
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
21
21
 
22
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
23
23
 
24
24
  def initialize
25
- @tiffs_generated = false
26
- @pages_to_ocr = []
25
+ @pages_to_ocr = []
27
26
  end
28
27
 
29
28
  # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ module Docsplit
61
60
  tempdir = Dir.mktmpdir
62
61
  base_path = File.join(@output, @pdf_name)
63
62
  if pages
64
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
- @tiffs_generated = true
66
63
  pages.each do |page|
67
- run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
64
+ tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66
+ run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67
+ FileUtils.remove_entry_secure tiff
68
68
  end
69
69
  else
70
70
  tiff = "#{tempdir}/#{@pdf_name}.tif"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 3
10
- version: 0.3.3
9
+ - 4
10
+ version: 0.3.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-08-17 00:00:00 -04:00
19
+ date: 2010-08-20 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies: []
22
22