docsplit 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.3' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-17'
3
+ s.version = '0.3.4' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-20'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.3' # Keep in sync with gemspec.
4
+ VERSION = '0.3.4' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -5,7 +5,7 @@ module Docsplit
5
5
  class ImageExtractor
6
6
 
7
7
  DENSITY_ARG = "-density 150"
8
- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
8
+ MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
9
9
  DEFAULT_FORMAT = :png
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
@@ -23,22 +23,29 @@ module Docsplit
23
23
  end
24
24
 
25
25
  # Convert a single PDF into page images at the specified size and format.
26
+ # If `--rolling`, and we have a previous image at a larger size to work with,
27
+ # we simply downsample that image, instead of re-rendering the entire PDF.
28
+ # Now we generate one page at a time, a counterintuitive optimization
29
+ # suggested by the GraphicsMagick list, that seems to work quite well.
26
30
  def convert(pdf, size, format, previous=nil)
27
31
  tempdir = Dir.mktmpdir
28
32
  basename = File.basename(pdf, File.extname(pdf))
29
33
  directory = directory_for(size)
34
+ pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
30
35
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
31
- out_file = File.join(directory, "#{basename}_%05d.#{format}")
32
36
  common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
33
37
  if previous
34
38
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
35
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
39
+ result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
40
+ raise ExtractionFailed, result if $? != 0
36
41
  else
37
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
42
+ page_list(pages).each do |page|
43
+ out_file = File.join(directory, "#{basename}_#{page}.#{format}")
44
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
45
+ result = `#{cmd}`.chomp
46
+ raise ExtractionFailed, result if $? != 0
47
+ end
38
48
  end
39
- result = `#{cmd}`.chomp
40
- raise ExtractionFailed, result if $? != 0
41
- renumber_images(out_file, format)
42
49
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
43
50
  end
44
51
 
@@ -76,39 +83,16 @@ module Docsplit
76
83
  end
77
84
  end
78
85
 
79
- # Generate the requested page index into the document.
80
- def pages_arg
81
- return '' if @pages.nil?
82
- pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
83
- "[#{pages}]"
84
- end
85
-
86
86
  # Generate the expanded list of requested page numbers.
87
- def page_list
88
- @pages.split(',').map { |range|
87
+ def page_list(pages)
88
+ pages.split(',').map { |range|
89
89
  if range.include?('-')
90
90
  range = range.split('-')
91
91
  Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
92
92
  else
93
93
  range.to_i
94
94
  end
95
- }.flatten.sort
96
- end
97
-
98
- # When GraphicsMagick is through, it will have generated a number of
99
- # incrementing page images, starting at 0. Renumber them with their correct
100
- # page numbers.
101
- def renumber_images(template, format)
102
- suffixer = /_0+(\d+)\.#{format}\Z/
103
- images = Dir[template.sub('%05d', '0*')].map do |path|
104
- index = path[suffixer, 1].to_i
105
- {:path => path, :index => index, :page_number => index + 1}
106
- end
107
- numbers = @pages ? page_list.reverse : nil
108
- images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
109
- number = numbers ? numbers[i] : image[:page_number]
110
- FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
111
- end
95
+ }.flatten.uniq.sort
112
96
  end
113
97
 
114
98
  end
@@ -17,13 +17,12 @@ module Docsplit
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
19
  OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
- MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
20
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
21
21
 
22
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
23
23
 
24
24
  def initialize
25
- @tiffs_generated = false
26
- @pages_to_ocr = []
25
+ @pages_to_ocr = []
27
26
  end
28
27
 
29
28
  # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ module Docsplit
61
60
  tempdir = Dir.mktmpdir
62
61
  base_path = File.join(@output, @pdf_name)
63
62
  if pages
64
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
- @tiffs_generated = true
66
63
  pages.each do |page|
67
- run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
64
+ tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66
+ run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67
+ FileUtils.remove_entry_secure tiff
68
68
  end
69
69
  else
70
70
  tiff = "#{tempdir}/#{@pdf_name}.tif"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 3
10
- version: 0.3.3
9
+ - 4
10
+ version: 0.3.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-08-17 00:00:00 -04:00
19
+ date: 2010-08-20 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies: []
22
22