docsplit 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-10'
3
+ s.version = '0.3.3' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.1' # Keep in sync with gemspec.
4
+ VERSION = '0.3.3' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -24,6 +24,7 @@ module Docsplit
24
24
 
25
25
  # Convert a single PDF into page images at the specified size and format.
26
26
  def convert(pdf, size, format, previous=nil)
27
+ tempdir = Dir.mktmpdir
27
28
  basename = File.basename(pdf, File.extname(pdf))
28
29
  directory = directory_for(size)
29
30
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
@@ -31,13 +32,14 @@ module Docsplit
31
32
  common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
32
33
  if previous
33
34
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
34
- cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
36
  else
36
- cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
38
  end
38
39
  result = `#{cmd}`.chomp
39
40
  raise ExtractionFailed, result if $? != 0
40
41
  renumber_images(out_file, format)
42
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
41
43
  end
42
44
 
43
45
 
@@ -42,7 +42,6 @@ module Docsplit
42
42
  end
43
43
  end
44
44
  end
45
- FileUtils.remove_entry_secure @tempdir if @tempdir
46
45
  end
47
46
 
48
47
  # Does a PDF have any text embedded?
@@ -59,19 +58,20 @@ module Docsplit
59
58
 
60
59
  # Extract a page range worth of text from a PDF via OCR.
61
60
  def extract_from_ocr(pdf, pages)
62
- @tempdir ||= Dir.mktmpdir
61
+ tempdir = Dir.mktmpdir
63
62
  base_path = File.join(@output, @pdf_name)
64
63
  if pages
65
- run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
64
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
66
65
  @tiffs_generated = true
67
66
  pages.each do |page|
68
- run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
67
+ run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
69
68
  end
70
69
  else
71
- tiff = "#{@tempdir}/#{@pdf_name}.tif"
72
- run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
70
+ tiff = "#{tempdir}/#{@pdf_name}.tif"
71
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
73
72
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
74
73
  end
74
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
75
75
  end
76
76
 
77
77
 
@@ -12,9 +12,9 @@ module Docsplit
12
12
  if ext.downcase == '.pdf'
13
13
  doc
14
14
  else
15
- @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
16
- extract_pdf([doc], {:output => @tempdir})
17
- File.join(@tempdir, File.basename(doc, ext) + '.pdf')
15
+ tempdir = File.join(Dir.tmpdir, 'docsplit')
16
+ extract_pdf([doc], {:output => tempdir})
17
+ File.join(tempdir, File.basename(doc, ext) + '.pdf')
18
18
  end
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 21
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 3
8
- - 1
9
- version: 0.3.1
9
+ - 3
10
+ version: 0.3.3
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jeremy Ashkenas
@@ -15,7 +16,7 @@ autorequire:
15
16
  bindir: bin
16
17
  cert_chain: []
17
18
 
18
- date: 2010-08-10 00:00:00 -04:00
19
+ date: 2010-08-17 00:00:00 -04:00
19
20
  default_executable:
20
21
  dependencies: []
21
22
 
@@ -60,23 +61,27 @@ rdoc_options: []
60
61
  require_paths:
61
62
  - lib
62
63
  required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
63
65
  requirements:
64
66
  - - ">="
65
67
  - !ruby/object:Gem::Version
68
+ hash: 3
66
69
  segments:
67
70
  - 0
68
71
  version: "0"
69
72
  required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
70
74
  requirements:
71
75
  - - ">="
72
76
  - !ruby/object:Gem::Version
77
+ hash: 3
73
78
  segments:
74
79
  - 0
75
80
  version: "0"
76
81
  requirements: []
77
82
 
78
83
  rubyforge_project: docsplit
79
- rubygems_version: 1.3.6
84
+ rubygems_version: 1.3.7
80
85
  signing_key:
81
86
  specification_version: 3
82
87
  summary: Break Apart Documents into Images, Text, Pages and PDFs