docsplit 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-10'
3
+ s.version = '0.3.3' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.1' # Keep in sync with gemspec.
4
+ VERSION = '0.3.3' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -24,6 +24,7 @@ module Docsplit
24
24
 
25
25
  # Convert a single PDF into page images at the specified size and format.
26
26
  def convert(pdf, size, format, previous=nil)
27
+ tempdir = Dir.mktmpdir
27
28
  basename = File.basename(pdf, File.extname(pdf))
28
29
  directory = directory_for(size)
29
30
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
@@ -31,13 +32,14 @@ module Docsplit
31
32
  common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
32
33
  if previous
33
34
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
34
- cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
36
  else
36
- cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
38
  end
38
39
  result = `#{cmd}`.chomp
39
40
  raise ExtractionFailed, result if $? != 0
40
41
  renumber_images(out_file, format)
42
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
41
43
  end
42
44
 
43
45
 
@@ -42,7 +42,6 @@ module Docsplit
42
42
  end
43
43
  end
44
44
  end
45
- FileUtils.remove_entry_secure @tempdir if @tempdir
46
45
  end
47
46
 
48
47
  # Does a PDF have any text embedded?
@@ -59,19 +58,20 @@ module Docsplit
59
58
 
60
59
  # Extract a page range worth of text from a PDF via OCR.
61
60
  def extract_from_ocr(pdf, pages)
62
- @tempdir ||= Dir.mktmpdir
61
+ tempdir = Dir.mktmpdir
63
62
  base_path = File.join(@output, @pdf_name)
64
63
  if pages
65
- run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
64
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
66
65
  @tiffs_generated = true
67
66
  pages.each do |page|
68
- run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
67
+ run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
69
68
  end
70
69
  else
71
- tiff = "#{@tempdir}/#{@pdf_name}.tif"
72
- run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
70
+ tiff = "#{tempdir}/#{@pdf_name}.tif"
71
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
73
72
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
74
73
  end
74
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
75
75
  end
76
76
 
77
77
 
@@ -12,9 +12,9 @@ module Docsplit
12
12
  if ext.downcase == '.pdf'
13
13
  doc
14
14
  else
15
- @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
16
- extract_pdf([doc], {:output => @tempdir})
17
- File.join(@tempdir, File.basename(doc, ext) + '.pdf')
15
+ tempdir = File.join(Dir.tmpdir, 'docsplit')
16
+ extract_pdf([doc], {:output => tempdir})
17
+ File.join(tempdir, File.basename(doc, ext) + '.pdf')
18
18
  end
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 21
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 3
8
- - 1
9
- version: 0.3.1
9
+ - 3
10
+ version: 0.3.3
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jeremy Ashkenas
@@ -15,7 +16,7 @@ autorequire:
15
16
  bindir: bin
16
17
  cert_chain: []
17
18
 
18
- date: 2010-08-10 00:00:00 -04:00
19
+ date: 2010-08-17 00:00:00 -04:00
19
20
  default_executable:
20
21
  dependencies: []
21
22
 
@@ -60,23 +61,27 @@ rdoc_options: []
60
61
  require_paths:
61
62
  - lib
62
63
  required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
63
65
  requirements:
64
66
  - - ">="
65
67
  - !ruby/object:Gem::Version
68
+ hash: 3
66
69
  segments:
67
70
  - 0
68
71
  version: "0"
69
72
  required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
70
74
  requirements:
71
75
  - - ">="
72
76
  - !ruby/object:Gem::Version
77
+ hash: 3
73
78
  segments:
74
79
  - 0
75
80
  version: "0"
76
81
  requirements: []
77
82
 
78
83
  rubyforge_project: docsplit
79
- rubygems_version: 1.3.6
84
+ rubygems_version: 1.3.7
80
85
  signing_key:
81
86
  specification_version: 3
82
87
  summary: Break Apart Documents into Images, Text, Pages and PDFs