burisu-docsplit 0.7.7 → 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 370a33126963926b13bef202fb15e05127a02db0
4
- data.tar.gz: 76024f613e3ad9a339cc207ac428037c4ba6f7ef
3
+ metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
4
+ data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
5
5
  SHA512:
6
- metadata.gz: d3564ec6ea484e25fd09f8e3b135bdbfb31c02ed64e74f5f3f269c38fbd58ab9f2c0d63cf9387cedd7eb10549832d583819f2caf09e9d2c2b3316da1c31243e4
7
- data.tar.gz: 6a88a1820ab2bf23a0dacab2d54d5949de9bee18d1ba2bda86ce67948dfec97f2dee8dd3195a01033199d2e0b5c28b8e530c7d493aaafa910d2781af82371ee8
6
+ metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
7
+ data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'burisu-docsplit'
3
- s.version = '0.7.7' # Keep version in sync with docsplit.rb
3
+ s.version = '0.7.8' # Keep version in sync with docsplit.rb
4
4
  s.homepage = "http://documentcloud.github.com/docsplit/"
5
5
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
6
6
  s.description = <<-EOS
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.5' # Keep in sync with gemspec.
8
+ VERSION = '0.7.6' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
18
18
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
19
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
20
20
 
21
21
  # Check for all dependencies, and note their absence.
22
22
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
29
29
  end
30
30
  end
31
31
 
32
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
32
+ # If tesseract is found, check for the osd plugin so that we can do orientation-independent OCR.
33
+ if DEPENDENCIES[:tesseract]
34
+ # osd will be listed in the output of `tesseract --list-langs`
35
+ val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
36
+ DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
+ end
38
+
39
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
33
40
  # broke.
34
41
  class ExtractionFailed < StandardError; end
35
42
 
@@ -96,7 +96,9 @@ Options:
96
96
  end
97
97
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
98
  @options[:language] = l
99
- @options[:clean] = false
99
+ end
100
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
101
+ @options[:detect_orientation] = false
100
102
  end
101
103
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
102
104
  @options[:rolling] = true
@@ -61,6 +61,10 @@ module Docsplit
61
61
  /usr/lib64/openoffice
62
62
  /opt/openoffice.org3
63
63
  /app/vendor/libreoffice
64
+ /usr/bin/libreoffice
65
+ /usr/local/bin
66
+ /usr/lib64/libreoffice
67
+ /usr/lib64/openoffice.org3
64
68
  )
65
69
  end
66
70
  search_paths
@@ -60,13 +60,14 @@ module Docsplit
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
62
  escaped_pdf = ESCAPE[pdf]
63
+ psm = @detect_orientation ? "-psm 1" : ""
63
64
  if pages
64
65
  pages.each do |page|
65
66
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
67
  escaped_tiff = ESCAPE[tiff]
67
68
  file = "#{base_path}_#{page}"
68
69
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
70
71
  clean_text(file + '.txt') if @clean_ocr
71
72
  FileUtils.remove_entry_secure tiff
72
73
  end
@@ -74,7 +75,8 @@ module Docsplit
74
75
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
76
  escaped_tiff = ESCAPE[tiff]
76
77
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
+ # If the user disables orientation detection or the osd plugin is not installed, psm is empty and no -psm flag is passed.
79
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
78
80
  clean_text(base_path + '.txt') if @clean_ocr
79
81
  end
80
82
  ensure
@@ -117,14 +119,15 @@ module Docsplit
117
119
  end
118
120
 
119
121
  def extract_options(options)
120
- @output = options[:output] || '.'
121
- @pages = options[:pages]
122
- @force_ocr = options[:ocr] == true
123
- @forbid_ocr = options[:ocr] == false
124
- @clean_ocr = !(options[:clean] == false)
125
- @language = options[:language] || 'eng'
122
+ @output = options[:output] || '.'
123
+ @pages = options[:pages]
124
+ @force_ocr = options[:ocr] == true
125
+ @forbid_ocr = options[:ocr] == false
126
+ @language = options[:language] || 'eng'
127
+ @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
+ @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
126
129
  end
127
130
 
128
131
  end
129
132
 
130
- end
133
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: burisu-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.7
4
+ version: 0.7.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-10-18 00:00:00.000000000 Z
13
+ date: 2015-06-26 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart
@@ -66,8 +66,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
66
66
  version: '0'
67
67
  requirements: []
68
68
  rubyforge_project:
69
- rubygems_version: 2.2.2
69
+ rubygems_version: 2.4.5
70
70
  signing_key:
71
71
  specification_version: 4
72
72
  summary: Break Apart Documents into Images, Text, Pages and PDFs
73
73
  test_files: []
74
+ has_rdoc: