docsplit 0.7.5 → 0.7.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ebdecba4d9b5b3a19244e08a2f3bcbaff8d8fab1
4
- data.tar.gz: adb719d204a184313c1d282c837a7d1c929977ff
3
+ metadata.gz: 6c4106dcd5d8d9f8f6a1915a99a438b293154e1e
4
+ data.tar.gz: 90450ce6412bbedb022f4bc68ec7171f47b5d829
5
5
  SHA512:
6
- metadata.gz: 8e58b660472bbff77ce500e50007c1dcdeec9adce59d527d9e331b42bde76d4ff9f2df9aece56e3793eef9d49795aafeddc8a7dfc9bd56a4fa07ca69fa080ca2
7
- data.tar.gz: 1ca0706ec0b4eca050c9a1d0a45858365f5bde18b639c3ff641278dc777a1edf17128585f264b2a128e36a5388d4b2c7b9386e8f0b1993e850bc995d625c731b
6
+ metadata.gz: 1f6ccf476687ce1bf3a5559f07d0f7d8ebd2a80034b102b3058f538fb962a3b537b8e3eaeb245df27f14a4dc70716b69e34599bb50edf3e99e7b8a7b3f38d98d
7
+ data.tar.gz: 912d974bc4ed17942d32a932232439cd2df6903d6d20e72af31e0e80a1c70fc5e58d4be63bd00f245c53be90dc93a815ffd41a25268072367a1a244a5cb59ec4
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.5' # Keep version in sync with docsplit.rb
4
- s.date = '2014-05-28'
3
+ s.version = '0.7.6' # Keep version in sync with docsplit.rb
4
+ s.date = '2014-11-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.5' # Keep in sync with gemspec.
8
+ VERSION = '0.7.6' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
18
18
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
19
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
20
20
 
21
21
  # Check for all dependencies, and note their absence.
22
22
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
29
29
  end
30
30
  end
31
31
 
32
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
32
+ # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
33
+ if DEPENDENCIES[:tesseract]
34
+ # osd will be listed in the output of `tesseract --list-langs`
35
+ val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
36
+ DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
+ end
38
+
39
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
33
40
  # broke.
34
41
  class ExtractionFailed < StandardError; end
35
42
 
@@ -96,7 +96,9 @@ Options:
96
96
  end
97
97
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
98
  @options[:language] = l
99
- @options[:clean] = false
99
+ end
100
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
101
+ @options[:detect_orientation] = false
100
102
  end
101
103
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
102
104
  @options[:rolling] = true
@@ -9,13 +9,13 @@ module Docsplit
9
9
  extract_options opts
10
10
  [pdfs].flatten.each do |pdf|
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
- page_path = File.join(@output, "#{pdf_name}_%d.pdf")
12
+ page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
14
 
15
15
  cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
16
- "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
16
+ "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17
17
  else
18
- "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
18
+ "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19
19
  end
20
20
  result = `#{cmd}`.chomp
21
21
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
@@ -33,4 +33,4 @@ module Docsplit
33
33
 
34
34
  end
35
35
 
36
- end
36
+ end
@@ -23,7 +23,7 @@ module Docsplit
23
23
  unless @@version_string
24
24
  null = windows? ? "NUL" : "/dev/null"
25
25
  @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
26
- if !!@@version_string.match(/[0-9]*/)
26
+ if !!@@version_string.to_s.match(/[0-9]*/)
27
27
  @@version_string = `#{office_executable} --version`.split("\n").first
28
28
  end
29
29
  end
@@ -61,6 +61,10 @@ module Docsplit
61
61
  /usr/lib64/openoffice
62
62
  /opt/openoffice.org3
63
63
  /app/vendor/libreoffice
64
+ /usr/bin/libreoffice
65
+ /usr/local/bin
66
+ /usr/lib64/libreoffice
67
+ /usr/lib64/openoffice.org3
64
68
  )
65
69
  end
66
70
  search_paths
@@ -60,13 +60,14 @@ module Docsplit
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
62
  escaped_pdf = ESCAPE[pdf]
63
+ psm = @detect_orientation ? "-psm 1" : ""
63
64
  if pages
64
65
  pages.each do |page|
65
66
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
67
  escaped_tiff = ESCAPE[tiff]
67
68
  file = "#{base_path}_#{page}"
68
69
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
70
71
  clean_text(file + '.txt') if @clean_ocr
71
72
  FileUtils.remove_entry_secure tiff
72
73
  end
@@ -74,7 +75,8 @@ module Docsplit
74
75
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
76
  escaped_tiff = ESCAPE[tiff]
76
77
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
+ # if the user disabled orientation detection or the osd plugin is not installed, psm is empty and no -psm flag is passed
79
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
78
80
  clean_text(base_path + '.txt') if @clean_ocr
79
81
  end
80
82
  ensure
@@ -117,14 +119,15 @@ module Docsplit
117
119
  end
118
120
 
119
121
  def extract_options(options)
120
- @output = options[:output] || '.'
121
- @pages = options[:pages]
122
- @force_ocr = options[:ocr] == true
123
- @forbid_ocr = options[:ocr] == false
124
- @clean_ocr = !(options[:clean] == false)
125
- @language = options[:language] || 'eng'
122
+ @output = options[:output] || '.'
123
+ @pages = options[:pages]
124
+ @force_ocr = options[:ocr] == true
125
+ @forbid_ocr = options[:ocr] == false
126
+ @language = options[:language] || 'eng'
127
+ @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
+ @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
126
129
  end
127
130
 
128
131
  end
129
132
 
130
- end
133
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.5
4
+ version: 0.7.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-05-28 00:00:00.000000000 Z
13
+ date: 2014-11-17 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart