docsplit 0.7.5 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ebdecba4d9b5b3a19244e08a2f3bcbaff8d8fab1
4
- data.tar.gz: adb719d204a184313c1d282c837a7d1c929977ff
3
+ metadata.gz: 6c4106dcd5d8d9f8f6a1915a99a438b293154e1e
4
+ data.tar.gz: 90450ce6412bbedb022f4bc68ec7171f47b5d829
5
5
  SHA512:
6
- metadata.gz: 8e58b660472bbff77ce500e50007c1dcdeec9adce59d527d9e331b42bde76d4ff9f2df9aece56e3793eef9d49795aafeddc8a7dfc9bd56a4fa07ca69fa080ca2
7
- data.tar.gz: 1ca0706ec0b4eca050c9a1d0a45858365f5bde18b639c3ff641278dc777a1edf17128585f264b2a128e36a5388d4b2c7b9386e8f0b1993e850bc995d625c731b
6
+ metadata.gz: 1f6ccf476687ce1bf3a5559f07d0f7d8ebd2a80034b102b3058f538fb962a3b537b8e3eaeb245df27f14a4dc70716b69e34599bb50edf3e99e7b8a7b3f38d98d
7
+ data.tar.gz: 912d974bc4ed17942d32a932232439cd2df6903d6d20e72af31e0e80a1c70fc5e58d4be63bd00f245c53be90dc93a815ffd41a25268072367a1a244a5cb59ec4
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.5' # Keep version in sync with docsplit.rb
4
- s.date = '2014-05-28'
3
+ s.version = '0.7.6' # Keep version in sync with docsplit.rb
4
+ s.date = '2014-11-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.5' # Keep in sync with gemspec.
8
+ VERSION = '0.7.6' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
18
18
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
19
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
20
20
 
21
21
  # Check for all dependencies, and note their absence.
22
22
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
29
29
  end
30
30
  end
31
31
 
32
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
32
+ # If tesseract is found, check for the osd plugin so that we can do orientation-independent OCR.
33
+ if DEPENDENCIES[:tesseract]
34
+ # osd will be listed in the output of `tesseract --list-langs`
35
+ val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
36
+ DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
+ end
38
+
39
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
33
40
  # broke.
34
41
  class ExtractionFailed < StandardError; end
35
42
 
@@ -96,7 +96,9 @@ Options:
96
96
  end
97
97
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
98
  @options[:language] = l
99
- @options[:clean] = false
99
+ end
100
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
101
+ @options[:detect_orientation] = false
100
102
  end
101
103
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
102
104
  @options[:rolling] = true
@@ -9,13 +9,13 @@ module Docsplit
9
9
  extract_options opts
10
10
  [pdfs].flatten.each do |pdf|
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
- page_path = File.join(@output, "#{pdf_name}_%d.pdf")
12
+ page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
14
 
15
15
  cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
16
- "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
16
+ "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17
17
  else
18
- "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
18
+ "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19
19
  end
20
20
  result = `#{cmd}`.chomp
21
21
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
@@ -33,4 +33,4 @@ module Docsplit
33
33
 
34
34
  end
35
35
 
36
- end
36
+ end
@@ -23,7 +23,7 @@ module Docsplit
23
23
  unless @@version_string
24
24
  null = windows? ? "NUL" : "/dev/null"
25
25
  @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
26
- if !!@@version_string.match(/[0-9]*/)
26
+ if !!@@version_string.to_s.match(/[0-9]*/)
27
27
  @@version_string = `#{office_executable} --version`.split("\n").first
28
28
  end
29
29
  end
@@ -61,6 +61,10 @@ module Docsplit
61
61
  /usr/lib64/openoffice
62
62
  /opt/openoffice.org3
63
63
  /app/vendor/libreoffice
64
+ /usr/bin/libreoffice
65
+ /usr/local/bin
66
+ /usr/lib64/libreoffice
67
+ /usr/lib64/openoffice.org3
64
68
  )
65
69
  end
66
70
  search_paths
@@ -60,13 +60,14 @@ module Docsplit
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
62
  escaped_pdf = ESCAPE[pdf]
63
+ psm = @detect_orientation ? "-psm 1" : ""
63
64
  if pages
64
65
  pages.each do |page|
65
66
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
67
  escaped_tiff = ESCAPE[tiff]
67
68
  file = "#{base_path}_#{page}"
68
69
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
70
71
  clean_text(file + '.txt') if @clean_ocr
71
72
  FileUtils.remove_entry_secure tiff
72
73
  end
@@ -74,7 +75,8 @@ module Docsplit
74
75
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
76
  escaped_tiff = ESCAPE[tiff]
76
77
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
+ # psm is empty (no -psm flag is passed) when the user disabled orientation detection or the osd plugin is not installed
79
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
78
80
  clean_text(base_path + '.txt') if @clean_ocr
79
81
  end
80
82
  ensure
@@ -117,14 +119,15 @@ module Docsplit
117
119
  end
118
120
 
119
121
  def extract_options(options)
120
- @output = options[:output] || '.'
121
- @pages = options[:pages]
122
- @force_ocr = options[:ocr] == true
123
- @forbid_ocr = options[:ocr] == false
124
- @clean_ocr = !(options[:clean] == false)
125
- @language = options[:language] || 'eng'
122
+ @output = options[:output] || '.'
123
+ @pages = options[:pages]
124
+ @force_ocr = options[:ocr] == true
125
+ @forbid_ocr = options[:ocr] == false
126
+ @language = options[:language] || 'eng'
127
+ @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
+ @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
126
129
  end
127
130
 
128
131
  end
129
132
 
130
- end
133
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.5
4
+ version: 0.7.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-05-28 00:00:00.000000000 Z
13
+ date: 2014-11-17 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart