luccasmaso-docsplit 0.7.4.1 → 0.7.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5c348a5547895d1dadbbd0330346bf1e295677a4
4
- data.tar.gz: 5f90fad7ad4849b9b85854a4a50e8450b13b1b9e
3
+ metadata.gz: 503116ea38655488e9a9f29ca17b2be29a3585c0
4
+ data.tar.gz: 26b0bc22c27bda0ef2e711a7cb0ecd46852ee5f7
5
5
  SHA512:
6
- metadata.gz: 90186b801914fe20ab18f1ab2c8bc5669a9933745e312f0a326d185c0060f4d7cc2d2b65194a8b9f1fe4db39495041a63222d6b2ead650ae991fd50124ea503c
7
- data.tar.gz: eacf34acea2bfc17da2c0d84bdc2a696d9a32e22ca29326727a07cad422f82de6e11f98646d9b362d6cfb8f2dd248d96c091a965b048fd5e4f886ba5e3c7da4f
6
+ metadata.gz: 490f30ec2b1410e026c30a8fafc6676d5e117e33f4c326b11875a209d98df62964fca4ae1bf28397319408df425184c3b0d6b0fc9f8f98586545ae83df040cfa
7
+ data.tar.gz: 0dbac40654167ff244b0dea9552219537f4a0a8947d5587bd2fdb206d0d85916f0421ab19fc50d81ccaaeeb9c5b241dda3bee4bd7d74b52e8d15e0528a0222a6
@@ -1,8 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
- s.name = 'luccasmaso-docsplit'
3
- s.version = '0.7.4.1' # Keep version in sync with docsplit.rb
4
- s.date = '2014-02-16'
5
-
2
+ s.name = 'luccasmaso-docsplit'
3
+ s.version = '0.7.4.2' # Keep version in sync with docsplit.rb
4
+ s.date = '2014-11-17'
6
5
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
6
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
8
7
  s.description = <<-EOS
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.4' # Keep in sync with gemspec.
8
+ VERSION = '0.7.6' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
18
18
 
19
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
19
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
20
20
 
21
21
  # Check for all dependencies, and note their absence.
22
22
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
29
29
  end
30
30
  end
31
31
 
32
- # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
32
+ # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
33
+ if DEPENDENCIES[:tesseract]
34
+ # osd will be listed in the output of tesseract --list-langs
35
+ val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
36
+ DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
37
+ end
38
+
39
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
33
40
  # broke.
34
41
  class ExtractionFailed < StandardError; end
35
42
 
@@ -99,7 +99,9 @@ Options:
99
99
  end
100
100
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
101
101
  @options[:language] = l
102
- @options[:clean] = false
102
+ end
103
+ opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
104
+ @options[:detect_orientation] = false
103
105
  end
104
106
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
105
107
  @options[:rolling] = true
@@ -9,13 +9,13 @@ module Docsplit
9
9
  extract_options opts
10
10
  [pdfs].flatten.each do |pdf|
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
- page_path = File.join(@output, "#{pdf_name}_%d.pdf")
12
+ page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
14
 
15
15
  cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
16
- "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
16
+ "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17
17
  else
18
- "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
18
+ "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19
19
  end
20
20
  result = `#{cmd}`.chomp
21
21
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
@@ -33,4 +33,4 @@ module Docsplit
33
33
 
34
34
  end
35
35
 
36
- end
36
+ end
@@ -23,7 +23,7 @@ module Docsplit
23
23
  unless @@version_string
24
24
  null = windows? ? "NUL" : "/dev/null"
25
25
  @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
26
- if !!@@version_string.match(/[0-9]*/)
26
+ if !!@@version_string.to_s.match(/[0-9]*/)
27
27
  @@version_string = `#{office_executable} --version`.split("\n").first
28
28
  end
29
29
  end
@@ -61,6 +61,10 @@ module Docsplit
61
61
  /usr/lib64/openoffice
62
62
  /opt/openoffice.org3
63
63
  /app/vendor/libreoffice
64
+ /usr/bin/libreoffice
65
+ /usr/local/bin
66
+ /usr/lib64/libreoffice
67
+ /usr/lib64/openoffice.org3
64
68
  )
65
69
  end
66
70
  search_paths
@@ -60,13 +60,14 @@ module Docsplit
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
62
  escaped_pdf = ESCAPE[pdf]
63
+ psm = @detect_orientation ? "-psm 1" : ""
63
64
  if pages
64
65
  pages.each do |page|
65
66
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
67
  escaped_tiff = ESCAPE[tiff]
67
68
  file = "#{base_path}_#{page}"
68
69
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
70
71
  clean_text(file + '.txt') if @clean_ocr
71
72
  FileUtils.remove_entry_secure tiff
72
73
  end
@@ -74,7 +75,8 @@ module Docsplit
74
75
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
76
  escaped_tiff = ESCAPE[tiff]
76
77
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
+ # if the user disables orientation detection or the osd plugin is not installed, psm is empty and no -psm flag is passed
79
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
78
80
  clean_text(base_path + '.txt') if @clean_ocr
79
81
  end
80
82
  ensure
@@ -117,14 +119,15 @@ module Docsplit
117
119
  end
118
120
 
119
121
  def extract_options(options)
120
- @output = options[:output] || '.'
121
- @pages = options[:pages]
122
- @force_ocr = options[:ocr] == true
123
- @forbid_ocr = options[:ocr] == false
124
- @clean_ocr = !(options[:clean] == false)
125
- @language = options[:language] || 'eng'
122
+ @output = options[:output] || '.'
123
+ @pages = options[:pages]
124
+ @force_ocr = options[:ocr] == true
125
+ @forbid_ocr = options[:ocr] == false
126
+ @language = options[:language] || 'eng'
127
+ @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128
+ @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
126
129
  end
127
130
 
128
131
  end
129
132
 
130
- end
133
+ end
@@ -8,17 +8,20 @@ module Docsplit
8
8
  # through further extraction.
9
9
  def ensure_pdfs(docs)
10
10
  [docs].flatten.map do |doc|
11
- ext = File.extname(doc)
12
- if ext.downcase == '.pdf' || File.open(doc, &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?$/
11
+ if is_pdf?(doc)
13
12
  doc
14
13
  else
15
14
  tempdir = File.join(Dir.tmpdir, 'docsplit')
16
15
  extract_pdf([doc], {:output => tempdir})
17
- File.join(tempdir, File.basename(doc, ext) + '.pdf')
16
+ File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
18
17
  end
19
18
  end
20
19
  end
21
20
 
21
+ def is_pdf?(doc)
22
+ File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?/
23
+ end
24
+
22
25
  end
23
26
 
24
27
  extend TransparentPDFs
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: luccasmaso-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.4.1
4
+ version: 0.7.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-02-16 00:00:00.000000000 Z
13
+ date: 2014-11-17 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: |2
16
16
  Docsplit is a command-line utility and Ruby library for splitting apart
@@ -66,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
66
66
  version: '0'
67
67
  requirements: []
68
68
  rubyforge_project: docsplit
69
- rubygems_version: 2.1.11
69
+ rubygems_version: 2.0.14
70
70
  signing_key:
71
71
  specification_version: 4
72
72
  summary: Break Apart Documents into Images, Text, Pages and PDFs