luccasmaso-docsplit 0.7.4.1 → 0.7.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/docsplit.gemspec +3 -4
- data/lib/docsplit.rb +10 -3
- data/lib/docsplit/command_line.rb +3 -1
- data/lib/docsplit/page_extractor.rb +4 -4
- data/lib/docsplit/pdf_extractor.rb +5 -1
- data/lib/docsplit/text_extractor.rb +12 -9
- data/lib/docsplit/transparent_pdfs.rb +6 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 503116ea38655488e9a9f29ca17b2be29a3585c0
|
4
|
+
data.tar.gz: 26b0bc22c27bda0ef2e711a7cb0ecd46852ee5f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 490f30ec2b1410e026c30a8fafc6676d5e117e33f4c326b11875a209d98df62964fca4ae1bf28397319408df425184c3b0d6b0fc9f8f98586545ae83df040cfa
|
7
|
+
data.tar.gz: 0dbac40654167ff244b0dea9552219537f4a0a8947d5587bd2fdb206d0d85916f0421ab19fc50d81ccaaeeb9c5b241dda3bee4bd7d74b52e8d15e0528a0222a6
|
data/docsplit.gemspec
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
|
-
s.name
|
3
|
-
s.version
|
4
|
-
s.date
|
5
|
-
|
2
|
+
s.name = 'luccasmaso-docsplit'
|
3
|
+
s.version = '0.7.4.2' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2014-11-17'
|
6
5
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
6
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
8
7
|
s.description = <<-EOS
|
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.7.
|
8
|
+
VERSION = '0.7.6' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
@@ -16,7 +16,7 @@ module Docsplit
|
|
16
16
|
|
17
17
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
18
18
|
|
19
|
-
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
|
19
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
|
20
20
|
|
21
21
|
# Check for all dependencies, and note their absence.
|
22
22
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -29,7 +29,14 @@ module Docsplit
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
#
|
32
|
+
# if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
|
33
|
+
if DEPENDENCIES[:tesseract]
|
34
|
+
# osd will be listed in the output of `tesseract --list-langs`
|
35
|
+
val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
|
36
|
+
DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
|
37
|
+
end
|
38
|
+
|
39
|
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
33
40
|
# broke.
|
34
41
|
class ExtractionFailed < StandardError; end
|
35
42
|
|
@@ -99,7 +99,9 @@ Options:
|
|
99
99
|
end
|
100
100
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
101
101
|
@options[:language] = l
|
102
|
-
|
102
|
+
end
|
103
|
+
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
|
104
|
+
@options[:detect_orientation] = false
|
103
105
|
end
|
104
106
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
105
107
|
@options[:rolling] = true
|
@@ -9,13 +9,13 @@ module Docsplit
|
|
9
9
|
extract_options opts
|
10
10
|
[pdfs].flatten.each do |pdf|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
-
page_path = File.join(@output, "#{pdf_name}_%d.pdf"
|
12
|
+
page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
14
|
|
15
15
|
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
|
16
|
-
"pdftailor unstitch --output #{
|
16
|
+
"pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
|
17
17
|
else
|
18
|
-
"pdftk #{ESCAPE[pdf]} burst output #{
|
18
|
+
"pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
|
19
19
|
end
|
20
20
|
result = `#{cmd}`.chomp
|
21
21
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
@@ -33,4 +33,4 @@ module Docsplit
|
|
33
33
|
|
34
34
|
end
|
35
35
|
|
36
|
-
end
|
36
|
+
end
|
@@ -23,7 +23,7 @@ module Docsplit
|
|
23
23
|
unless @@version_string
|
24
24
|
null = windows? ? "NUL" : "/dev/null"
|
25
25
|
@@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
|
26
|
-
if !!@@version_string.match(/[0-9]*/)
|
26
|
+
if !!@@version_string.to_s.match(/[0-9]*/)
|
27
27
|
@@version_string = `#{office_executable} --version`.split("\n").first
|
28
28
|
end
|
29
29
|
end
|
@@ -61,6 +61,10 @@ module Docsplit
|
|
61
61
|
/usr/lib64/openoffice
|
62
62
|
/opt/openoffice.org3
|
63
63
|
/app/vendor/libreoffice
|
64
|
+
/usr/bin/libreoffice
|
65
|
+
/usr/local/bin
|
66
|
+
/usr/lib64/libreoffice
|
67
|
+
/usr/lib64/openoffice.org3
|
64
68
|
)
|
65
69
|
end
|
66
70
|
search_paths
|
@@ -60,13 +60,14 @@ module Docsplit
|
|
60
60
|
tempdir = Dir.mktmpdir
|
61
61
|
base_path = File.join(@output, @pdf_name)
|
62
62
|
escaped_pdf = ESCAPE[pdf]
|
63
|
+
psm = @detect_orientation ? "-psm 1" : ""
|
63
64
|
if pages
|
64
65
|
pages.each do |page|
|
65
66
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
66
67
|
escaped_tiff = ESCAPE[tiff]
|
67
68
|
file = "#{base_path}_#{page}"
|
68
69
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
69
|
-
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
|
70
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
|
70
71
|
clean_text(file + '.txt') if @clean_ocr
|
71
72
|
FileUtils.remove_entry_secure tiff
|
72
73
|
end
|
@@ -74,7 +75,8 @@ module Docsplit
|
|
74
75
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
75
76
|
escaped_tiff = ESCAPE[tiff]
|
76
77
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
77
|
-
|
78
|
+
# psm is "-psm 1" only when orientation detection is enabled and the osd plugin is installed; otherwise it is empty and no -psm flag is passed
|
79
|
+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
|
78
80
|
clean_text(base_path + '.txt') if @clean_ocr
|
79
81
|
end
|
80
82
|
ensure
|
@@ -117,14 +119,15 @@ module Docsplit
|
|
117
119
|
end
|
118
120
|
|
119
121
|
def extract_options(options)
|
120
|
-
@output
|
121
|
-
@pages
|
122
|
-
@force_ocr
|
123
|
-
@forbid_ocr
|
124
|
-
@
|
125
|
-
@
|
122
|
+
@output = options[:output] || '.'
|
123
|
+
@pages = options[:pages]
|
124
|
+
@force_ocr = options[:ocr] == true
|
125
|
+
@forbid_ocr = options[:ocr] == false
|
126
|
+
@language = options[:language] || 'eng'
|
127
|
+
@clean_ocr = (!(options[:clean] == false) and @language == 'eng')
|
128
|
+
@detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
|
126
129
|
end
|
127
130
|
|
128
131
|
end
|
129
132
|
|
130
|
-
end
|
133
|
+
end
|
@@ -8,17 +8,20 @@ module Docsplit
|
|
8
8
|
# through further extraction.
|
9
9
|
def ensure_pdfs(docs)
|
10
10
|
[docs].flatten.map do |doc|
|
11
|
-
|
12
|
-
if ext.downcase == '.pdf' || File.open(doc, &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?$/
|
11
|
+
if is_pdf?(doc)
|
13
12
|
doc
|
14
13
|
else
|
15
14
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
15
|
extract_pdf([doc], {:output => tempdir})
|
17
|
-
File.join(tempdir, File.basename(doc,
|
16
|
+
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
18
17
|
end
|
19
18
|
end
|
20
19
|
end
|
21
20
|
|
21
|
+
def is_pdf?(doc)
|
22
|
+
File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?/
|
23
|
+
end
|
24
|
+
|
22
25
|
end
|
23
26
|
|
24
27
|
extend TransparentPDFs
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: luccasmaso-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4.
|
4
|
+
version: 0.7.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-11-17 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: |2
|
16
16
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
@@ -66,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
66
|
version: '0'
|
67
67
|
requirements: []
|
68
68
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version: 2.
|
69
|
+
rubygems_version: 2.0.14
|
70
70
|
signing_key:
|
71
71
|
specification_version: 4
|
72
72
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|