luccasmaso-docsplit 0.7.4.1 → 0.7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docsplit.gemspec +3 -4
- data/lib/docsplit.rb +10 -3
- data/lib/docsplit/command_line.rb +3 -1
- data/lib/docsplit/page_extractor.rb +4 -4
- data/lib/docsplit/pdf_extractor.rb +5 -1
- data/lib/docsplit/text_extractor.rb +12 -9
- data/lib/docsplit/transparent_pdfs.rb +6 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 503116ea38655488e9a9f29ca17b2be29a3585c0
|
4
|
+
data.tar.gz: 26b0bc22c27bda0ef2e711a7cb0ecd46852ee5f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 490f30ec2b1410e026c30a8fafc6676d5e117e33f4c326b11875a209d98df62964fca4ae1bf28397319408df425184c3b0d6b0fc9f8f98586545ae83df040cfa
|
7
|
+
data.tar.gz: 0dbac40654167ff244b0dea9552219537f4a0a8947d5587bd2fdb206d0d85916f0421ab19fc50d81ccaaeeb9c5b241dda3bee4bd7d74b52e8d15e0528a0222a6
|
data/docsplit.gemspec
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
|
-
s.name
|
3
|
-
s.version
|
4
|
-
s.date
|
5
|
-
|
2
|
+
s.name = 'luccasmaso-docsplit'
|
3
|
+
s.version = '0.7.4.2' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2014-11-17'
|
6
5
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
6
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
8
7
|
s.description = <<-EOS
|
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.7.
|
8
|
+
VERSION = '0.7.6' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
@@ -16,7 +16,7 @@ module Docsplit
|
|
16
16
|
|
17
17
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
18
18
|
|
19
|
-
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
|
19
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
|
20
20
|
|
21
21
|
# Check for all dependencies, and note their absence.
|
22
22
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -29,7 +29,14 @@ module Docsplit
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
#
|
32
|
+
# if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
|
33
|
+
if DEPENDENCIES[:tesseract]
|
34
|
+
# osd will be listed in the output of tesseract --list-langs
|
35
|
+
val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
|
36
|
+
DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
|
37
|
+
end
|
38
|
+
|
39
|
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
|
33
40
|
# broke.
|
34
41
|
class ExtractionFailed < StandardError; end
|
35
42
|
|
data/lib/docsplit/command_line.rb
CHANGED
@@ -99,7 +99,9 @@ Options:
|
|
99
99
|
end
|
100
100
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
101
101
|
@options[:language] = l
|
102
|
-
|
102
|
+
end
|
103
|
+
opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
|
104
|
+
@options[:detect_orientation] = false
|
103
105
|
end
|
104
106
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
105
107
|
@options[:rolling] = true
|
data/lib/docsplit/page_extractor.rb
CHANGED
@@ -9,13 +9,13 @@ module Docsplit
|
|
9
9
|
extract_options opts
|
10
10
|
[pdfs].flatten.each do |pdf|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
|
-
page_path = File.join(@output, "#{pdf_name}_%d.pdf"
|
12
|
+
page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
14
|
|
15
15
|
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
|
16
|
-
"pdftailor unstitch --output #{
|
16
|
+
"pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
|
17
17
|
else
|
18
|
-
"pdftk #{ESCAPE[pdf]} burst output #{
|
18
|
+
"pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
|
19
19
|
end
|
20
20
|
result = `#{cmd}`.chomp
|
21
21
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
@@ -33,4 +33,4 @@ module Docsplit
|
|
33
33
|
|
34
34
|
end
|
35
35
|
|
36
|
-
end
|
36
|
+
end
|
data/lib/docsplit/pdf_extractor.rb
CHANGED
@@ -23,7 +23,7 @@ module Docsplit
|
|
23
23
|
unless @@version_string
|
24
24
|
null = windows? ? "NUL" : "/dev/null"
|
25
25
|
@@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
|
26
|
-
if !!@@version_string.match(/[0-9]*/)
|
26
|
+
if !!@@version_string.to_s.match(/[0-9]*/)
|
27
27
|
@@version_string = `#{office_executable} --version`.split("\n").first
|
28
28
|
end
|
29
29
|
end
|
@@ -61,6 +61,10 @@ module Docsplit
|
|
61
61
|
/usr/lib64/openoffice
|
62
62
|
/opt/openoffice.org3
|
63
63
|
/app/vendor/libreoffice
|
64
|
+
/usr/bin/libreoffice
|
65
|
+
/usr/local/bin
|
66
|
+
/usr/lib64/libreoffice
|
67
|
+
/usr/lib64/openoffice.org3
|
64
68
|
)
|
65
69
|
end
|
66
70
|
search_paths
|
data/lib/docsplit/text_extractor.rb
CHANGED
@@ -60,13 +60,14 @@ module Docsplit
|
|
60
60
|
tempdir = Dir.mktmpdir
|
61
61
|
base_path = File.join(@output, @pdf_name)
|
62
62
|
escaped_pdf = ESCAPE[pdf]
|
63
|
+
psm = @detect_orientation ? "-psm 1" : ""
|
63
64
|
if pages
|
64
65
|
pages.each do |page|
|
65
66
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
66
67
|
escaped_tiff = ESCAPE[tiff]
|
67
68
|
file = "#{base_path}_#{page}"
|
68
69
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
69
|
-
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
|
70
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
|
70
71
|
clean_text(file + '.txt') if @clean_ocr
|
71
72
|
FileUtils.remove_entry_secure tiff
|
72
73
|
end
|
@@ -74,7 +75,8 @@ module Docsplit
|
|
74
75
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
75
76
|
escaped_tiff = ESCAPE[tiff]
|
76
77
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
77
|
-
|
78
|
+
# psm is empty (no -psm flag is passed) if the user turned off orientation detection or the osd plugin is not installed
|
79
|
+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
|
78
80
|
clean_text(base_path + '.txt') if @clean_ocr
|
79
81
|
end
|
80
82
|
ensure
|
@@ -117,14 +119,15 @@ module Docsplit
|
|
117
119
|
end
|
118
120
|
|
119
121
|
def extract_options(options)
|
120
|
-
@output
|
121
|
-
@pages
|
122
|
-
@force_ocr
|
123
|
-
@forbid_ocr
|
124
|
-
@
|
125
|
-
@
|
122
|
+
@output = options[:output] || '.'
|
123
|
+
@pages = options[:pages]
|
124
|
+
@force_ocr = options[:ocr] == true
|
125
|
+
@forbid_ocr = options[:ocr] == false
|
126
|
+
@language = options[:language] || 'eng'
|
127
|
+
@clean_ocr = (!(options[:clean] == false) and @language == 'eng')
|
128
|
+
@detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
|
126
129
|
end
|
127
130
|
|
128
131
|
end
|
129
132
|
|
130
|
-
end
|
133
|
+
end
|
data/lib/docsplit/transparent_pdfs.rb
CHANGED
@@ -8,17 +8,20 @@ module Docsplit
|
|
8
8
|
# through further extraction.
|
9
9
|
def ensure_pdfs(docs)
|
10
10
|
[docs].flatten.map do |doc|
|
11
|
-
|
12
|
-
if ext.downcase == '.pdf' || File.open(doc, &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?$/
|
11
|
+
if is_pdf?(doc)
|
13
12
|
doc
|
14
13
|
else
|
15
14
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
15
|
extract_pdf([doc], {:output => tempdir})
|
17
|
-
File.join(tempdir, File.basename(doc,
|
16
|
+
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
18
17
|
end
|
19
18
|
end
|
20
19
|
end
|
21
20
|
|
21
|
+
def is_pdf?(doc)
|
22
|
+
File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?/
|
23
|
+
end
|
24
|
+
|
22
25
|
end
|
23
26
|
|
24
27
|
extend TransparentPDFs
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: luccasmaso-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4.
|
4
|
+
version: 0.7.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-11-17 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: |2
|
16
16
|
Docsplit is a command-line utility and Ruby library for splitting apart
|
@@ -66,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
66
|
version: '0'
|
67
67
|
requirements: []
|
68
68
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version: 2.
|
69
|
+
rubygems_version: 2.0.14
|
70
70
|
signing_key:
|
71
71
|
specification_version: 4
|
72
72
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|