docsplit 0.6.3 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +12 -10
- data/lib/docsplit/command_line.rb +3 -0
- data/lib/docsplit/image_extractor.rb +1 -1
- data/lib/docsplit/page_extractor.rb +6 -1
- metadata +10 -7
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.6.3'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.6.4' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2012-11-12'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,13 +1,20 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'shellwords'
|
4
|
+
|
1
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
6
|
module Docsplit
|
3
7
|
|
4
|
-
VERSION = '0.6.3'
|
8
|
+
VERSION = '0.6.4' # Keep in sync with gemspec.
|
9
|
+
|
10
|
+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
5
11
|
|
6
12
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
|
+
ESCAPED_ROOT = ESCAPE[ROOT]
|
7
14
|
|
8
|
-
CLASSPATH = "#{
|
15
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
9
16
|
|
10
|
-
LOGGING = "-Djava.util.logging.config.file=#{
|
17
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
11
18
|
|
12
19
|
HEADLESS = "-Djava.awt.headless=true"
|
13
20
|
|
@@ -20,9 +27,7 @@ module Docsplit
|
|
20
27
|
|
21
28
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
22
29
|
|
23
|
-
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
24
|
-
|
25
|
-
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
30
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
|
26
31
|
|
27
32
|
# Check for all dependencies, and note their absence.
|
28
33
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -71,7 +76,7 @@ module Docsplit
|
|
71
76
|
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
|
72
77
|
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
73
78
|
else
|
74
|
-
options = "-jar #{
|
79
|
+
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
75
80
|
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
76
81
|
end
|
77
82
|
end
|
@@ -117,9 +122,6 @@ module Docsplit
|
|
117
122
|
|
118
123
|
end
|
119
124
|
|
120
|
-
require 'tmpdir'
|
121
|
-
require 'fileutils'
|
122
|
-
require 'shellwords'
|
123
125
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
124
126
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
125
127
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
data/lib/docsplit/command_line.rb
CHANGED
@@ -94,6 +94,9 @@ Options:
|
|
94
94
|
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
|
95
95
|
@options[:clean] = false
|
96
96
|
end
|
97
|
+
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
|
+
@options[:language] = l
|
99
|
+
end
|
97
100
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
98
101
|
@options[:rolling] = true
|
99
102
|
end
|
data/lib/docsplit/image_extractor.rb
CHANGED
@@ -42,7 +42,7 @@ module Docsplit
|
|
42
42
|
else
|
43
43
|
page_list(pages).each do |page|
|
44
44
|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
|
-
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
45
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
46
46
|
result = `#{cmd}`.chomp
|
47
47
|
raise ExtractionFailed, result if $? != 0
|
48
48
|
end
|
data/lib/docsplit/page_extractor.rb
CHANGED
@@ -11,7 +11,12 @@ module Docsplit
|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
12
|
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
|
-
|
14
|
+
|
15
|
+
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
|
16
|
+
"pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
|
17
|
+
else
|
18
|
+
"pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
|
19
|
+
end
|
15
20
|
result = `#{cmd}`.chomp
|
16
21
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
17
22
|
raise ExtractionFailed, result if $? != 0
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 15
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 6
|
8
|
-
- 3
|
9
|
-
version: 0.6.3
|
9
|
+
- 4
|
10
|
+
version: 0.6.4
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jeremy Ashkenas
|
@@ -16,8 +17,7 @@ autorequire:
|
|
16
17
|
bindir: bin
|
17
18
|
cert_chain: []
|
18
19
|
|
19
|
-
date:
|
20
|
-
default_executable:
|
20
|
+
date: 2012-11-12 00:00:00 Z
|
21
21
|
dependencies: []
|
22
22
|
|
23
23
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
@@ -51,7 +51,6 @@ files:
|
|
51
51
|
- docsplit.gemspec
|
52
52
|
- LICENSE
|
53
53
|
- README
|
54
|
-
has_rdoc: true
|
55
54
|
homepage: http://documentcloud.github.com/docsplit/
|
56
55
|
licenses: []
|
57
56
|
|
@@ -61,23 +60,27 @@ rdoc_options: []
|
|
61
60
|
require_paths:
|
62
61
|
- lib
|
63
62
|
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
64
|
requirements:
|
65
65
|
- - ">="
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
+
hash: 3
|
67
68
|
segments:
|
68
69
|
- 0
|
69
70
|
version: "0"
|
70
71
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
71
73
|
requirements:
|
72
74
|
- - ">="
|
73
75
|
- !ruby/object:Gem::Version
|
76
|
+
hash: 3
|
74
77
|
segments:
|
75
78
|
- 0
|
76
79
|
version: "0"
|
77
80
|
requirements: []
|
78
81
|
|
79
82
|
rubyforge_project: docsplit
|
80
|
-
rubygems_version: 1.
|
83
|
+
rubygems_version: 1.8.24
|
81
84
|
signing_key:
|
82
85
|
specification_version: 3
|
83
86
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|