docsplit 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +12 -10
- data/lib/docsplit/command_line.rb +3 -0
- data/lib/docsplit/image_extractor.rb +1 -1
- data/lib/docsplit/page_extractor.rb +6 -1
- metadata +10 -7
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.6.3'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.6.4' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2012-11-12'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,13 +1,20 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'shellwords'
|
4
|
+
|
1
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
6
|
module Docsplit
|
3
7
|
|
4
|
-
VERSION = '0.6.3'
|
8
|
+
VERSION = '0.6.4' # Keep in sync with gemspec.
|
9
|
+
|
10
|
+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
5
11
|
|
6
12
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
|
+
ESCAPED_ROOT = ESCAPE[ROOT]
|
7
14
|
|
8
|
-
CLASSPATH = "#{
|
15
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
9
16
|
|
10
|
-
LOGGING = "-Djava.util.logging.config.file=#{
|
17
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
11
18
|
|
12
19
|
HEADLESS = "-Djava.awt.headless=true"
|
13
20
|
|
@@ -20,9 +27,7 @@ module Docsplit
|
|
20
27
|
|
21
28
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
22
29
|
|
23
|
-
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
24
|
-
|
25
|
-
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
30
|
+
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
|
26
31
|
|
27
32
|
# Check for all dependencies, and note their absence.
|
28
33
|
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
|
@@ -71,7 +76,7 @@ module Docsplit
|
|
71
76
|
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
|
72
77
|
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
73
78
|
else
|
74
|
-
options = "-jar #{
|
79
|
+
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
75
80
|
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
76
81
|
end
|
77
82
|
end
|
@@ -117,9 +122,6 @@ module Docsplit
|
|
117
122
|
|
118
123
|
end
|
119
124
|
|
120
|
-
require 'tmpdir'
|
121
|
-
require 'fileutils'
|
122
|
-
require 'shellwords'
|
123
125
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
124
126
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
125
127
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
@@ -94,6 +94,9 @@ Options:
|
|
94
94
|
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
|
95
95
|
@options[:clean] = false
|
96
96
|
end
|
97
|
+
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
|
+
@options[:language] = l
|
99
|
+
end
|
97
100
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
98
101
|
@options[:rolling] = true
|
99
102
|
end
|
@@ -42,7 +42,7 @@ module Docsplit
|
|
42
42
|
else
|
43
43
|
page_list(pages).each do |page|
|
44
44
|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
|
45
|
-
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
45
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
|
46
46
|
result = `#{cmd}`.chomp
|
47
47
|
raise ExtractionFailed, result if $? != 0
|
48
48
|
end
|
@@ -11,7 +11,12 @@ module Docsplit
|
|
11
11
|
pdf_name = File.basename(pdf, File.extname(pdf))
|
12
12
|
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
|
13
13
|
FileUtils.mkdir_p @output unless File.exists?(@output)
|
14
|
-
|
14
|
+
|
15
|
+
cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
|
16
|
+
"pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
|
17
|
+
else
|
18
|
+
"pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
|
19
|
+
end
|
15
20
|
result = `#{cmd}`.chomp
|
16
21
|
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
|
17
22
|
raise ExtractionFailed, result if $? != 0
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 15
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 6
|
8
|
-
-
|
9
|
-
version: 0.6.3
|
9
|
+
- 4
|
10
|
+
version: 0.6.4
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jeremy Ashkenas
|
@@ -16,8 +17,7 @@ autorequire:
|
|
16
17
|
bindir: bin
|
17
18
|
cert_chain: []
|
18
19
|
|
19
|
-
date:
|
20
|
-
default_executable:
|
20
|
+
date: 2012-11-12 00:00:00 Z
|
21
21
|
dependencies: []
|
22
22
|
|
23
23
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
@@ -51,7 +51,6 @@ files:
|
|
51
51
|
- docsplit.gemspec
|
52
52
|
- LICENSE
|
53
53
|
- README
|
54
|
-
has_rdoc: true
|
55
54
|
homepage: http://documentcloud.github.com/docsplit/
|
56
55
|
licenses: []
|
57
56
|
|
@@ -61,23 +60,27 @@ rdoc_options: []
|
|
61
60
|
require_paths:
|
62
61
|
- lib
|
63
62
|
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
64
|
requirements:
|
65
65
|
- - ">="
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
+
hash: 3
|
67
68
|
segments:
|
68
69
|
- 0
|
69
70
|
version: "0"
|
70
71
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
71
73
|
requirements:
|
72
74
|
- - ">="
|
73
75
|
- !ruby/object:Gem::Version
|
76
|
+
hash: 3
|
74
77
|
segments:
|
75
78
|
- 0
|
76
79
|
version: "0"
|
77
80
|
requirements: []
|
78
81
|
|
79
82
|
rubyforge_project: docsplit
|
80
|
-
rubygems_version: 1.
|
83
|
+
rubygems_version: 1.8.24
|
81
84
|
signing_key:
|
82
85
|
specification_version: 3
|
83
86
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|