docsplit 0.6.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +3 -3
- data/lib/docsplit.rb +10 -38
- data/lib/docsplit/info_extractor.rb +23 -5
- data/lib/docsplit/pdf_extractor.rb +132 -0
- data/lib/docsplit/transparent_pdfs.rb +1 -1
- metadata +25 -39
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.6.4'
|
4
|
-
s.date = '2012-11-12'
|
3
|
+
s.version = '0.7.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2013-02-21'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
|
|
13
13
|
EOS
|
14
14
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
|
16
|
-
s.email = '
|
16
|
+
s.email = 'opensource@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
18
|
|
19
19
|
s.require_paths = ['lib']
|
data/lib/docsplit.rb
CHANGED
@@ -5,24 +5,13 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.6.4'
|
8
|
+
VERSION = '0.7.0' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
12
12
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
13
|
ESCAPED_ROOT = ESCAPE[ROOT]
|
14
14
|
|
15
|
-
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
16
|
-
|
17
|
-
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
18
|
-
|
19
|
-
HEADLESS = "-Djava.awt.headless=true"
|
20
|
-
|
21
|
-
office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
|
22
|
-
office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
|
23
|
-
|
24
|
-
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
|
25
|
-
|
26
15
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
27
16
|
|
28
17
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
@@ -66,20 +55,7 @@ module Docsplit
|
|
66
55
|
# Use JODCConverter to extract the documents as PDFs.
|
67
56
|
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
68
57
|
def self.extract_pdf(docs, opts={})
|
69
|
-
|
70
|
-
FileUtils.mkdir_p out unless File.exists?(out)
|
71
|
-
[docs].flatten.each do |doc|
|
72
|
-
ext = File.extname(doc)
|
73
|
-
basename = File.basename(doc, ext)
|
74
|
-
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
75
|
-
|
76
|
-
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
|
77
|
-
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
78
|
-
else
|
79
|
-
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
80
|
-
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
81
|
-
end
|
82
|
-
end
|
58
|
+
PdfExtractor.new.extract(docs, opts)
|
83
59
|
end
|
84
60
|
|
85
61
|
# Define custom methods for each of the metadata keys that we support.
|
@@ -92,30 +68,25 @@ module Docsplit
|
|
92
68
|
end
|
93
69
|
EOS
|
94
70
|
end
|
71
|
+
|
72
|
+
def self.extract_info(pdfs, opts={})
|
73
|
+
pdfs = ensure_pdfs(pdfs)
|
74
|
+
InfoExtractor.new.extract_all(pdfs, opts)
|
75
|
+
end
|
95
76
|
|
96
77
|
# Utility method to clean OCR'd text with garbage characters.
|
97
78
|
def self.clean_text(text)
|
98
79
|
TextCleaner.new.clean(text)
|
99
80
|
end
|
100
81
|
|
101
|
-
|
102
82
|
private
|
103
83
|
|
104
|
-
# Runs a Java command, with quieted logging, and the classpath set properly.
|
105
|
-
def self.run(command, pdfs, opts, return_output=false)
|
106
|
-
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
|
107
|
-
cmd = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
108
|
-
result = `#{cmd}`.chomp
|
109
|
-
raise ExtractionFailed, result if $? != 0
|
110
|
-
return return_output ? (result.empty? ? nil : result) : true
|
111
|
-
end
|
112
|
-
|
113
84
|
# Normalize a value in an options hash for the command line.
|
114
85
|
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
115
86
|
def self.normalize_value(value)
|
116
87
|
case value
|
117
|
-
when Range then
|
118
|
-
when Array then value.map! {|v| v.is_a?(Range) ?
|
88
|
+
when Range then value.to_a.join(',')
|
89
|
+
when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
|
119
90
|
else value.to_s
|
120
91
|
end
|
121
92
|
end
|
@@ -126,5 +97,6 @@ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
|
126
97
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
127
98
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
128
99
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
100
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
|
129
101
|
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
130
102
|
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
@@ -17,16 +17,34 @@ module Docsplit
|
|
17
17
|
|
18
18
|
# Pull out a single datum from a pdf.
|
19
19
|
def extract(key, pdfs, opts)
|
20
|
+
extract_all(pdfs, opts)[key]
|
21
|
+
end
|
22
|
+
|
23
|
+
def extract_all(pdfs, opts)
|
20
24
|
pdf = [pdfs].flatten.first
|
21
25
|
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
|
22
26
|
result = `#{cmd}`.chomp
|
23
27
|
raise ExtractionFailed, result if $? != 0
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
|
+
if String.method_defined?(:encode)
|
30
|
+
result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
31
|
+
else
|
32
|
+
require 'iconv' unless defined?(Iconv)
|
33
|
+
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
34
|
+
result = ic.iconv(result)
|
35
|
+
end
|
36
|
+
info = {}
|
37
|
+
MATCHERS.each do |key, matcher|
|
38
|
+
match = result.match(matcher)
|
39
|
+
answer = match && match[1]
|
40
|
+
if answer
|
41
|
+
answer = answer.to_i if key == :length
|
42
|
+
info[key] = answer
|
43
|
+
end
|
44
|
+
end
|
45
|
+
info
|
28
46
|
end
|
29
47
|
|
30
48
|
end
|
31
49
|
|
32
|
-
end
|
50
|
+
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
|
3
|
+
module Docsplit
|
4
|
+
class PdfExtractor
|
5
|
+
@@executable = nil
|
6
|
+
|
7
|
+
HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
|
8
|
+
def windows?
|
9
|
+
!!HOST_OS.match(/mswin|windows|cygwin/i)
|
10
|
+
end
|
11
|
+
def osx?
|
12
|
+
!!HOST_OS.match(/darwin/i)
|
13
|
+
end
|
14
|
+
def linux?
|
15
|
+
!!HOST_OS.match(/linux/i)
|
16
|
+
end
|
17
|
+
|
18
|
+
def version_string
|
19
|
+
@@help ||= `#{office_executable} -h 2>&1`.split("\n").first
|
20
|
+
end
|
21
|
+
|
22
|
+
def libre_office?
|
23
|
+
!!version_string.match(/^LibreOffice/)
|
24
|
+
end
|
25
|
+
|
26
|
+
def open_office?
|
27
|
+
!!version_string.match(/^OpenOffice.org/)
|
28
|
+
end
|
29
|
+
|
30
|
+
def office_search_paths
|
31
|
+
if windows?
|
32
|
+
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
33
|
+
program_files_path = ENV["CommonProgramFiles"]
|
34
|
+
search_paths = office_name.map{ |program| File.join(program_files_path, program) }
|
35
|
+
elsif osx?
|
36
|
+
search_paths = %w(
|
37
|
+
/Applications/LibreOffice.app/Contents
|
38
|
+
/Applications/OpenOffice.org.app/Contents
|
39
|
+
)
|
40
|
+
else # probably linux/unix
|
41
|
+
search_paths = %w(
|
42
|
+
/usr/lib/libreoffice
|
43
|
+
/opt/libreoffice
|
44
|
+
/usr/lib/openoffice
|
45
|
+
/opt/openoffice.org3
|
46
|
+
)
|
47
|
+
end
|
48
|
+
search_paths
|
49
|
+
end
|
50
|
+
|
51
|
+
def office_executable
|
52
|
+
paths = office_search_paths
|
53
|
+
|
54
|
+
if ENV['OFFICE_PATH']
|
55
|
+
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
|
56
|
+
paths.unshift(ENV['OFFICE_PATH'])
|
57
|
+
end
|
58
|
+
|
59
|
+
path_pieces = ["soffice"]
|
60
|
+
if windows?
|
61
|
+
path_pieces += [["program", "soffice.bin"]]
|
62
|
+
elsif osx?
|
63
|
+
path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
|
64
|
+
else
|
65
|
+
path_pieces += [["program", "soffice"]]
|
66
|
+
end
|
67
|
+
|
68
|
+
paths.each do |path|
|
69
|
+
if File.exists? path
|
70
|
+
@@executable ||= path unless File.directory? path
|
71
|
+
path_pieces.each do |pieces|
|
72
|
+
check_path = File.join(path, pieces)
|
73
|
+
@@executable ||= check_path if File.exists? check_path
|
74
|
+
end
|
75
|
+
end
|
76
|
+
break if @@executable
|
77
|
+
end
|
78
|
+
raise OfficeNotFound, "No office software found" unless @@executable
|
79
|
+
@@executable
|
80
|
+
end
|
81
|
+
|
82
|
+
def office_path
|
83
|
+
File.dirname(File.dirname(office_executable))
|
84
|
+
end
|
85
|
+
|
86
|
+
def extract(docs, opts)
|
87
|
+
out = opts[:output] || '.'
|
88
|
+
FileUtils.mkdir_p out unless File.exists?(out)
|
89
|
+
[docs].flatten.each do |doc|
|
90
|
+
ext = File.extname(doc)
|
91
|
+
basename = File.basename(doc, ext)
|
92
|
+
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
93
|
+
|
94
|
+
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
|
95
|
+
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
96
|
+
else
|
97
|
+
if libre_office?
|
98
|
+
options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
|
99
|
+
cmd = "#{office_executable} #{options} 2>&1"
|
100
|
+
result = `#{cmd}`.chomp
|
101
|
+
raise ExtractionFailed, result if $? != 0
|
102
|
+
true
|
103
|
+
else # open office presumably
|
104
|
+
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
105
|
+
run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
112
|
+
|
113
|
+
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
114
|
+
|
115
|
+
HEADLESS = "-Djava.awt.headless=true"
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# Runs a Java command, with quieted logging, and the classpath set properly.
|
120
|
+
def run_jod(command, pdfs, opts, return_output=false)
|
121
|
+
|
122
|
+
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
|
123
|
+
office = osx? ? "-Doffice.home=#{office_path}" : office_path
|
124
|
+
cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
125
|
+
result = `#{cmd}`.chomp
|
126
|
+
raise ExtractionFailed, result if $? != 0
|
127
|
+
return return_output ? (result.empty? ? nil : result) : true
|
128
|
+
end
|
129
|
+
|
130
|
+
class OfficeNotFound < StandardError; end
|
131
|
+
end
|
132
|
+
end
|
metadata
CHANGED
@@ -1,38 +1,33 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 6
|
9
|
-
- 4
|
10
|
-
version: 0.6.4
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Jeremy Ashkenas
|
14
9
|
- Samuel Clay
|
15
10
|
- Ted Han
|
16
11
|
autorequire:
|
17
12
|
bindir: bin
|
18
13
|
cert_chain: []
|
19
|
-
|
20
|
-
date: 2012-11-12 00:00:00 Z
|
14
|
+
date: 2013-02-21 00:00:00.000000000 Z
|
21
15
|
dependencies: []
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
description: ! " Docsplit is a command-line utility and Ruby library for splitting
|
17
|
+
apart\n documents into their component parts: searchable UTF-8 plain text, page\n
|
18
|
+
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|
19
|
+
(title, author, number of pages...)\n"
|
20
|
+
email: opensource@documentcloud.org
|
21
|
+
executables:
|
26
22
|
- docsplit
|
27
23
|
extensions: []
|
28
|
-
|
29
24
|
extra_rdoc_files: []
|
30
|
-
|
31
|
-
files:
|
25
|
+
files:
|
32
26
|
- lib/docsplit/command_line.rb
|
33
27
|
- lib/docsplit/image_extractor.rb
|
34
28
|
- lib/docsplit/info_extractor.rb
|
35
29
|
- lib/docsplit/page_extractor.rb
|
30
|
+
- lib/docsplit/pdf_extractor.rb
|
36
31
|
- lib/docsplit/text_cleaner.rb
|
37
32
|
- lib/docsplit/text_extractor.rb
|
38
33
|
- lib/docsplit/transparent_pdfs.rb
|
@@ -53,36 +48,27 @@ files:
|
|
53
48
|
- README
|
54
49
|
homepage: http://documentcloud.github.com/docsplit/
|
55
50
|
licenses: []
|
56
|
-
|
57
51
|
post_install_message:
|
58
52
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
53
|
+
require_paths:
|
61
54
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
56
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
|
68
|
-
|
69
|
-
- 0
|
70
|
-
version: "0"
|
71
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
62
|
none: false
|
73
|
-
requirements:
|
74
|
-
- -
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
|
77
|
-
segments:
|
78
|
-
- 0
|
79
|
-
version: "0"
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
80
67
|
requirements: []
|
81
|
-
|
82
68
|
rubyforge_project: docsplit
|
83
69
|
rubygems_version: 1.8.24
|
84
70
|
signing_key:
|
85
71
|
specification_version: 3
|
86
72
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
87
73
|
test_files: []
|
88
|
-
|
74
|
+
has_rdoc:
|