docsplit 0.6.4 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +3 -3
- data/lib/docsplit.rb +10 -38
- data/lib/docsplit/info_extractor.rb +23 -5
- data/lib/docsplit/pdf_extractor.rb +132 -0
- data/lib/docsplit/transparent_pdfs.rb +1 -1
- metadata +25 -39
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.6.4'
|
4
|
-
s.date = '2012-11-12'
|
3
|
+
s.version = '0.7.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2013-02-21'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
|
|
13
13
|
EOS
|
14
14
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
|
16
|
-
s.email = '
|
16
|
+
s.email = 'opensource@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
18
|
|
19
19
|
s.require_paths = ['lib']
|
data/lib/docsplit.rb
CHANGED
@@ -5,24 +5,13 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.6.4'
|
8
|
+
VERSION = '0.7.0' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
12
12
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
13
13
|
ESCAPED_ROOT = ESCAPE[ROOT]
|
14
14
|
|
15
|
-
CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
|
16
|
-
|
17
|
-
LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
|
18
|
-
|
19
|
-
HEADLESS = "-Djava.awt.headless=true"
|
20
|
-
|
21
|
-
office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
|
22
|
-
office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
|
23
|
-
|
24
|
-
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
|
25
|
-
|
26
15
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
27
16
|
|
28
17
|
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
@@ -66,20 +55,7 @@ module Docsplit
|
|
66
55
|
# Use JODCConverter to extract the documents as PDFs.
|
67
56
|
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
68
57
|
def self.extract_pdf(docs, opts={})
|
69
|
-
|
70
|
-
FileUtils.mkdir_p out unless File.exists?(out)
|
71
|
-
[docs].flatten.each do |doc|
|
72
|
-
ext = File.extname(doc)
|
73
|
-
basename = File.basename(doc, ext)
|
74
|
-
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
75
|
-
|
76
|
-
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
|
77
|
-
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
78
|
-
else
|
79
|
-
options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
|
80
|
-
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
81
|
-
end
|
82
|
-
end
|
58
|
+
PdfExtractor.new.extract(docs, opts)
|
83
59
|
end
|
84
60
|
|
85
61
|
# Define custom methods for each of the metadata keys that we support.
|
@@ -92,30 +68,25 @@ module Docsplit
|
|
92
68
|
end
|
93
69
|
EOS
|
94
70
|
end
|
71
|
+
|
72
|
+
# Gather the full set of metadata for the given document(s) in a single call.
# The inputs are first run through ensure_pdfs; the return value is whatever
# InfoExtractor#extract_all produces for them.
def self.extract_info(pdfs, opts={})
  InfoExtractor.new.extract_all(ensure_pdfs(pdfs), opts)
end
|
95
76
|
|
96
77
|
# Utility method to clean OCR'd text with garbage characters.
|
97
78
|
def self.clean_text(text)
  # Delegate to a throwaway TextCleaner; its #clean result is returned as-is.
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
|
100
81
|
|
101
|
-
|
102
82
|
private
|
103
83
|
|
104
|
-
# Runs a Java command, with quieted logging, and the classpath set properly.
|
105
|
-
def self.run(command, pdfs, opts, return_output=false)
|
106
|
-
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
|
107
|
-
cmd = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
|
108
|
-
result = `#{cmd}`.chomp
|
109
|
-
raise ExtractionFailed, result if $? != 0
|
110
|
-
return return_output ? (result.empty? ? nil : result) : true
|
111
|
-
end
|
112
|
-
|
113
84
|
# Normalize a value in an options hash for the command line.
|
114
85
|
# Ranges look like: 1-10, Arrays like: 1,2,3.
|
115
86
|
# Turn an option value into the comma-separated string form used on the
# command line.
#
# * Range  -> every member joined with commas (2..4 becomes "2,3,4")
# * Array  -> elements joined with commas, with any nested Range expanded
# * other  -> value.to_s
#
# The argument is never modified: the Array branch builds a new array with
# map rather than map!, so callers can reuse (or freeze) their options.
def self.normalize_value(value)
  case value
  when Range then value.to_a.join(',')
  when Array then value.map {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
  else value.to_s
  end
end
|
@@ -126,5 +97,6 @@ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
|
126
97
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
127
98
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
128
99
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
100
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
|
129
101
|
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
130
102
|
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
@@ -17,16 +17,34 @@ module Docsplit
|
|
17
17
|
|
18
18
|
# Pull out a single datum from a pdf.
|
19
19
|
def extract(key, pdfs, opts)
  # Run the full extraction, then pick out the one requested entry
  # (nil when extract_all did not record that key).
  info = extract_all(pdfs, opts)
  info[key]
end
|
22
|
+
|
23
|
+
# Run pdfinfo on the first of the given pdfs and return a hash of every
# MATCHERS key whose pattern captured something in the output.
# The :length entry is converted to an Integer; other values stay strings.
# Raises ExtractionFailed (carrying pdfinfo's output) on a non-zero exit.
def extract_all(pdfs, opts)
  first_pdf = [pdfs].flatten.first
  output = `pdfinfo #{ESCAPE[first_pdf]} 2>&1`.chomp
  raise ExtractionFailed, output if $? != 0

  # Sanitise the text as UTF-8: String#encode where available (ruby 1.9),
  # otherwise fall back to iconv (ruby 1.8).
  if String.method_defined?(:encode)
    output.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  else
    require 'iconv' unless defined?(Iconv)
    output = Iconv.new('UTF-8//IGNORE','UTF-8').iconv(output)
  end

  MATCHERS.inject({}) do |info, (key, matcher)|
    found = output.match(matcher)
    captured = found && found[1]
    if captured
      info[key] = (key == :length) ? captured.to_i : captured
    end
    info
  end
end
|
29
47
|
|
30
48
|
end
|
31
49
|
|
32
|
-
end
|
50
|
+
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
|
3
|
+
module Docsplit
|
4
|
+
# Converts documents to PDF. Image formats listed in GM_FORMATS go through
# GraphicsMagick; everything else is handed to an installed office suite:
# LibreOffice is driven directly in headless mode, anything else is assumed
# to be OpenOffice and is driven through the bundled JODConverter jar.
class PdfExtractor
  # Raised by #office_executable when no soffice binary can be located.
  class OfficeNotFound < StandardError; end

  # Class-level caches: the located soffice binary and the first line of its
  # `-h` output. Both are computed at most once per process.
  @@executable = nil
  @@help = nil

  # rbconfig's host_os string. The argument to defined? must be the constant
  # itself: defined?("RbConfig") tests a string literal and is always truthy,
  # which made the Config fallback unreachable.
  HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']

  CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"

  LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"

  HEADLESS = "-Djava.awt.headless=true"

  # True on Windows hosts. `mingw` is included because that is the host_os
  # reported by MinGW-built rubies, which the previous pattern missed.
  def windows?
    !!(HOST_OS =~ /mswin|mingw|windows|cygwin/i)
  end

  # True when host_os mentions darwin.
  def osx?
    !!(HOST_OS =~ /darwin/i)
  end

  # True when host_os mentions linux.
  def linux?
    !!(HOST_OS =~ /linux/i)
  end

  # First line printed by `soffice -h` (stderr included), cached for the
  # process. Always a String: empty output yields "" so that the predicates
  # below never call a method on nil.
  def version_string
    @@help ||= `#{shell_quote(office_executable)} -h 2>&1`.split("\n").first.to_s
  end

  # True when the help banner starts with "LibreOffice".
  def libre_office?
    !!(version_string =~ /^LibreOffice/)
  end

  # True when the help banner starts with "OpenOffice.org".
  def open_office?
    !!(version_string =~ /^OpenOffice\.org/)
  end

  # Directories in which an office installation is looked for, in priority
  # order. Returns a fresh Array on every call (callers may mutate it).
  def office_search_paths
    if windows?
      office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
      # NOTE(review): CommonProgramFiles is kept from the original code;
      # confirm whether ProgramFiles was the intended root.
      program_files_path = ENV["CommonProgramFiles"]
      # File.join(nil, ...) raises TypeError, so offer no candidates when unset.
      return [] unless program_files_path
      office_names.map { |program| File.join(program_files_path, program) }
    elsif osx?
      %w(
        /Applications/LibreOffice.app/Contents
        /Applications/OpenOffice.org.app/Contents
      )
    else # probably linux/unix
      %w(
        /usr/lib/libreoffice
        /opt/libreoffice
        /usr/lib/openoffice
        /opt/openoffice.org3
      )
    end
  end

  # Locate the soffice binary and cache it in @@executable.
  #
  # ENV['OFFICE_PATH'], when set, is tried before the built-in search paths
  # and may name either the binary itself or an installation directory.
  #
  # Raises ArgumentError if OFFICE_PATH is set but does not exist, and
  # OfficeNotFound if nothing usable is found.
  def office_executable
    paths = office_search_paths

    if ENV['OFFICE_PATH']
      raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
      paths.unshift(ENV['OFFICE_PATH'])
    end

    # A previous call already found the binary; skip the filesystem probing.
    return @@executable if @@executable

    # Locations of the binary relative to a search path, per platform.
    path_pieces = ["soffice"]
    if windows?
      path_pieces += [["program", "soffice.bin"]]
    elsif osx?
      path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
    else
      path_pieces += [["program", "soffice"]]
    end

    paths.each do |path|
      next unless File.exist?(path)
      if File.directory?(path)
        candidates = path_pieces.map { |pieces| File.join(path, pieces) }
        @@executable = candidates.find { |candidate| File.exist?(candidate) }
      else
        # The search path points straight at a file: use it as the binary.
        @@executable = path
      end
      break if @@executable
    end

    raise OfficeNotFound, "No office software found" unless @@executable
    @@executable
  end

  # Installation root: two directory levels above the soffice binary.
  def office_path
    File.dirname(File.dirname(office_executable))
  end

  # Convert each of +docs+ (a single path or an Array of paths) to a PDF in
  # opts[:output] (default '.'), creating that directory if needed.
  #
  # Raises ExtractionFailed, with the tool's output as message, when the
  # office conversion exits non-zero. The exit status of `gm convert` is not
  # checked.
  def extract(docs, opts)
    out = opts[:output] || '.'
    FileUtils.mkdir_p out unless File.exist?(out)
    [docs].flatten.each do |doc|
      ext = File.extname(doc)
      basename = File.basename(doc, ext)
      escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

      # Use the mime type reported by `file` to choose the converter.
      if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
        `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
      elsif libre_office?
        options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
        cmd = "#{shell_quote(office_executable)} #{options} 2>&1"
        result = `#{cmd}`.chomp
        raise ExtractionFailed, result unless $?.success?
      else # open office presumably
        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
        run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
      end
    end
  end

  private

  # Runs a Java command, with quieted logging, and the classpath set properly.
  #
  # The office installation is always passed as the office.home system
  # property. Previously, on non-OS X hosts the bare path was placed before
  # -cp, where `java` takes the first non-option argument as the main class.
  #
  # Returns true, or the command output (nil when empty) if +return_output+
  # is set. Raises ExtractionFailed on a non-zero exit status.
  def run_jod(command, pdfs, opts, return_output=false)
    pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
    office = "-Doffice.home=#{shell_quote(office_path)}"
    cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
    result = `#{cmd}`.chomp
    raise ExtractionFailed, result unless $?.success?
    return return_output ? (result.empty? ? nil : result) : true
  end

  # Quote a filesystem path for interpolation into a shell command, so that
  # install locations containing spaces survive. Windows gets plain double
  # quotes; elsewhere the module's ESCAPE lambda is used.
  def shell_quote(path)
    windows? ? "\"#{path}\"" : ESCAPE[path]
  end
end
|
132
|
+
end
|
metadata
CHANGED
@@ -1,38 +1,33 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 6
|
9
|
-
- 4
|
10
|
-
version: 0.6.4
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Jeremy Ashkenas
|
14
9
|
- Samuel Clay
|
15
10
|
- Ted Han
|
16
11
|
autorequire:
|
17
12
|
bindir: bin
|
18
13
|
cert_chain: []
|
19
|
-
|
20
|
-
date: 2012-11-12 00:00:00 Z
|
14
|
+
date: 2013-02-21 00:00:00.000000000 Z
|
21
15
|
dependencies: []
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
description: ! " Docsplit is a command-line utility and Ruby library for splitting
|
17
|
+
apart\n documents into their component parts: searchable UTF-8 plain text, page\n
|
18
|
+
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|
19
|
+
(title, author, number of pages...)\n"
|
20
|
+
email: opensource@documentcloud.org
|
21
|
+
executables:
|
26
22
|
- docsplit
|
27
23
|
extensions: []
|
28
|
-
|
29
24
|
extra_rdoc_files: []
|
30
|
-
|
31
|
-
files:
|
25
|
+
files:
|
32
26
|
- lib/docsplit/command_line.rb
|
33
27
|
- lib/docsplit/image_extractor.rb
|
34
28
|
- lib/docsplit/info_extractor.rb
|
35
29
|
- lib/docsplit/page_extractor.rb
|
30
|
+
- lib/docsplit/pdf_extractor.rb
|
36
31
|
- lib/docsplit/text_cleaner.rb
|
37
32
|
- lib/docsplit/text_extractor.rb
|
38
33
|
- lib/docsplit/transparent_pdfs.rb
|
@@ -53,36 +48,27 @@ files:
|
|
53
48
|
- README
|
54
49
|
homepage: http://documentcloud.github.com/docsplit/
|
55
50
|
licenses: []
|
56
|
-
|
57
51
|
post_install_message:
|
58
52
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
53
|
+
require_paths:
|
61
54
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
56
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
|
68
|
-
|
69
|
-
- 0
|
70
|
-
version: "0"
|
71
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
62
|
none: false
|
73
|
-
requirements:
|
74
|
-
- -
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
|
77
|
-
segments:
|
78
|
-
- 0
|
79
|
-
version: "0"
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
80
67
|
requirements: []
|
81
|
-
|
82
68
|
rubyforge_project: docsplit
|
83
69
|
rubygems_version: 1.8.24
|
84
70
|
signing_key:
|
85
71
|
specification_version: 3
|
86
72
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
87
73
|
test_files: []
|
88
|
-
|
74
|
+
has_rdoc:
|