talentbox-docsplit 0.5.2 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +3 -1
- data/lib/docsplit.rb +6 -6
- data/lib/docsplit/transparent_pdfs.rb +3 -3
- metadata +13 -2
data/docsplit.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'talentbox-docsplit'
|
3
|
-
s.version = '0.5.2'
|
3
|
+
s.version = '0.5.3' # Keep version in sync with docsplit.rb
|
4
4
|
s.date = '2011-05-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
|
|
21
21
|
|
22
22
|
s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
|
23
23
|
'docsplit.gemspec', 'LICENSE', 'README']
|
24
|
+
|
25
|
+
s.add_runtime_dependency("file_wrapper", ["~> 0.4.1"])
|
24
26
|
end
|
data/lib/docsplit.rb
CHANGED
@@ -14,8 +14,8 @@ module Docsplit
|
|
14
14
|
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
|
15
15
|
|
16
16
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
17
|
-
|
18
|
-
|
17
|
+
|
18
|
+
# MIME types that are converted to PDF directly with `gm convert`; any other
# type is handed to the JODConverter branch instead.
# Fixed: the SVG entry was misspelled "mage/svg+xml", so SVG input could
# never match this list.
GM_MIME_TYPES = [
  "image/png",
  "image/gif",
  "image/jpeg",
  "image/tiff",
  "image/x-ms-bmp",
  "image/x-portable-anymap",
  "image/x-portable-pixmap",
  "image/svg+xml",
  "application/postscript"
]
|
19
19
|
|
20
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
21
21
|
|
@@ -60,9 +60,9 @@ module Docsplit
|
|
60
60
|
out = opts[:output] || '.'
|
61
61
|
FileUtils.mkdir_p out unless File.exists?(out)
|
62
62
|
[docs].flatten.each do |doc|
|
63
|
-
|
64
|
-
basename = File.basename(doc,
|
65
|
-
if
|
63
|
+
mime = FileWrapper.get_mime(doc)
|
64
|
+
basename = File.basename(doc, File.extname(doc))
|
65
|
+
if GM_MIME_TYPES.include?(mime)
|
66
66
|
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
|
67
67
|
else
|
68
68
|
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
@@ -108,11 +108,11 @@ module Docsplit
|
|
108
108
|
else value.to_s
|
109
109
|
end
|
110
110
|
end
|
111
|
-
|
112
111
|
end
|
113
112
|
|
114
113
|
require 'tmpdir'
|
115
114
|
require 'fileutils'
|
115
|
+
require 'file_wrapper'
|
116
116
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
117
117
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
118
118
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
@@ -8,13 +8,13 @@ module Docsplit
|
|
8
8
|
# through further extraction.
|
9
9
|
def ensure_pdfs(docs)
|
10
10
|
[docs].flatten.map do |doc|
|
11
|
-
|
12
|
-
if
|
11
|
+
mime = FileWrapper.get_mime(doc)
|
12
|
+
if mime == "application/pdf"
|
13
13
|
doc
|
14
14
|
else
|
15
15
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
16
|
extract_pdf([doc], {:output => tempdir})
|
17
|
-
File.join(tempdir, File.basename(doc,
|
17
|
+
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: talentbox-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,7 +12,18 @@ bindir: bin
|
|
12
12
|
cert_chain: []
|
13
13
|
date: 2011-05-13 00:00:00.000000000 +02:00
|
14
14
|
default_executable:
|
15
|
-
dependencies:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: file_wrapper
|
18
|
+
requirement: &2152606320 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.4.1
|
24
|
+
type: :runtime
|
25
|
+
prerelease: false
|
26
|
+
version_requirements: *2152606320
|
16
27
|
description: ! " Docsplit is a command-line utility and Ruby library for splitting
|
17
28
|
apart\n documents into their component parts: searchable UTF-8 plain text, page\n
|
18
29
|
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|