talentbox-docsplit 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +3 -1
- data/lib/docsplit.rb +6 -6
- data/lib/docsplit/transparent_pdfs.rb +3 -3
- metadata +13 -2
data/docsplit.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'talentbox-docsplit'
|
3
|
-
s.version = '0.5.2' # Keep version in sync with docsplit.rb
|
3
|
+
s.version = '0.5.3' # Keep version in sync with docsplit.rb
|
4
4
|
s.date = '2011-05-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
|
|
21
21
|
|
22
22
|
s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
|
23
23
|
'docsplit.gemspec', 'LICENSE', 'README']
|
24
|
+
|
25
|
+
s.add_runtime_dependency("file_wrapper", ["~> 0.4.1"])
|
24
26
|
end
|
data/lib/docsplit.rb
CHANGED
@@ -14,8 +14,8 @@ module Docsplit
|
|
14
14
|
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
|
15
15
|
|
16
16
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
17
|
-
|
18
|
-
|
17
|
+
|
18
|
+
# MIME types converted to PDF directly with `gm convert`; any other type is routed through JODConverter.
GM_MIME_TYPES = ["image/png", "image/gif", "image/jpeg", "image/tiff", "image/x-ms-bmp", "image/x-portable-anymap", "image/x-portable-pixmap", "image/svg+xml", "application/postscript"]
|
19
19
|
|
20
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
21
21
|
|
@@ -60,9 +60,9 @@ module Docsplit
|
|
60
60
|
out = opts[:output] || '.'
|
61
61
|
FileUtils.mkdir_p out unless File.exists?(out)
|
62
62
|
[docs].flatten.each do |doc|
|
63
|
-
|
64
|
-
basename = File.basename(doc,
|
65
|
-
if
|
63
|
+
mime = FileWrapper.get_mime(doc)
|
64
|
+
basename = File.basename(doc, File.extname(doc))
|
65
|
+
if GM_MIME_TYPES.include?(mime)
|
66
66
|
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
|
67
67
|
else
|
68
68
|
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
@@ -108,11 +108,11 @@ module Docsplit
|
|
108
108
|
else value.to_s
|
109
109
|
end
|
110
110
|
end
|
111
|
-
|
112
111
|
end
|
113
112
|
|
114
113
|
require 'tmpdir'
|
115
114
|
require 'fileutils'
|
115
|
+
require 'file_wrapper'
|
116
116
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
117
117
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
118
118
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
data/lib/docsplit/transparent_pdfs.rb
CHANGED
@@ -8,13 +8,13 @@ module Docsplit
|
|
8
8
|
# through further extraction.
|
9
9
|
def ensure_pdfs(docs)
|
10
10
|
[docs].flatten.map do |doc|
|
11
|
-
|
12
|
-
if
|
11
|
+
mime = FileWrapper.get_mime(doc)
|
12
|
+
if mime == "application/pdf"
|
13
13
|
doc
|
14
14
|
else
|
15
15
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
16
|
extract_pdf([doc], {:output => tempdir})
|
17
|
-
File.join(tempdir, File.basename(doc,
|
17
|
+
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: talentbox-docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,7 +12,18 @@ bindir: bin
|
|
12
12
|
cert_chain: []
|
13
13
|
date: 2011-05-13 00:00:00.000000000 +02:00
|
14
14
|
default_executable:
|
15
|
-
dependencies:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: file_wrapper
|
18
|
+
requirement: &2152606320 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.4.1
|
24
|
+
type: :runtime
|
25
|
+
prerelease: false
|
26
|
+
version_requirements: *2152606320
|
16
27
|
description: ! " Docsplit is a command-line utility and Ruby library for splitting
|
17
28
|
apart\n documents into their component parts: searchable UTF-8 plain text, page\n
|
18
29
|
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|