talentbox-docsplit 0.5.2 → 0.5.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'talentbox-docsplit'
3
- s.version = '0.5.2' # Keep version in sync with docsplit.rb
3
+ s.version = '0.5.3' # Keep version in sync with docsplit.rb
4
4
  s.date = '2011-05-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
23
23
  'docsplit.gemspec', 'LICENSE', 'README']
24
+
25
+ s.add_runtime_dependency("file_wrapper", ["~> 0.4.1"])
24
26
  end
@@ -14,8 +14,8 @@ module Docsplit
14
14
  OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
15
15
 
16
16
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
-
18
- GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
17
+
18
+ GM_MIME_TYPES = ["image/png", "image/gif", "image/jpeg", "image/tiff", "image/x-ms-bmp", "image/x-portable-anymap", "image/x-portable-pixmap", "image/svg+xml", "application/postscript"]
19
19
 
20
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
21
 
@@ -60,9 +60,9 @@ module Docsplit
60
60
  out = opts[:output] || '.'
61
61
  FileUtils.mkdir_p out unless File.exists?(out)
62
62
  [docs].flatten.each do |doc|
63
- ext = File.extname(doc)
64
- basename = File.basename(doc, ext)
65
- if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
63
+ mime = FileWrapper.get_mime(doc)
64
+ basename = File.basename(doc, File.extname(doc))
65
+ if GM_MIME_TYPES.include?(mime)
66
66
  `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
67
67
  else
68
68
  options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
@@ -108,11 +108,11 @@ module Docsplit
108
108
  else value.to_s
109
109
  end
110
110
  end
111
-
112
111
  end
113
112
 
114
113
  require 'tmpdir'
115
114
  require 'fileutils'
115
+ require 'file_wrapper'
116
116
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
117
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
118
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -8,13 +8,13 @@ module Docsplit
8
8
  # through further extraction.
9
9
  def ensure_pdfs(docs)
10
10
  [docs].flatten.map do |doc|
11
- ext = File.extname(doc)
12
- if ext.downcase == '.pdf'
11
+ mime = FileWrapper.get_mime(doc)
12
+ if mime == "application/pdf"
13
13
  doc
14
14
  else
15
15
  tempdir = File.join(Dir.tmpdir, 'docsplit')
16
16
  extract_pdf([doc], {:output => tempdir})
17
- File.join(tempdir, File.basename(doc, ext) + '.pdf')
17
+ File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
18
18
  end
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: talentbox-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,7 +12,18 @@ bindir: bin
12
12
  cert_chain: []
13
13
  date: 2011-05-13 00:00:00.000000000 +02:00
14
14
  default_executable:
15
- dependencies: []
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: file_wrapper
18
+ requirement: &2152606320 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 0.4.1
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: *2152606320
16
27
  description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
28
  apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
29
  \ images or thumbnails in any format, PDFs, single pages, and document\n metadata