talentbox-docsplit 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'talentbox-docsplit'
3
- s.version = '0.5.2' # Keep version in sync with docsplit.rb
3
+ s.version = '0.5.3' # Keep version in sync with docsplit.rb
4
4
  s.date = '2011-05-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
23
23
  'docsplit.gemspec', 'LICENSE', 'README']
24
+
25
+ s.add_runtime_dependency("file_wrapper", ["~> 0.4.1"])
24
26
  end
@@ -14,8 +14,8 @@ module Docsplit
14
14
  OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
15
15
 
16
16
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
-
18
- GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
17
+
18
+ GM_MIME_TYPES = ["image/png", "image/gif", "image/jpeg", "image/tiff", "image/x-ms-bmp", "image/x-portable-anymap", "image/x-portable-pixmap", "mage/svg+xml", "application/postscript"]
19
19
 
20
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
21
 
@@ -60,9 +60,9 @@ module Docsplit
60
60
  out = opts[:output] || '.'
61
61
  FileUtils.mkdir_p out unless File.exists?(out)
62
62
  [docs].flatten.each do |doc|
63
- ext = File.extname(doc)
64
- basename = File.basename(doc, ext)
65
- if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
63
+ mime = FileWrapper.get_mime(doc)
64
+ basename = File.basename(doc, File.extname(doc))
65
+ if GM_MIME_TYPES.include?(mime)
66
66
  `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
67
67
  else
68
68
  options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
@@ -108,11 +108,11 @@ module Docsplit
108
108
  else value.to_s
109
109
  end
110
110
  end
111
-
112
111
  end
113
112
 
114
113
  require 'tmpdir'
115
114
  require 'fileutils'
115
+ require 'file_wrapper'
116
116
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
117
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
118
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -8,13 +8,13 @@ module Docsplit
8
8
  # through further extraction.
9
9
  def ensure_pdfs(docs)
10
10
  [docs].flatten.map do |doc|
11
- ext = File.extname(doc)
12
- if ext.downcase == '.pdf'
11
+ mime = FileWrapper.get_mime(doc)
12
+ if mime == "application/pdf"
13
13
  doc
14
14
  else
15
15
  tempdir = File.join(Dir.tmpdir, 'docsplit')
16
16
  extract_pdf([doc], {:output => tempdir})
17
- File.join(tempdir, File.basename(doc, ext) + '.pdf')
17
+ File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
18
18
  end
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: talentbox-docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,7 +12,18 @@ bindir: bin
12
12
  cert_chain: []
13
13
  date: 2011-05-13 00:00:00.000000000 +02:00
14
14
  default_executable:
15
- dependencies: []
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: file_wrapper
18
+ requirement: &2152606320 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 0.4.1
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: *2152606320
16
27
  description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
28
  apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
29
  \ images or thumbnails in any format, PDFs, single pages, and document\n metadata