docsplit 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-04-26'
3
+ s.version = '0.5.2' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-05-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
16
16
  s.email = 'jeremy@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
- s.has_rdoc = false
19
18
 
20
19
  s.require_paths = ['lib']
21
20
  s.executables = ['docsplit']
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.1' # Keep in sync with gemspec.
4
+ VERSION = '0.5.2' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -14,6 +14,8 @@ module Docsplit
14
14
  OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
15
15
 
16
16
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
+
18
+ GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
17
19
 
18
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
19
21
 
@@ -53,11 +55,19 @@ module Docsplit
53
55
  end
54
56
 
55
57
  # Use JODConverter to extract the documents as PDFs.
58
+ # If the document is in an image format, use GraphicsMagick to extract the PDF.
56
59
  def self.extract_pdf(docs, opts={})
60
+ out = opts[:output] || '.'
61
+ FileUtils.mkdir_p out unless File.exists?(out)
57
62
  [docs].flatten.each do |doc|
58
- basename = File.basename(doc, File.extname(doc))
59
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
60
- run "#{options} \"#{doc}\" \"#{opts[:output] || '.'}/#{basename}.pdf\"", [], {}
63
+ ext = File.extname(doc)
64
+ basename = File.basename(doc, ext)
65
+ if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
66
+ `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
67
+ else
68
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
69
+ run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
70
+ end
61
71
  end
62
72
  end
63
73
 
@@ -1,4 +1,3 @@
1
- require 'iconv'
2
1
  require 'strscan'
3
2
 
4
3
  module Docsplit
@@ -36,6 +35,7 @@ module Docsplit
36
35
  # For the time being, `clean` uses the regular StringScanner, and not the
37
36
  # multibyte-aware version, coercing to ASCII first.
38
37
  def clean(text)
38
+ require 'iconv' unless defined?(Iconv)
39
39
  text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
40
40
  scanner = StringScanner.new(text)
41
41
  cleaned = []
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-04-26 00:00:00 Z
19
+ date: 2011-05-13 00:00:00 Z
20
20
  dependencies: []
21
21
 
22
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"