docsplit 0.5.1 → 0.5.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-04-26'
3
+ s.version = '0.5.2' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-05-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
16
16
  s.email = 'jeremy@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
- s.has_rdoc = false
19
18
 
20
19
  s.require_paths = ['lib']
21
20
  s.executables = ['docsplit']
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.1' # Keep in sync with gemspec.
4
+ VERSION = '0.5.2' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -14,6 +14,8 @@ module Docsplit
14
14
  OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
15
15
 
16
16
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
+
18
+ GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
17
19
 
18
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
19
21
 
@@ -53,11 +55,19 @@ module Docsplit
53
55
  end
54
56
 
55
57
  # Use JODConverter to extract the documents as PDFs.
58
+ # If the document is in an image format, use GraphicsMagick to extract the PDF.
56
59
  def self.extract_pdf(docs, opts={})
60
+ out = opts[:output] || '.'
61
+ FileUtils.mkdir_p out unless File.exists?(out)
57
62
  [docs].flatten.each do |doc|
58
- basename = File.basename(doc, File.extname(doc))
59
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
60
- run "#{options} \"#{doc}\" \"#{opts[:output] || '.'}/#{basename}.pdf\"", [], {}
63
+ ext = File.extname(doc)
64
+ basename = File.basename(doc, ext)
65
+ if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
66
+ `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
67
+ else
68
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
69
+ run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
70
+ end
61
71
  end
62
72
  end
63
73
 
@@ -1,4 +1,3 @@
1
- require 'iconv'
2
1
  require 'strscan'
3
2
 
4
3
  module Docsplit
@@ -36,6 +35,7 @@ module Docsplit
36
35
  # For the time being, `clean` uses the regular StringScanner, and not the
37
36
  # multibyte-aware version, coercing to ASCII first.
38
37
  def clean(text)
38
+ require 'iconv' unless defined?(Iconv)
39
39
  text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
40
40
  scanner = StringScanner.new(text)
41
41
  cleaned = []
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-04-26 00:00:00 Z
19
+ date: 2011-05-13 00:00:00 Z
20
20
  dependencies: []
21
21
 
22
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"