docsplit 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -3
- data/lib/docsplit.rb +14 -4
- data/lib/docsplit/text_cleaner.rb +1 -1
- metadata +4 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.5.1'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.5.2' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2011-05-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
|
16
16
|
s.email = 'jeremy@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
|
-
s.has_rdoc = false
|
19
18
|
|
20
19
|
s.require_paths = ['lib']
|
21
20
|
s.executables = ['docsplit']
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.5.1'
|
4
|
+
VERSION = '0.5.2' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -14,6 +14,8 @@ module Docsplit
|
|
14
14
|
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
|
15
15
|
|
16
16
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
17
|
+
|
18
|
+
GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
|
17
19
|
|
18
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
19
21
|
|
@@ -53,11 +55,19 @@ module Docsplit
|
|
53
55
|
end
|
54
56
|
|
55
57
|
# Use JODCConverter to extract the documents as PDFs.
|
58
|
+
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
56
59
|
def self.extract_pdf(docs, opts={})
|
60
|
+
out = opts[:output] || '.'
|
61
|
+
FileUtils.mkdir_p out unless File.exists?(out)
|
57
62
|
[docs].flatten.each do |doc|
|
58
|
-
|
59
|
-
|
60
|
-
|
63
|
+
ext = File.extname(doc)
|
64
|
+
basename = File.basename(doc, ext)
|
65
|
+
if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
|
66
|
+
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
|
67
|
+
else
|
68
|
+
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
69
|
+
run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
|
70
|
+
end
|
61
71
|
end
|
62
72
|
end
|
63
73
|
|
data/lib/docsplit/text_cleaner.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'iconv'
|
2
1
|
require 'strscan'
|
3
2
|
|
4
3
|
module Docsplit
|
@@ -36,6 +35,7 @@ module Docsplit
|
|
36
35
|
# For the time being, `clean` uses the regular StringScanner, and not the
|
37
36
|
# multibyte-aware version, coercing to ASCII first.
|
38
37
|
def clean(text)
|
38
|
+
require 'iconv' unless defined?(Iconv)
|
39
39
|
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
40
40
|
scanner = StringScanner.new(text)
|
41
41
|
cleaned = []
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
- 1
|
10
|
-
version: 0.5.1
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date:
|
19
|
+
date: 2011-05-13 00:00:00 Z
|
20
20
|
dependencies: []
|
21
21
|
|
22
22
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|