docsplit 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -3
- data/lib/docsplit.rb +14 -4
- data/lib/docsplit/text_cleaner.rb +1 -1
- metadata +4 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.5.1'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.5.2' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2011-05-13'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
|
16
16
|
s.email = 'jeremy@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
|
-
s.has_rdoc = false
|
19
18
|
|
20
19
|
s.require_paths = ['lib']
|
21
20
|
s.executables = ['docsplit']
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.5.1'
|
4
|
+
VERSION = '0.5.2' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -14,6 +14,8 @@ module Docsplit
|
|
14
14
|
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
|
15
15
|
|
16
16
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
17
|
+
|
18
|
+
GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
|
17
19
|
|
18
20
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
19
21
|
|
@@ -53,11 +55,19 @@ module Docsplit
|
|
53
55
|
end
|
54
56
|
|
55
57
|
# Use JODCConverter to extract the documents as PDFs.
|
58
|
+
# If the document is in an image format, use GraphicsMagick to extract the PDF.
|
56
59
|
def self.extract_pdf(docs, opts={})
|
60
|
+
out = opts[:output] || '.'
|
61
|
+
FileUtils.mkdir_p out unless File.exists?(out)
|
57
62
|
[docs].flatten.each do |doc|
|
58
|
-
|
59
|
-
|
60
|
-
|
63
|
+
ext = File.extname(doc)
|
64
|
+
basename = File.basename(doc, ext)
|
65
|
+
if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
|
66
|
+
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
|
67
|
+
else
|
68
|
+
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
69
|
+
run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
|
70
|
+
end
|
61
71
|
end
|
62
72
|
end
|
63
73
|
|
data/lib/docsplit/text_cleaner.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'iconv'
|
2
1
|
require 'strscan'
|
3
2
|
|
4
3
|
module Docsplit
|
@@ -36,6 +35,7 @@ module Docsplit
|
|
36
35
|
# For the time being, `clean` uses the regular StringScanner, and not the
|
37
36
|
# multibyte-aware version, coercing to ASCII first.
|
38
37
|
def clean(text)
|
38
|
+
require 'iconv' unless defined?(Iconv)
|
39
39
|
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
40
40
|
scanner = StringScanner.new(text)
|
41
41
|
cleaned = []
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
- 1
|
10
|
-
version: 0.5.1
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date:
|
19
|
+
date: 2011-05-13 00:00:00 Z
|
20
20
|
dependencies: []
|
21
21
|
|
22
22
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|