docsplit 0.7.4 → 0.7.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -7
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/pdf_extractor.rb +2 -0
- data/lib/docsplit/transparent_pdfs.rb +7 -4
- metadata +30 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
5
|
-
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ebdecba4d9b5b3a19244e08a2f3bcbaff8d8fab1
|
4
|
+
data.tar.gz: adb719d204a184313c1d282c837a7d1c929977ff
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8e58b660472bbff77ce500e50007c1dcdeec9adce59d527d9e331b42bde76d4ff9f2df9aece56e3793eef9d49795aafeddc8a7dfc9bd56a4fa07ca69fa080ca2
|
7
|
+
data.tar.gz: 1ca0706ec0b4eca050c9a1d0a45858365f5bde18b639c3ff641278dc777a1edf17128585f264b2a128e36a5388d4b2c7b9386e8f0b1993e850bc995d625c731b
|
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.7.4' # Keep version in sync with docsplit.rb
|
4
|
-
s.date = '2014-02-16'
|
3
|
+
s.version = '0.7.5' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2014-05-28'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -52,6 +52,7 @@ module Docsplit
|
|
52
52
|
/Applications/OpenOffice.org.app/Contents
|
53
53
|
)
|
54
54
|
else # probably linux/unix
|
55
|
+
# heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
|
55
56
|
search_paths = %w(
|
56
57
|
/usr/lib/libreoffice
|
57
58
|
/usr/lib64/libreoffice
|
@@ -59,6 +60,7 @@ module Docsplit
|
|
59
60
|
/usr/lib/openoffice
|
60
61
|
/usr/lib64/openoffice
|
61
62
|
/opt/openoffice.org3
|
63
|
+
/app/vendor/libreoffice
|
62
64
|
)
|
63
65
|
end
|
64
66
|
search_paths
|
@@ -8,19 +8,22 @@ module Docsplit
|
|
8
8
|
# through further extraction.
|
9
9
|
def ensure_pdfs(docs)
|
10
10
|
[docs].flatten.map do |doc|
|
11
|
-
|
12
|
-
if ext.downcase == '.pdf'
|
11
|
+
if is_pdf?(doc)
|
13
12
|
doc
|
14
13
|
else
|
15
14
|
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
15
|
extract_pdf([doc], {:output => tempdir})
|
17
|
-
File.join(tempdir, File.basename(doc, ext) + '.pdf')
|
16
|
+
File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
|
18
17
|
end
|
19
18
|
end
|
20
19
|
end
|
21
20
|
|
21
|
+
def is_pdf?(doc)
|
22
|
+
File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
|
23
|
+
end
|
24
|
+
|
22
25
|
end
|
23
26
|
|
24
27
|
extend TransparentPDFs
|
25
28
|
|
26
|
-
end
|
29
|
+
end
|
metadata
CHANGED
@@ -1,28 +1,33 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.5
|
5
5
|
platform: ruby
|
6
|
-
authors:
|
6
|
+
authors:
|
7
7
|
- Jeremy Ashkenas
|
8
8
|
- Samuel Clay
|
9
9
|
- Ted Han
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
|
14
|
-
date: 2014-02-16 00:00:00 Z
|
13
|
+
date: 2014-05-28 00:00:00.000000000 Z
|
15
14
|
dependencies: []
|
16
|
-
|
17
|
-
|
15
|
+
description: |2
|
16
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
17
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
18
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
19
|
+
metadata (title, author, number of pages...)
|
18
20
|
email: opensource@documentcloud.org
|
19
|
-
executables:
|
21
|
+
executables:
|
20
22
|
- docsplit
|
21
23
|
extensions: []
|
22
|
-
|
23
24
|
extra_rdoc_files: []
|
24
|
-
|
25
|
-
|
25
|
+
files:
|
26
|
+
- LICENSE
|
27
|
+
- README
|
28
|
+
- bin/docsplit
|
29
|
+
- docsplit.gemspec
|
30
|
+
- lib/docsplit.rb
|
26
31
|
- lib/docsplit/command_line.rb
|
27
32
|
- lib/docsplit/image_extractor.rb
|
28
33
|
- lib/docsplit/info_extractor.rb
|
@@ -31,8 +36,6 @@ files:
|
|
31
36
|
- lib/docsplit/text_cleaner.rb
|
32
37
|
- lib/docsplit/text_extractor.rb
|
33
38
|
- lib/docsplit/transparent_pdfs.rb
|
34
|
-
- lib/docsplit.rb
|
35
|
-
- bin/docsplit
|
36
39
|
- vendor/conf/document-formats.js
|
37
40
|
- vendor/jodconverter/commons-cli-1.1.jar
|
38
41
|
- vendor/jodconverter/commons-io-1.4.jar
|
@@ -43,34 +46,28 @@ files:
|
|
43
46
|
- vendor/jodconverter/ridl-3.2.1.jar
|
44
47
|
- vendor/jodconverter/unoil-3.2.1.jar
|
45
48
|
- vendor/logging.properties
|
46
|
-
- docsplit.gemspec
|
47
|
-
- LICENSE
|
48
|
-
- README
|
49
49
|
homepage: http://documentcloud.github.com/docsplit/
|
50
|
-
licenses:
|
50
|
+
licenses:
|
51
51
|
- MIT
|
52
52
|
metadata: {}
|
53
|
-
|
54
53
|
post_install_message:
|
55
54
|
rdoc_options: []
|
56
|
-
|
57
|
-
require_paths:
|
55
|
+
require_paths:
|
58
56
|
- lib
|
59
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
-
requirements:
|
61
|
-
-
|
62
|
-
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
68
67
|
requirements: []
|
69
|
-
|
70
68
|
rubyforge_project: docsplit
|
71
|
-
rubygems_version: 2.
|
69
|
+
rubygems_version: 2.2.2
|
72
70
|
signing_key:
|
73
71
|
specification_version: 4
|
74
72
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
75
73
|
test_files: []
|
76
|
-
|