docsplit 0.7.2 → 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +2 -1
- data/docsplit.gemspec +3 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/command_line.rb +1 -0
- data/lib/docsplit/info_extractor.rb +1 -1
- data/lib/docsplit/pdf_extractor.rb +9 -2
- data/lib/docsplit/text_cleaner.rb +7 -2
- metadata +31 -29
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA512:
|
3
|
+
data.tar.gz: e50c360ba40c1aa51cd563d9a08d0c5a444b50c4e97635133aec13f17fcc4e541e20d2bb9867ca316486df9a501852ed482801bed26986e88dbc472bd2609d65
|
4
|
+
metadata.gz: 5da217d234a3b390963f2c6259ab859ecf805d71c60138c335eebdd9ae4c166cce63e825cee2e10c7e3029c47e4530e53b2d84dbdc91ab222c0a98c6f342d32a
|
5
|
+
SHA1:
|
6
|
+
data.tar.gz: 03c2ec2439773679080a055463d35e7004d1decf
|
7
|
+
metadata.gz: ddd8d4475b4a34fcfd2abb013d2824295718aa8c
|
data/LICENSE
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
|
2
2
|
|
3
|
-
Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
|
3
|
+
Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
|
4
|
+
Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
4
5
|
|
5
6
|
Permission is hereby granted, free of charge, to any person
|
6
7
|
obtaining a copy of this software and associated documentation
|
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.7.2'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.7.3' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2014-02-16'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
|
16
16
|
s.email = 'opensource@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
|
+
s.license = 'MIT'
|
18
19
|
|
19
20
|
s.require_paths = ['lib']
|
20
21
|
s.executables = ['docsplit']
|
data/lib/docsplit.rb
CHANGED
@@ -96,6 +96,7 @@ Options:
|
|
96
96
|
end
|
97
97
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
98
|
@options[:language] = l
|
99
|
+
@options[:clean] = false
|
99
100
|
end
|
100
101
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
101
102
|
@options[:rolling] = true
|
@@ -27,7 +27,7 @@ module Docsplit
|
|
27
27
|
raise ExtractionFailed, result if $? != 0
|
28
28
|
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
29
|
if String.method_defined?(:encode)
|
30
|
-
result.encode!('UTF-8', '
|
30
|
+
result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
|
31
31
|
else
|
32
32
|
require 'iconv' unless defined?(Iconv)
|
33
33
|
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
@@ -19,7 +19,12 @@ module Docsplit
|
|
19
19
|
# The first line of the help output holds the name and version number
|
20
20
|
# of the office software to be used for extraction.
|
21
21
|
def version_string
|
22
|
-
|
22
|
+
versionstr = `#{office_executable} -h 2>&1`.split("\n").first
|
23
|
+
if !!versionstr.match(/[0-9]*/)
|
24
|
+
versionstr = `#{office_executable} --version`.split("\n").first
|
25
|
+
end
|
26
|
+
@@help ||= versionstr
|
27
|
+
|
23
28
|
end
|
24
29
|
def libre_office?
|
25
30
|
!!version_string.match(/^LibreOffice/)
|
@@ -37,7 +42,7 @@ module Docsplit
|
|
37
42
|
if windows?
|
38
43
|
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
39
44
|
program_files_path = ENV["CommonProgramFiles"]
|
40
|
-
search_paths =
|
45
|
+
search_paths = office_names.map{ |program| File.join(program_files_path, program) }
|
41
46
|
elsif osx?
|
42
47
|
search_paths = %w(
|
43
48
|
/Applications/LibreOffice.app/Contents
|
@@ -46,8 +51,10 @@ module Docsplit
|
|
46
51
|
else # probably linux/unix
|
47
52
|
search_paths = %w(
|
48
53
|
/usr/lib/libreoffice
|
54
|
+
/usr/lib64/libreoffice
|
49
55
|
/opt/libreoffice
|
50
56
|
/usr/lib/openoffice
|
57
|
+
/usr/lib64/openoffice
|
51
58
|
/opt/openoffice.org3
|
52
59
|
)
|
53
60
|
end
|
@@ -35,8 +35,13 @@ module Docsplit
|
|
35
35
|
# For the time being, `clean` uses the regular StringScanner, and not the
|
36
36
|
# multibyte-aware version, coercing to ASCII first.
|
37
37
|
def clean(text)
|
38
|
-
|
39
|
-
|
38
|
+
if String.method_defined?(:encode)
|
39
|
+
text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
|
40
|
+
else
|
41
|
+
require 'iconv' unless defined?(Iconv)
|
42
|
+
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
43
|
+
end
|
44
|
+
|
40
45
|
scanner = StringScanner.new(text)
|
41
46
|
cleaned = []
|
42
47
|
spaced = false
|
metadata
CHANGED
@@ -1,28 +1,28 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.2
|
5
|
-
prerelease:
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.3
|
6
5
|
platform: ruby
|
7
|
-
authors:
|
6
|
+
authors:
|
8
7
|
- Jeremy Ashkenas
|
9
8
|
- Samuel Clay
|
10
9
|
- Ted Han
|
11
10
|
autorequire:
|
12
11
|
bindir: bin
|
13
12
|
cert_chain: []
|
14
|
-
|
13
|
+
|
14
|
+
date: 2014-02-16 00:00:00 Z
|
15
15
|
dependencies: []
|
16
|
-
|
17
|
-
|
18
|
-
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|
19
|
-
(title, author, number of pages...)\n"
|
16
|
+
|
17
|
+
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
20
18
|
email: opensource@documentcloud.org
|
21
|
-
executables:
|
19
|
+
executables:
|
22
20
|
- docsplit
|
23
21
|
extensions: []
|
22
|
+
|
24
23
|
extra_rdoc_files: []
|
25
|
-
|
24
|
+
|
25
|
+
files:
|
26
26
|
- lib/docsplit/command_line.rb
|
27
27
|
- lib/docsplit/image_extractor.rb
|
28
28
|
- lib/docsplit/info_extractor.rb
|
@@ -47,28 +47,30 @@ files:
|
|
47
47
|
- LICENSE
|
48
48
|
- README
|
49
49
|
homepage: http://documentcloud.github.com/docsplit/
|
50
|
-
licenses:
|
50
|
+
licenses:
|
51
|
+
- MIT
|
52
|
+
metadata: {}
|
53
|
+
|
51
54
|
post_install_message:
|
52
55
|
rdoc_options: []
|
53
|
-
|
56
|
+
|
57
|
+
require_paths:
|
54
58
|
- lib
|
55
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version:
|
61
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
-
|
63
|
-
|
64
|
-
- - ! '>='
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: '0'
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- &id001
|
62
|
+
- ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- *id001
|
67
68
|
requirements: []
|
69
|
+
|
68
70
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version:
|
71
|
+
rubygems_version: 2.0.13
|
70
72
|
signing_key:
|
71
|
-
specification_version:
|
73
|
+
specification_version: 4
|
72
74
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
73
75
|
test_files: []
|
74
|
-
|
76
|
+
|