docsplit 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +2 -1
- data/docsplit.gemspec +3 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/command_line.rb +1 -0
- data/lib/docsplit/info_extractor.rb +1 -1
- data/lib/docsplit/pdf_extractor.rb +9 -2
- data/lib/docsplit/text_cleaner.rb +7 -2
- metadata +31 -29
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA512:
|
3
|
+
data.tar.gz: e50c360ba40c1aa51cd563d9a08d0c5a444b50c4e97635133aec13f17fcc4e541e20d2bb9867ca316486df9a501852ed482801bed26986e88dbc472bd2609d65
|
4
|
+
metadata.gz: 5da217d234a3b390963f2c6259ab859ecf805d71c60138c335eebdd9ae4c166cce63e825cee2e10c7e3029c47e4530e53b2d84dbdc91ab222c0a98c6f342d32a
|
5
|
+
SHA1:
|
6
|
+
data.tar.gz: 03c2ec2439773679080a055463d35e7004d1decf
|
7
|
+
metadata.gz: ddd8d4475b4a34fcfd2abb013d2824295718aa8c
|
data/LICENSE
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
|
2
2
|
|
3
|
-
Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
|
3
|
+
Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
|
4
|
+
Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
4
5
|
|
5
6
|
Permission is hereby granted, free of charge, to any person
|
6
7
|
obtaining a copy of this software and associated documentation
|
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.7.2'
|
4
|
-
s.date = '
|
3
|
+
s.version = '0.7.3' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2014-02-16'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
|
16
16
|
s.email = 'opensource@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
|
+
s.license = 'MIT'
|
18
19
|
|
19
20
|
s.require_paths = ['lib']
|
20
21
|
s.executables = ['docsplit']
|
data/lib/docsplit.rb
CHANGED
@@ -96,6 +96,7 @@ Options:
|
|
96
96
|
end
|
97
97
|
opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
|
98
98
|
@options[:language] = l
|
99
|
+
@options[:clean] = false
|
99
100
|
end
|
100
101
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
101
102
|
@options[:rolling] = true
|
@@ -27,7 +27,7 @@ module Docsplit
|
|
27
27
|
raise ExtractionFailed, result if $? != 0
|
28
28
|
# ruby 1.8 (iconv) and 1.9 (String#encode) :
|
29
29
|
if String.method_defined?(:encode)
|
30
|
-
result.encode!('UTF-8', '
|
30
|
+
result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
|
31
31
|
else
|
32
32
|
require 'iconv' unless defined?(Iconv)
|
33
33
|
ic = Iconv.new('UTF-8//IGNORE','UTF-8')
|
@@ -19,7 +19,12 @@ module Docsplit
|
|
19
19
|
# The first line of the help output holds the name and version number
|
20
20
|
# of the office software to be used for extraction.
|
21
21
|
def version_string
|
22
|
-
|
22
|
+
versionstr = `#{office_executable} -h 2>&1`.split("\n").first
|
23
|
+
if !!versionstr.match(/[0-9]*/)
|
24
|
+
versionstr = `#{office_executable} --version`.split("\n").first
|
25
|
+
end
|
26
|
+
@@help ||= versionstr
|
27
|
+
|
23
28
|
end
|
24
29
|
def libre_office?
|
25
30
|
!!version_string.match(/^LibreOffice/)
|
@@ -37,7 +42,7 @@ module Docsplit
|
|
37
42
|
if windows?
|
38
43
|
office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
|
39
44
|
program_files_path = ENV["CommonProgramFiles"]
|
40
|
-
search_paths =
|
45
|
+
search_paths = office_names.map{ |program| File.join(program_files_path, program) }
|
41
46
|
elsif osx?
|
42
47
|
search_paths = %w(
|
43
48
|
/Applications/LibreOffice.app/Contents
|
@@ -46,8 +51,10 @@ module Docsplit
|
|
46
51
|
else # probably linux/unix
|
47
52
|
search_paths = %w(
|
48
53
|
/usr/lib/libreoffice
|
54
|
+
/usr/lib64/libreoffice
|
49
55
|
/opt/libreoffice
|
50
56
|
/usr/lib/openoffice
|
57
|
+
/usr/lib64/openoffice
|
51
58
|
/opt/openoffice.org3
|
52
59
|
)
|
53
60
|
end
|
@@ -35,8 +35,13 @@ module Docsplit
|
|
35
35
|
# For the time being, `clean` uses the regular StringScanner, and not the
|
36
36
|
# multibyte-aware version, coercing to ASCII first.
|
37
37
|
def clean(text)
|
38
|
-
|
39
|
-
|
38
|
+
if String.method_defined?(:encode)
|
39
|
+
text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
|
40
|
+
else
|
41
|
+
require 'iconv' unless defined?(Iconv)
|
42
|
+
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
|
43
|
+
end
|
44
|
+
|
40
45
|
scanner = StringScanner.new(text)
|
41
46
|
cleaned = []
|
42
47
|
spaced = false
|
metadata
CHANGED
@@ -1,28 +1,28 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.2
|
5
|
-
prerelease:
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.3
|
6
5
|
platform: ruby
|
7
|
-
authors:
|
6
|
+
authors:
|
8
7
|
- Jeremy Ashkenas
|
9
8
|
- Samuel Clay
|
10
9
|
- Ted Han
|
11
10
|
autorequire:
|
12
11
|
bindir: bin
|
13
12
|
cert_chain: []
|
14
|
-
|
13
|
+
|
14
|
+
date: 2014-02-16 00:00:00 Z
|
15
15
|
dependencies: []
|
16
|
-
|
17
|
-
|
18
|
-
\ images or thumbnails in any format, PDFs, single pages, and document\n metadata
|
19
|
-
(title, author, number of pages...)\n"
|
16
|
+
|
17
|
+
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
20
18
|
email: opensource@documentcloud.org
|
21
|
-
executables:
|
19
|
+
executables:
|
22
20
|
- docsplit
|
23
21
|
extensions: []
|
22
|
+
|
24
23
|
extra_rdoc_files: []
|
25
|
-
|
24
|
+
|
25
|
+
files:
|
26
26
|
- lib/docsplit/command_line.rb
|
27
27
|
- lib/docsplit/image_extractor.rb
|
28
28
|
- lib/docsplit/info_extractor.rb
|
@@ -47,28 +47,30 @@ files:
|
|
47
47
|
- LICENSE
|
48
48
|
- README
|
49
49
|
homepage: http://documentcloud.github.com/docsplit/
|
50
|
-
licenses:
|
50
|
+
licenses:
|
51
|
+
- MIT
|
52
|
+
metadata: {}
|
53
|
+
|
51
54
|
post_install_message:
|
52
55
|
rdoc_options: []
|
53
|
-
|
56
|
+
|
57
|
+
require_paths:
|
54
58
|
- lib
|
55
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version:
|
61
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
-
|
63
|
-
|
64
|
-
- - ! '>='
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: '0'
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- &id001
|
62
|
+
- ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- *id001
|
67
68
|
requirements: []
|
69
|
+
|
68
70
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version:
|
71
|
+
rubygems_version: 2.0.13
|
70
72
|
signing_key:
|
71
|
-
specification_version:
|
73
|
+
specification_version: 4
|
72
74
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
73
75
|
test_files: []
|
74
|
-
|
76
|
+
|