henkei 1.17.3 → 1.17.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -0
- data/.travis.yml +5 -2
- data/Rakefile +1 -1
- data/henkei.gemspec +7 -5
- data/lib/henkei.rb +28 -35
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +45 -34
- metadata +26 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4215a4071c7a618e66330cb7eca8873eae851b7b
|
4
|
+
data.tar.gz: 488366df7fdf1863272dc121b7fa74f077036352
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21ba327a8c1139b586944ca1d28e35158c66386a721da9fc6cb71d400f52e9f339931d17f39a3e1b2344e65f65a7ccddeeff84e515dc6c5d491e4005749071f4
|
7
|
+
data.tar.gz: 880bd8f6721d5c6659da8d10aac9949a582d670bdd18b64de16089c738c57c64fe7499d5405e0af736571317f79dedcf783b8ba0389b9794b90cbc939548916c
|
data/.rubocop.yml
ADDED
data/.travis.yml
CHANGED
@@ -4,8 +4,6 @@ env:
|
|
4
4
|
|
5
5
|
language: ruby
|
6
6
|
rvm:
|
7
|
-
- 1.9.3
|
8
|
-
- 2.0.0
|
9
7
|
- 2.1.10
|
10
8
|
- 2.2.7
|
11
9
|
- 2.3.6
|
@@ -15,12 +13,17 @@ rvm:
|
|
15
13
|
before_install:
|
16
14
|
- gem update bundler
|
17
15
|
|
16
|
+
install:
|
17
|
+
- bundle install --jobs=3 --retry=3
|
18
|
+
- gem install rubocop
|
19
|
+
|
18
20
|
before_script:
|
19
21
|
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
20
22
|
- chmod +x ./cc-test-reporter
|
21
23
|
- ./cc-test-reporter before-build
|
22
24
|
|
23
25
|
script:
|
26
|
+
- rubocop
|
24
27
|
- bundle exec rspec
|
25
28
|
|
26
29
|
after_script:
|
data/Rakefile
CHANGED
data/henkei.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
4
|
require 'henkei/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
@@ -9,20 +9,22 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
10
10
|
spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
|
11
11
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
12
|
-
spec.summary = 'Read text and metadata from files and documents
|
12
|
+
spec.summary = 'Read text and metadata from files and documents ' \
|
13
|
+
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
13
14
|
spec.homepage = 'http://github.com/abrom/henkei'
|
14
15
|
spec.license = 'MIT'
|
15
16
|
|
16
|
-
spec.files = `git ls-files`.split(
|
17
|
+
spec.files = `git ls-files`.split("\n")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
20
|
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
22
22
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
23
|
+
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
23
24
|
|
24
25
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
25
26
|
spec.add_development_dependency 'rake', '~> 12.3'
|
26
27
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
28
|
+
spec.add_development_dependency 'rubocop', '~> 0.53'
|
27
29
|
spec.add_development_dependency 'simplecov', '~> 0.15'
|
28
30
|
end
|
data/lib/henkei.rb
CHANGED
@@ -9,7 +9,8 @@ require 'json'
|
|
9
9
|
require 'socket'
|
10
10
|
require 'stringio'
|
11
11
|
|
12
|
-
|
12
|
+
# Read text and metadata from files and documents using Apache Tika toolkit
|
13
|
+
class Henkei # rubocop:disable Metrics/ClassLength
|
13
14
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
14
15
|
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
|
15
16
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
@@ -25,17 +26,13 @@ class Henkei
|
|
25
26
|
# metadata = Henkei.read :metadata, data
|
26
27
|
#
|
27
28
|
def self.read(type, data)
|
28
|
-
result = @@server_pid ? server_read(
|
29
|
+
result = @@server_pid ? server_read(data) : client_read(type, data)
|
29
30
|
|
30
31
|
case type
|
31
|
-
when :text
|
32
|
-
|
33
|
-
when :
|
34
|
-
|
35
|
-
when :metadata
|
36
|
-
JSON.parse(result)
|
37
|
-
when :mimetype
|
38
|
-
MIME::Types[JSON.parse(result)['Content-Type']].first
|
32
|
+
when :text then result
|
33
|
+
when :html then result
|
34
|
+
when :metadata then JSON.parse(result)
|
35
|
+
when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
|
39
36
|
end
|
40
37
|
end
|
41
38
|
|
@@ -55,17 +52,17 @@ class Henkei
|
|
55
52
|
#
|
56
53
|
def initialize(input)
|
57
54
|
if input.is_a? String
|
58
|
-
if File.
|
55
|
+
if File.exist? input
|
59
56
|
@path = input
|
60
|
-
elsif input =~ URI::
|
57
|
+
elsif input =~ URI::DEFAULT_PARSER.make_regexp
|
61
58
|
@uri = URI.parse input
|
62
59
|
else
|
63
|
-
raise Errno::ENOENT
|
60
|
+
raise Errno::ENOENT, "missing file or invalid URI - #{input}"
|
64
61
|
end
|
65
62
|
elsif input.respond_to? :read
|
66
63
|
@stream = input
|
67
64
|
else
|
68
|
-
raise TypeError
|
65
|
+
raise TypeError, "can't read from #{input.class.name}"
|
69
66
|
end
|
70
67
|
end
|
71
68
|
|
@@ -112,7 +109,7 @@ class Henkei
|
|
112
109
|
return @mimetype if defined? @mimetype
|
113
110
|
|
114
111
|
type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
|
115
|
-
|
112
|
+
|
116
113
|
@mimetype = MIME::Types[type].first
|
117
114
|
end
|
118
115
|
|
@@ -123,12 +120,8 @@ class Henkei
|
|
123
120
|
#
|
124
121
|
def creation_date
|
125
122
|
return @creation_date if defined? @creation_date
|
126
|
-
|
127
|
-
|
128
|
-
@creation_date = Time.parse(metadata['Creation-Date'])
|
129
|
-
else
|
130
|
-
nil
|
131
|
-
end
|
123
|
+
return unless metadata['Creation-Date']
|
124
|
+
@creation_date = Time.parse(metadata['Creation-Date'])
|
132
125
|
end
|
133
126
|
|
134
127
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -182,19 +175,19 @@ class Henkei
|
|
182
175
|
#
|
183
176
|
# type :html, :text or :metadata
|
184
177
|
# custom_port e.g. 9293
|
185
|
-
#
|
178
|
+
#
|
186
179
|
# Henkei.server(:text, 9294)
|
187
180
|
#
|
188
|
-
def self.server(type, custom_port=nil)
|
181
|
+
def self.server(type, custom_port = nil)
|
189
182
|
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
190
|
-
|
183
|
+
|
191
184
|
@@server_pid = Process.spawn tika_command(type, true)
|
192
185
|
sleep(2) # Give the server 2 seconds to spin up.
|
193
186
|
@@server_pid
|
194
187
|
end
|
195
188
|
|
196
189
|
# Kills server started by Henkei.server
|
197
|
-
#
|
190
|
+
#
|
198
191
|
# Always run this when you're done, or else Tika might run until you kill it manually
|
199
192
|
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
200
193
|
# putting this method in the ensure block.
|
@@ -209,11 +202,11 @@ class Henkei
|
|
209
202
|
# end
|
210
203
|
#
|
211
204
|
def self.kill_server!
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
205
|
+
return unless @@server_pid
|
206
|
+
|
207
|
+
Process.kill('INT', @@server_pid)
|
208
|
+
@@server_pid = nil
|
209
|
+
@@server_port = nil
|
217
210
|
end
|
218
211
|
|
219
212
|
### Private class methods
|
@@ -238,12 +231,12 @@ class Henkei
|
|
238
231
|
|
239
232
|
# Internal helper for calling to running Tika server
|
240
233
|
#
|
241
|
-
def self.server_read(
|
234
|
+
def self.server_read(data)
|
242
235
|
s = TCPSocket.new('localhost', @@server_port)
|
243
236
|
file = StringIO.new(data, 'r')
|
244
237
|
|
245
|
-
|
246
|
-
chunk = file.read(
|
238
|
+
loop do
|
239
|
+
chunk = file.read(65_536)
|
247
240
|
break unless chunk
|
248
241
|
s.write(chunk)
|
249
242
|
end
|
@@ -252,8 +245,8 @@ class Henkei
|
|
252
245
|
s.shutdown(Socket::SHUT_WR)
|
253
246
|
|
254
247
|
resp = ''
|
255
|
-
|
256
|
-
chunk = s.recv(
|
248
|
+
loop do
|
249
|
+
chunk = s.recv(65_536)
|
257
250
|
break if chunk.empty? || !chunk
|
258
251
|
resp << chunk
|
259
252
|
end
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -12,27 +12,31 @@ describe Henkei do
|
|
12
12
|
it 'reads text' do
|
13
13
|
text = Henkei.read :text, data
|
14
14
|
|
15
|
-
expect(
|
15
|
+
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'reads metadata' do
|
19
19
|
metadata = Henkei.read :metadata, data
|
20
20
|
|
21
|
-
expect(
|
21
|
+
expect(metadata['Content-Type']).to(
|
22
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
23
|
+
)
|
22
24
|
end
|
23
25
|
|
24
26
|
it 'reads metadata values with colons as strings' do
|
25
27
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
26
28
|
metadata = Henkei.read :metadata, data
|
27
29
|
|
28
|
-
expect(
|
30
|
+
expect(metadata['dc:title']).to eq 'problem: test'
|
29
31
|
end
|
30
32
|
|
31
33
|
it 'reads mimetype' do
|
32
34
|
mimetype = Henkei.read :mimetype, data
|
33
35
|
|
34
|
-
expect(
|
35
|
-
|
36
|
+
expect(mimetype.content_type).to(
|
37
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
38
|
+
)
|
39
|
+
expect(mimetype.extensions).to include 'docx'
|
36
40
|
end
|
37
41
|
end
|
38
42
|
|
@@ -44,47 +48,47 @@ describe Henkei do
|
|
44
48
|
it 'accepts a root path' do
|
45
49
|
henkei = Henkei.new 'spec/samples/sample.pages'
|
46
50
|
|
47
|
-
expect(
|
48
|
-
expect(
|
49
|
-
expect(
|
51
|
+
expect(henkei).to be_path
|
52
|
+
expect(henkei).not_to be_uri
|
53
|
+
expect(henkei).not_to be_stream
|
50
54
|
end
|
51
55
|
|
52
56
|
it 'accepts a relative path' do
|
53
57
|
henkei = Henkei.new 'spec/samples/sample.pages'
|
54
58
|
|
55
|
-
expect(
|
56
|
-
expect(
|
57
|
-
expect(
|
59
|
+
expect(henkei).to be_path
|
60
|
+
expect(henkei).not_to be_uri
|
61
|
+
expect(henkei).not_to be_stream
|
58
62
|
end
|
59
63
|
|
60
64
|
it 'accepts a path with spaces' do
|
61
65
|
henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
|
62
66
|
|
63
|
-
expect(
|
64
|
-
expect(
|
65
|
-
expect(
|
67
|
+
expect(henkei).to be_path
|
68
|
+
expect(henkei).not_to be_uri
|
69
|
+
expect(henkei).not_to be_stream
|
66
70
|
end
|
67
71
|
|
68
72
|
it 'accepts a URI' do
|
69
73
|
henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
70
74
|
|
71
|
-
expect(
|
72
|
-
expect(
|
73
|
-
expect(
|
75
|
+
expect(henkei).to be_uri
|
76
|
+
expect(henkei).not_to be_path
|
77
|
+
expect(henkei).not_to be_stream
|
74
78
|
end
|
75
79
|
|
76
80
|
it 'accepts a stream or object that can be read' do
|
77
81
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
78
82
|
henkei = Henkei.new file
|
79
83
|
|
80
|
-
expect(
|
81
|
-
expect(
|
82
|
-
expect(
|
84
|
+
expect(henkei).to be_stream
|
85
|
+
expect(henkei).not_to be_path
|
86
|
+
expect(henkei).not_to be_uri
|
83
87
|
end
|
84
88
|
end
|
85
89
|
|
86
90
|
it 'refuses a path to a missing file' do
|
87
|
-
expect { Henkei.new 'test/sample/missing.pages'}.to raise_error Errno::ENOENT
|
91
|
+
expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
88
92
|
end
|
89
93
|
|
90
94
|
it 'refuses other objects' do
|
@@ -94,23 +98,22 @@ describe Henkei do
|
|
94
98
|
end
|
95
99
|
end
|
96
100
|
|
97
|
-
|
98
101
|
describe '.creation_date' do
|
99
102
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
100
103
|
it 'should return Time' do
|
101
|
-
expect(
|
104
|
+
expect(henkei.creation_date).to be_a Time
|
102
105
|
end
|
103
106
|
end
|
104
107
|
|
105
108
|
describe '.java' do
|
106
109
|
specify 'with no specified JAVA_HOME' do
|
107
|
-
expect(
|
110
|
+
expect(Henkei.send(:java_path)).to eq 'java'
|
108
111
|
end
|
109
112
|
|
110
113
|
specify 'with a specified JAVA_HOME' do
|
111
114
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
112
115
|
|
113
|
-
expect(
|
116
|
+
expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
114
117
|
end
|
115
118
|
end
|
116
119
|
|
@@ -118,11 +121,11 @@ describe Henkei do
|
|
118
121
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
119
122
|
|
120
123
|
specify '#text reads text' do
|
121
|
-
expect(
|
124
|
+
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
122
125
|
end
|
123
126
|
|
124
127
|
specify '#metadata reads metadata' do
|
125
|
-
expect(
|
128
|
+
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
126
129
|
end
|
127
130
|
end
|
128
131
|
|
@@ -130,11 +133,13 @@ describe Henkei do
|
|
130
133
|
let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
131
134
|
|
132
135
|
specify '#text reads text' do
|
133
|
-
expect(
|
136
|
+
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
134
137
|
end
|
135
138
|
|
136
139
|
specify '#metadata reads metadata' do
|
137
|
-
expect(
|
140
|
+
expect(henkei.metadata['Content-Type']).to(
|
141
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
142
|
+
)
|
138
143
|
end
|
139
144
|
end
|
140
145
|
|
@@ -142,11 +147,11 @@ describe Henkei do
|
|
142
147
|
let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
|
143
148
|
|
144
149
|
specify '#text reads text' do
|
145
|
-
expect(
|
150
|
+
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
146
151
|
end
|
147
152
|
|
148
153
|
specify '#metadata reads metadata' do
|
149
|
-
expect(
|
154
|
+
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
150
155
|
end
|
151
156
|
end
|
152
157
|
|
@@ -171,9 +176,15 @@ describe Henkei do
|
|
171
176
|
specify '#runs samples through server mode' do
|
172
177
|
begin
|
173
178
|
Henkei.server(:text)
|
174
|
-
expect(Henkei.new('spec/samples/sample.pages').text).to
|
175
|
-
|
176
|
-
|
179
|
+
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
180
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
181
|
+
)
|
182
|
+
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
183
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
184
|
+
)
|
185
|
+
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
186
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
187
|
+
)
|
177
188
|
ensure
|
178
189
|
Henkei.kill_server!
|
179
190
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.17.
|
4
|
+
version: 1.17.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -12,45 +12,45 @@ cert_chain: []
|
|
12
12
|
date: 2018-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: json
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: '1.
|
20
|
+
version: '1.8'
|
21
21
|
- - "<"
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version: '
|
23
|
+
version: '3'
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
27
27
|
requirements:
|
28
28
|
- - ">="
|
29
29
|
- !ruby/object:Gem::Version
|
30
|
-
version: '1.
|
30
|
+
version: '1.8'
|
31
31
|
- - "<"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '3'
|
34
34
|
- !ruby/object:Gem::Dependency
|
35
|
-
name:
|
35
|
+
name: mime-types
|
36
36
|
requirement: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1.
|
40
|
+
version: '1.23'
|
41
41
|
- - "<"
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '
|
43
|
+
version: '4'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
46
|
version_requirements: !ruby/object:Gem::Requirement
|
47
47
|
requirements:
|
48
48
|
- - ">="
|
49
49
|
- !ruby/object:Gem::Version
|
50
|
-
version: '1.
|
50
|
+
version: '1.23'
|
51
51
|
- - "<"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
53
|
+
version: '4'
|
54
54
|
- !ruby/object:Gem::Dependency
|
55
55
|
name: bundler
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -93,6 +93,20 @@ dependencies:
|
|
93
93
|
- - "~>"
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '3.7'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: rubocop
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0.53'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0.53'
|
96
110
|
- !ruby/object:Gem::Dependency
|
97
111
|
name: simplecov
|
98
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -117,6 +131,7 @@ extra_rdoc_files: []
|
|
117
131
|
files:
|
118
132
|
- ".gitignore"
|
119
133
|
- ".rspec"
|
134
|
+
- ".rubocop.yml"
|
120
135
|
- ".travis.yml"
|
121
136
|
- Gemfile
|
122
137
|
- LICENSE
|