henkei 1.17.3 → 1.17.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -0
- data/.travis.yml +5 -2
- data/Rakefile +1 -1
- data/henkei.gemspec +7 -5
- data/lib/henkei.rb +28 -35
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +45 -34
- metadata +26 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4215a4071c7a618e66330cb7eca8873eae851b7b
|
4
|
+
data.tar.gz: 488366df7fdf1863272dc121b7fa74f077036352
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21ba327a8c1139b586944ca1d28e35158c66386a721da9fc6cb71d400f52e9f339931d17f39a3e1b2344e65f65a7ccddeeff84e515dc6c5d491e4005749071f4
|
7
|
+
data.tar.gz: 880bd8f6721d5c6659da8d10aac9949a582d670bdd18b64de16089c738c57c64fe7499d5405e0af736571317f79dedcf783b8ba0389b9794b90cbc939548916c
|
data/.rubocop.yml
ADDED
data/.travis.yml
CHANGED
@@ -4,8 +4,6 @@ env:
|
|
4
4
|
|
5
5
|
language: ruby
|
6
6
|
rvm:
|
7
|
-
- 1.9.3
|
8
|
-
- 2.0.0
|
9
7
|
- 2.1.10
|
10
8
|
- 2.2.7
|
11
9
|
- 2.3.6
|
@@ -15,12 +13,17 @@ rvm:
|
|
15
13
|
before_install:
|
16
14
|
- gem update bundler
|
17
15
|
|
16
|
+
install:
|
17
|
+
- bundle install --jobs=3 --retry=3
|
18
|
+
- gem install rubocop
|
19
|
+
|
18
20
|
before_script:
|
19
21
|
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
20
22
|
- chmod +x ./cc-test-reporter
|
21
23
|
- ./cc-test-reporter before-build
|
22
24
|
|
23
25
|
script:
|
26
|
+
- rubocop
|
24
27
|
- bundle exec rspec
|
25
28
|
|
26
29
|
after_script:
|
data/Rakefile
CHANGED
data/henkei.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
|
4
4
|
require 'henkei/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
@@ -9,20 +9,22 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
10
10
|
spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
|
11
11
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
12
|
-
spec.summary = 'Read text and metadata from files and documents
|
12
|
+
spec.summary = 'Read text and metadata from files and documents ' \
|
13
|
+
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
13
14
|
spec.homepage = 'http://github.com/abrom/henkei'
|
14
15
|
spec.license = 'MIT'
|
15
16
|
|
16
|
-
spec.files = `git ls-files`.split(
|
17
|
+
spec.files = `git ls-files`.split("\n")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
20
|
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
22
22
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
23
|
+
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
23
24
|
|
24
25
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
25
26
|
spec.add_development_dependency 'rake', '~> 12.3'
|
26
27
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
28
|
+
spec.add_development_dependency 'rubocop', '~> 0.53'
|
27
29
|
spec.add_development_dependency 'simplecov', '~> 0.15'
|
28
30
|
end
|
data/lib/henkei.rb
CHANGED
@@ -9,7 +9,8 @@ require 'json'
|
|
9
9
|
require 'socket'
|
10
10
|
require 'stringio'
|
11
11
|
|
12
|
-
|
12
|
+
# Read text and metadata from files and documents using Apache Tika toolkit
|
13
|
+
class Henkei # rubocop:disable Metrics/ClassLength
|
13
14
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
14
15
|
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
|
15
16
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
@@ -25,17 +26,13 @@ class Henkei
|
|
25
26
|
# metadata = Henkei.read :metadata, data
|
26
27
|
#
|
27
28
|
def self.read(type, data)
|
28
|
-
result = @@server_pid ? server_read(
|
29
|
+
result = @@server_pid ? server_read(data) : client_read(type, data)
|
29
30
|
|
30
31
|
case type
|
31
|
-
when :text
|
32
|
-
|
33
|
-
when :
|
34
|
-
|
35
|
-
when :metadata
|
36
|
-
JSON.parse(result)
|
37
|
-
when :mimetype
|
38
|
-
MIME::Types[JSON.parse(result)['Content-Type']].first
|
32
|
+
when :text then result
|
33
|
+
when :html then result
|
34
|
+
when :metadata then JSON.parse(result)
|
35
|
+
when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
|
39
36
|
end
|
40
37
|
end
|
41
38
|
|
@@ -55,17 +52,17 @@ class Henkei
|
|
55
52
|
#
|
56
53
|
def initialize(input)
|
57
54
|
if input.is_a? String
|
58
|
-
if File.
|
55
|
+
if File.exist? input
|
59
56
|
@path = input
|
60
|
-
elsif input =~ URI::
|
57
|
+
elsif input =~ URI::DEFAULT_PARSER.make_regexp
|
61
58
|
@uri = URI.parse input
|
62
59
|
else
|
63
|
-
raise Errno::ENOENT
|
60
|
+
raise Errno::ENOENT, "missing file or invalid URI - #{input}"
|
64
61
|
end
|
65
62
|
elsif input.respond_to? :read
|
66
63
|
@stream = input
|
67
64
|
else
|
68
|
-
raise TypeError
|
65
|
+
raise TypeError, "can't read from #{input.class.name}"
|
69
66
|
end
|
70
67
|
end
|
71
68
|
|
@@ -112,7 +109,7 @@ class Henkei
|
|
112
109
|
return @mimetype if defined? @mimetype
|
113
110
|
|
114
111
|
type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
|
115
|
-
|
112
|
+
|
116
113
|
@mimetype = MIME::Types[type].first
|
117
114
|
end
|
118
115
|
|
@@ -123,12 +120,8 @@ class Henkei
|
|
123
120
|
#
|
124
121
|
def creation_date
|
125
122
|
return @creation_date if defined? @creation_date
|
126
|
-
|
127
|
-
|
128
|
-
@creation_date = Time.parse(metadata['Creation-Date'])
|
129
|
-
else
|
130
|
-
nil
|
131
|
-
end
|
123
|
+
return unless metadata['Creation-Date']
|
124
|
+
@creation_date = Time.parse(metadata['Creation-Date'])
|
132
125
|
end
|
133
126
|
|
134
127
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -182,19 +175,19 @@ class Henkei
|
|
182
175
|
#
|
183
176
|
# type :html, :text or :metadata
|
184
177
|
# custom_port e.g. 9293
|
185
|
-
#
|
178
|
+
#
|
186
179
|
# Henkei.server(:text, 9294)
|
187
180
|
#
|
188
|
-
def self.server(type, custom_port=nil)
|
181
|
+
def self.server(type, custom_port = nil)
|
189
182
|
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
190
|
-
|
183
|
+
|
191
184
|
@@server_pid = Process.spawn tika_command(type, true)
|
192
185
|
sleep(2) # Give the server 2 seconds to spin up.
|
193
186
|
@@server_pid
|
194
187
|
end
|
195
188
|
|
196
189
|
# Kills server started by Henkei.server
|
197
|
-
#
|
190
|
+
#
|
198
191
|
# Always run this when you're done, or else Tika might run until you kill it manually
|
199
192
|
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
200
193
|
# putting this method in the ensure block.
|
@@ -209,11 +202,11 @@ class Henkei
|
|
209
202
|
# end
|
210
203
|
#
|
211
204
|
def self.kill_server!
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
205
|
+
return unless @@server_pid
|
206
|
+
|
207
|
+
Process.kill('INT', @@server_pid)
|
208
|
+
@@server_pid = nil
|
209
|
+
@@server_port = nil
|
217
210
|
end
|
218
211
|
|
219
212
|
### Private class methods
|
@@ -238,12 +231,12 @@ class Henkei
|
|
238
231
|
|
239
232
|
# Internal helper for calling to running Tika server
|
240
233
|
#
|
241
|
-
def self.server_read(
|
234
|
+
def self.server_read(data)
|
242
235
|
s = TCPSocket.new('localhost', @@server_port)
|
243
236
|
file = StringIO.new(data, 'r')
|
244
237
|
|
245
|
-
|
246
|
-
chunk = file.read(
|
238
|
+
loop do
|
239
|
+
chunk = file.read(65_536)
|
247
240
|
break unless chunk
|
248
241
|
s.write(chunk)
|
249
242
|
end
|
@@ -252,8 +245,8 @@ class Henkei
|
|
252
245
|
s.shutdown(Socket::SHUT_WR)
|
253
246
|
|
254
247
|
resp = ''
|
255
|
-
|
256
|
-
chunk = s.recv(
|
248
|
+
loop do
|
249
|
+
chunk = s.recv(65_536)
|
257
250
|
break if chunk.empty? || !chunk
|
258
251
|
resp << chunk
|
259
252
|
end
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -12,27 +12,31 @@ describe Henkei do
|
|
12
12
|
it 'reads text' do
|
13
13
|
text = Henkei.read :text, data
|
14
14
|
|
15
|
-
expect(
|
15
|
+
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'reads metadata' do
|
19
19
|
metadata = Henkei.read :metadata, data
|
20
20
|
|
21
|
-
expect(
|
21
|
+
expect(metadata['Content-Type']).to(
|
22
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
23
|
+
)
|
22
24
|
end
|
23
25
|
|
24
26
|
it 'reads metadata values with colons as strings' do
|
25
27
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
26
28
|
metadata = Henkei.read :metadata, data
|
27
29
|
|
28
|
-
expect(
|
30
|
+
expect(metadata['dc:title']).to eq 'problem: test'
|
29
31
|
end
|
30
32
|
|
31
33
|
it 'reads mimetype' do
|
32
34
|
mimetype = Henkei.read :mimetype, data
|
33
35
|
|
34
|
-
expect(
|
35
|
-
|
36
|
+
expect(mimetype.content_type).to(
|
37
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
38
|
+
)
|
39
|
+
expect(mimetype.extensions).to include 'docx'
|
36
40
|
end
|
37
41
|
end
|
38
42
|
|
@@ -44,47 +48,47 @@ describe Henkei do
|
|
44
48
|
it 'accepts a root path' do
|
45
49
|
henkei = Henkei.new 'spec/samples/sample.pages'
|
46
50
|
|
47
|
-
expect(
|
48
|
-
expect(
|
49
|
-
expect(
|
51
|
+
expect(henkei).to be_path
|
52
|
+
expect(henkei).not_to be_uri
|
53
|
+
expect(henkei).not_to be_stream
|
50
54
|
end
|
51
55
|
|
52
56
|
it 'accepts a relative path' do
|
53
57
|
henkei = Henkei.new 'spec/samples/sample.pages'
|
54
58
|
|
55
|
-
expect(
|
56
|
-
expect(
|
57
|
-
expect(
|
59
|
+
expect(henkei).to be_path
|
60
|
+
expect(henkei).not_to be_uri
|
61
|
+
expect(henkei).not_to be_stream
|
58
62
|
end
|
59
63
|
|
60
64
|
it 'accepts a path with spaces' do
|
61
65
|
henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
|
62
66
|
|
63
|
-
expect(
|
64
|
-
expect(
|
65
|
-
expect(
|
67
|
+
expect(henkei).to be_path
|
68
|
+
expect(henkei).not_to be_uri
|
69
|
+
expect(henkei).not_to be_stream
|
66
70
|
end
|
67
71
|
|
68
72
|
it 'accepts a URI' do
|
69
73
|
henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
70
74
|
|
71
|
-
expect(
|
72
|
-
expect(
|
73
|
-
expect(
|
75
|
+
expect(henkei).to be_uri
|
76
|
+
expect(henkei).not_to be_path
|
77
|
+
expect(henkei).not_to be_stream
|
74
78
|
end
|
75
79
|
|
76
80
|
it 'accepts a stream or object that can be read' do
|
77
81
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
78
82
|
henkei = Henkei.new file
|
79
83
|
|
80
|
-
expect(
|
81
|
-
expect(
|
82
|
-
expect(
|
84
|
+
expect(henkei).to be_stream
|
85
|
+
expect(henkei).not_to be_path
|
86
|
+
expect(henkei).not_to be_uri
|
83
87
|
end
|
84
88
|
end
|
85
89
|
|
86
90
|
it 'refuses a path to a missing file' do
|
87
|
-
expect { Henkei.new 'test/sample/missing.pages'}.to raise_error Errno::ENOENT
|
91
|
+
expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
88
92
|
end
|
89
93
|
|
90
94
|
it 'refuses other objects' do
|
@@ -94,23 +98,22 @@ describe Henkei do
|
|
94
98
|
end
|
95
99
|
end
|
96
100
|
|
97
|
-
|
98
101
|
describe '.creation_date' do
|
99
102
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
100
103
|
it 'should return Time' do
|
101
|
-
expect(
|
104
|
+
expect(henkei.creation_date).to be_a Time
|
102
105
|
end
|
103
106
|
end
|
104
107
|
|
105
108
|
describe '.java' do
|
106
109
|
specify 'with no specified JAVA_HOME' do
|
107
|
-
expect(
|
110
|
+
expect(Henkei.send(:java_path)).to eq 'java'
|
108
111
|
end
|
109
112
|
|
110
113
|
specify 'with a specified JAVA_HOME' do
|
111
114
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
112
115
|
|
113
|
-
expect(
|
116
|
+
expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
114
117
|
end
|
115
118
|
end
|
116
119
|
|
@@ -118,11 +121,11 @@ describe Henkei do
|
|
118
121
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
119
122
|
|
120
123
|
specify '#text reads text' do
|
121
|
-
expect(
|
124
|
+
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
122
125
|
end
|
123
126
|
|
124
127
|
specify '#metadata reads metadata' do
|
125
|
-
expect(
|
128
|
+
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
126
129
|
end
|
127
130
|
end
|
128
131
|
|
@@ -130,11 +133,13 @@ describe Henkei do
|
|
130
133
|
let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
131
134
|
|
132
135
|
specify '#text reads text' do
|
133
|
-
expect(
|
136
|
+
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
134
137
|
end
|
135
138
|
|
136
139
|
specify '#metadata reads metadata' do
|
137
|
-
expect(
|
140
|
+
expect(henkei.metadata['Content-Type']).to(
|
141
|
+
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
142
|
+
)
|
138
143
|
end
|
139
144
|
end
|
140
145
|
|
@@ -142,11 +147,11 @@ describe Henkei do
|
|
142
147
|
let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
|
143
148
|
|
144
149
|
specify '#text reads text' do
|
145
|
-
expect(
|
150
|
+
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
146
151
|
end
|
147
152
|
|
148
153
|
specify '#metadata reads metadata' do
|
149
|
-
expect(
|
154
|
+
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
150
155
|
end
|
151
156
|
end
|
152
157
|
|
@@ -171,9 +176,15 @@ describe Henkei do
|
|
171
176
|
specify '#runs samples through server mode' do
|
172
177
|
begin
|
173
178
|
Henkei.server(:text)
|
174
|
-
expect(Henkei.new('spec/samples/sample.pages').text).to
|
175
|
-
|
176
|
-
|
179
|
+
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
180
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
181
|
+
)
|
182
|
+
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
183
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
184
|
+
)
|
185
|
+
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
186
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
187
|
+
)
|
177
188
|
ensure
|
178
189
|
Henkei.kill_server!
|
179
190
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.17.
|
4
|
+
version: 1.17.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -12,45 +12,45 @@ cert_chain: []
|
|
12
12
|
date: 2018-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: json
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: '1.
|
20
|
+
version: '1.8'
|
21
21
|
- - "<"
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version: '
|
23
|
+
version: '3'
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
27
27
|
requirements:
|
28
28
|
- - ">="
|
29
29
|
- !ruby/object:Gem::Version
|
30
|
-
version: '1.
|
30
|
+
version: '1.8'
|
31
31
|
- - "<"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '3'
|
34
34
|
- !ruby/object:Gem::Dependency
|
35
|
-
name:
|
35
|
+
name: mime-types
|
36
36
|
requirement: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1.
|
40
|
+
version: '1.23'
|
41
41
|
- - "<"
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '
|
43
|
+
version: '4'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
46
|
version_requirements: !ruby/object:Gem::Requirement
|
47
47
|
requirements:
|
48
48
|
- - ">="
|
49
49
|
- !ruby/object:Gem::Version
|
50
|
-
version: '1.
|
50
|
+
version: '1.23'
|
51
51
|
- - "<"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
53
|
+
version: '4'
|
54
54
|
- !ruby/object:Gem::Dependency
|
55
55
|
name: bundler
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -93,6 +93,20 @@ dependencies:
|
|
93
93
|
- - "~>"
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '3.7'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: rubocop
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0.53'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0.53'
|
96
110
|
- !ruby/object:Gem::Dependency
|
97
111
|
name: simplecov
|
98
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -117,6 +131,7 @@ extra_rdoc_files: []
|
|
117
131
|
files:
|
118
132
|
- ".gitignore"
|
119
133
|
- ".rspec"
|
134
|
+
- ".rubocop.yml"
|
120
135
|
- ".travis.yml"
|
121
136
|
- Gemfile
|
122
137
|
- LICENSE
|