henkei 1.17.2 → 1.17.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/henkei.gemspec +7 -7
- data/jar/tika-config.xml +3 -0
- data/lib/henkei.rb +29 -31
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +2 -2
- metadata +26 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d629bcc0d435522497e749752307e6c33844d29
|
4
|
+
data.tar.gz: 490b5e0c89b43f4ec83434e3861c85d69de7a8bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f4e76efb3cf67bca58db9e6dc074511af3e8661b5c03800bbf28cce9939b01774c33aaa65ae24df5e46a5346cda97347e17add23e934cbaf0cbaf7c48e73246
|
7
|
+
data.tar.gz: 21cf99d84f4428f3db892aabc5827f02c848e70bb693080ac33789f4ea66d64de03f88856af7e89c14a4b783a038e33c5fba7ad6229e316f99be2cc2c0ab5fa1
|
data/henkei.gemspec
CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Henkei::VERSION
|
9
9
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
10
10
|
spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
|
11
|
-
spec.description =
|
12
|
-
spec.summary =
|
11
|
+
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
12
|
+
spec.summary = 'Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
13
13
|
spec.homepage = 'http://github.com/abrom/henkei'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'mime-types', '>= 1.23'
|
22
|
-
spec.add_runtime_dependency 'json', '>= 1.8'
|
21
|
+
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
22
|
+
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
23
23
|
|
24
24
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
25
|
-
spec.add_development_dependency 'rake'
|
26
|
-
spec.add_development_dependency 'rspec', '~> 3.
|
27
|
-
spec.add_development_dependency 'simplecov'
|
25
|
+
spec.add_development_dependency 'rake', '~> 12.3'
|
26
|
+
spec.add_development_dependency 'rspec', '~> 3.7'
|
27
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
28
28
|
end
|
data/jar/tika-config.xml
ADDED
data/lib/henkei.rb
CHANGED
@@ -10,8 +10,9 @@ require 'socket'
|
|
10
10
|
require 'stringio'
|
11
11
|
|
12
12
|
class Henkei
|
13
|
-
|
14
|
-
|
13
|
+
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
14
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
|
15
|
+
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
15
16
|
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
16
17
|
|
17
18
|
@@server_port = nil
|
@@ -185,21 +186,9 @@ class Henkei
|
|
185
186
|
# Henkei.server(:text, 9294)
|
186
187
|
#
|
187
188
|
def self.server(type, custom_port=nil)
|
188
|
-
switch =
|
189
|
-
case type
|
190
|
-
when :text
|
191
|
-
'-t'
|
192
|
-
when :html
|
193
|
-
'-h'
|
194
|
-
when :metadata
|
195
|
-
'-m -j'
|
196
|
-
when :mimetype
|
197
|
-
'-m -j'
|
198
|
-
end
|
199
|
-
|
200
189
|
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
201
190
|
|
202
|
-
@@server_pid = Process.spawn(
|
191
|
+
@@server_pid = Process.spawn tika_command(type, true)
|
203
192
|
sleep(2) # Give the server 2 seconds to spin up.
|
204
193
|
@@server_pid
|
205
194
|
end
|
@@ -213,7 +202,7 @@ class Henkei
|
|
213
202
|
# Henkei.server(:text)
|
214
203
|
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
215
204
|
# begin
|
216
|
-
# my_texts = reports.map{|report_path| Henkei.new(report_path).text }
|
205
|
+
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
217
206
|
# rescue
|
218
207
|
# ensure
|
219
208
|
# Henkei.kill_server!
|
@@ -231,27 +220,15 @@ class Henkei
|
|
231
220
|
|
232
221
|
# Provide the path to the Java binary
|
233
222
|
#
|
234
|
-
def self.java
|
223
|
+
def self.java_path
|
235
224
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
236
225
|
end
|
237
|
-
private_class_method :java
|
226
|
+
private_class_method :java_path
|
238
227
|
|
239
228
|
# Internal helper for calling to Tika library directly
|
240
229
|
#
|
241
230
|
def self.client_read(type, data)
|
242
|
-
|
243
|
-
case type
|
244
|
-
when :text
|
245
|
-
'-t'
|
246
|
-
when :html
|
247
|
-
'-h'
|
248
|
-
when :metadata
|
249
|
-
'-m -j'
|
250
|
-
when :mimetype
|
251
|
-
'-m -j'
|
252
|
-
end
|
253
|
-
|
254
|
-
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
231
|
+
IO.popen tika_command(type), 'r+' do |io|
|
255
232
|
io.write data
|
256
233
|
io.close_write
|
257
234
|
io.read
|
@@ -283,4 +260,25 @@ class Henkei
|
|
283
260
|
resp
|
284
261
|
end
|
285
262
|
private_class_method :server_read
|
263
|
+
|
264
|
+
# Internal helper for building the Java command to call Tika
|
265
|
+
#
|
266
|
+
def self.tika_command(type, server = false)
|
267
|
+
command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
|
268
|
+
command << "--server --port #{@@server_port}" if server
|
269
|
+
command << switch_for_type(type)
|
270
|
+
command.join ' '
|
271
|
+
end
|
272
|
+
|
273
|
+
# Internal helper for mapping a read type to its Tika command-line switch
|
274
|
+
#
|
275
|
+
def self.switch_for_type(type)
|
276
|
+
case type
|
277
|
+
when :text then '-t'
|
278
|
+
when :html then '-h'
|
279
|
+
when :metadata then '-m -j'
|
280
|
+
when :mimetype then '-m -j'
|
281
|
+
end
|
282
|
+
end
|
283
|
+
private_class_method :switch_for_type
|
286
284
|
end
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -104,13 +104,13 @@ describe Henkei do
|
|
104
104
|
|
105
105
|
describe '.java' do
|
106
106
|
specify 'with no specified JAVA_HOME' do
|
107
|
-
expect( Henkei.send(:java) ).to eql 'java'
|
107
|
+
expect( Henkei.send(:java_path) ).to eql 'java'
|
108
108
|
end
|
109
109
|
|
110
110
|
specify 'with a specified JAVA_HOME' do
|
111
111
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
112
112
|
|
113
|
-
expect( Henkei.send(:java) ).to eql '/path/to/java/home/bin/java'
|
113
|
+
expect( Henkei.send(:java_path) ).to eql '/path/to/java/home/bin/java'
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.17.2
|
4
|
+
version: 1.17.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mime-types
|
@@ -18,6 +18,9 @@ dependencies:
|
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
20
|
version: '1.23'
|
21
|
+
- - "<"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: '4'
|
21
24
|
type: :runtime
|
22
25
|
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -25,6 +28,9 @@ dependencies:
|
|
25
28
|
- - ">="
|
26
29
|
- !ruby/object:Gem::Version
|
27
30
|
version: '1.23'
|
31
|
+
- - "<"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '4'
|
28
34
|
- !ruby/object:Gem::Dependency
|
29
35
|
name: json
|
30
36
|
requirement: !ruby/object:Gem::Requirement
|
@@ -32,6 +38,9 @@ dependencies:
|
|
32
38
|
- - ">="
|
33
39
|
- !ruby/object:Gem::Version
|
34
40
|
version: '1.8'
|
41
|
+
- - "<"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '3'
|
35
44
|
type: :runtime
|
36
45
|
prerelease: false
|
37
46
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -39,6 +48,9 @@ dependencies:
|
|
39
48
|
- - ">="
|
40
49
|
- !ruby/object:Gem::Version
|
41
50
|
version: '1.8'
|
51
|
+
- - "<"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '3'
|
42
54
|
- !ruby/object:Gem::Dependency
|
43
55
|
name: bundler
|
44
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,46 +69,45 @@ dependencies:
|
|
57
69
|
name: rake
|
58
70
|
requirement: !ruby/object:Gem::Requirement
|
59
71
|
requirements:
|
60
|
-
- - ">="
|
72
|
+
- - "~>"
|
61
73
|
- !ruby/object:Gem::Version
|
62
|
-
version: '0'
|
74
|
+
version: '12.3'
|
63
75
|
type: :development
|
64
76
|
prerelease: false
|
65
77
|
version_requirements: !ruby/object:Gem::Requirement
|
66
78
|
requirements:
|
67
|
-
- - ">="
|
79
|
+
- - "~>"
|
68
80
|
- !ruby/object:Gem::Version
|
69
|
-
version: '0'
|
81
|
+
version: '12.3'
|
70
82
|
- !ruby/object:Gem::Dependency
|
71
83
|
name: rspec
|
72
84
|
requirement: !ruby/object:Gem::Requirement
|
73
85
|
requirements:
|
74
86
|
- - "~>"
|
75
87
|
- !ruby/object:Gem::Version
|
76
|
-
version: '3.
|
88
|
+
version: '3.7'
|
77
89
|
type: :development
|
78
90
|
prerelease: false
|
79
91
|
version_requirements: !ruby/object:Gem::Requirement
|
80
92
|
requirements:
|
81
93
|
- - "~>"
|
82
94
|
- !ruby/object:Gem::Version
|
83
|
-
version: '3.
|
95
|
+
version: '3.7'
|
84
96
|
- !ruby/object:Gem::Dependency
|
85
97
|
name: simplecov
|
86
98
|
requirement: !ruby/object:Gem::Requirement
|
87
99
|
requirements:
|
88
|
-
- - ">="
|
100
|
+
- - "~>"
|
89
101
|
- !ruby/object:Gem::Version
|
90
|
-
version: '0'
|
102
|
+
version: '0.15'
|
91
103
|
type: :development
|
92
104
|
prerelease: false
|
93
105
|
version_requirements: !ruby/object:Gem::Requirement
|
94
106
|
requirements:
|
95
|
-
- - ">="
|
107
|
+
- - "~>"
|
96
108
|
- !ruby/object:Gem::Version
|
97
|
-
version: '0'
|
98
|
-
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
99
|
-
.odt, .rtf, .pdf) using Apache Tika toolkit
|
109
|
+
version: '0.15'
|
110
|
+
description: Read text and metadata from files and documents using Apache Tika toolkit
|
100
111
|
email:
|
101
112
|
- erol.fornoles@gmail.com
|
102
113
|
- a.bromwich@gmail.com
|
@@ -114,6 +125,7 @@ files:
|
|
114
125
|
- Rakefile
|
115
126
|
- henkei.gemspec
|
116
127
|
- jar/tika-app-1.17.jar
|
128
|
+
- jar/tika-config.xml
|
117
129
|
- lib/henkei.rb
|
118
130
|
- lib/henkei/version.rb
|
119
131
|
- lib/henkei/yomu.rb
|