henkei 1.17.2 → 1.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/henkei.gemspec +7 -7
- data/jar/tika-config.xml +3 -0
- data/lib/henkei.rb +29 -31
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +2 -2
- metadata +26 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d629bcc0d435522497e749752307e6c33844d29
|
4
|
+
data.tar.gz: 490b5e0c89b43f4ec83434e3861c85d69de7a8bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f4e76efb3cf67bca58db9e6dc074511af3e8661b5c03800bbf28cce9939b01774c33aaa65ae24df5e46a5346cda97347e17add23e934cbaf0cbaf7c48e73246
|
7
|
+
data.tar.gz: 21cf99d84f4428f3db892aabc5827f02c848e70bb693080ac33789f4ea66d64de03f88856af7e89c14a4b783a038e33c5fba7ad6229e316f99be2cc2c0ab5fa1
|
data/henkei.gemspec
CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Henkei::VERSION
|
9
9
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
10
10
|
spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
|
11
|
-
spec.description =
|
12
|
-
spec.summary =
|
11
|
+
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
12
|
+
spec.summary = 'Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
13
13
|
spec.homepage = 'http://github.com/abrom/henkei'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'mime-types', '>= 1.23'
|
22
|
-
spec.add_runtime_dependency 'json', '>= 1.8'
|
21
|
+
spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
|
22
|
+
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
23
23
|
|
24
24
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
25
|
-
spec.add_development_dependency 'rake'
|
26
|
-
spec.add_development_dependency 'rspec', '~> 3.
|
27
|
-
spec.add_development_dependency 'simplecov'
|
25
|
+
spec.add_development_dependency 'rake', '~> 12.3'
|
26
|
+
spec.add_development_dependency 'rspec', '~> 3.7'
|
27
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
28
28
|
end
|
data/jar/tika-config.xml
ADDED
data/lib/henkei.rb
CHANGED
@@ -10,8 +10,9 @@ require 'socket'
|
|
10
10
|
require 'stringio'
|
11
11
|
|
12
12
|
class Henkei
|
13
|
-
|
14
|
-
|
13
|
+
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
14
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
|
15
|
+
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
15
16
|
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
16
17
|
|
17
18
|
@@server_port = nil
|
@@ -185,21 +186,9 @@ class Henkei
|
|
185
186
|
# Henkei.server(:text, 9294)
|
186
187
|
#
|
187
188
|
def self.server(type, custom_port=nil)
|
188
|
-
switch =
|
189
|
-
case type
|
190
|
-
when :text
|
191
|
-
'-t'
|
192
|
-
when :html
|
193
|
-
'-h'
|
194
|
-
when :metadata
|
195
|
-
'-m -j'
|
196
|
-
when :mimetype
|
197
|
-
'-m -j'
|
198
|
-
end
|
199
|
-
|
200
189
|
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
201
190
|
|
202
|
-
@@server_pid = Process.spawn(
|
191
|
+
@@server_pid = Process.spawn tika_command(type, true)
|
203
192
|
sleep(2) # Give the server 2 seconds to spin up.
|
204
193
|
@@server_pid
|
205
194
|
end
|
@@ -213,7 +202,7 @@ class Henkei
|
|
213
202
|
# Henkei.server(:text)
|
214
203
|
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
215
204
|
# begin
|
216
|
-
# my_texts = reports.map{|report_path| Henkei.new(report_path).text }
|
205
|
+
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
217
206
|
# rescue
|
218
207
|
# ensure
|
219
208
|
# Henkei.kill_server!
|
@@ -231,27 +220,15 @@ class Henkei
|
|
231
220
|
|
232
221
|
# Provide the path to the Java binary
|
233
222
|
#
|
234
|
-
def self.
|
223
|
+
def self.java_path
|
235
224
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
236
225
|
end
|
237
|
-
private_class_method :
|
226
|
+
private_class_method :java_path
|
238
227
|
|
239
228
|
# Internal helper for calling to Tika library directly
|
240
229
|
#
|
241
230
|
def self.client_read(type, data)
|
242
|
-
|
243
|
-
case type
|
244
|
-
when :text
|
245
|
-
'-t'
|
246
|
-
when :html
|
247
|
-
'-h'
|
248
|
-
when :metadata
|
249
|
-
'-m -j'
|
250
|
-
when :mimetype
|
251
|
-
'-m -j'
|
252
|
-
end
|
253
|
-
|
254
|
-
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
231
|
+
IO.popen tika_command(type), 'r+' do |io|
|
255
232
|
io.write data
|
256
233
|
io.close_write
|
257
234
|
io.read
|
@@ -283,4 +260,25 @@ class Henkei
|
|
283
260
|
resp
|
284
261
|
end
|
285
262
|
private_class_method :server_read
|
263
|
+
|
264
|
+
# Internal helper for building the Java command to call Tika
|
265
|
+
#
|
266
|
+
def self.tika_command(type, server = false)
|
267
|
+
command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
|
268
|
+
command << "--server --port #{@@server_port}" if server
|
269
|
+
command << switch_for_type(type)
|
270
|
+
command.join ' '
|
271
|
+
end
|
272
|
+
|
273
|
+
# Internal helper for building the Java command to call Tika
|
274
|
+
#
|
275
|
+
def self.switch_for_type(type)
|
276
|
+
case type
|
277
|
+
when :text then '-t'
|
278
|
+
when :html then '-h'
|
279
|
+
when :metadata then '-m -j'
|
280
|
+
when :mimetype then '-m -j'
|
281
|
+
end
|
282
|
+
end
|
283
|
+
private_class_method :switch_for_type
|
286
284
|
end
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -104,13 +104,13 @@ describe Henkei do
|
|
104
104
|
|
105
105
|
describe '.java' do
|
106
106
|
specify 'with no specified JAVA_HOME' do
|
107
|
-
expect( Henkei.send(:
|
107
|
+
expect( Henkei.send(:java_path) ).to eql 'java'
|
108
108
|
end
|
109
109
|
|
110
110
|
specify 'with a specified JAVA_HOME' do
|
111
111
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
112
112
|
|
113
|
-
expect( Henkei.send(:
|
113
|
+
expect( Henkei.send(:java_path) ).to eql '/path/to/java/home/bin/java'
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.17.
|
4
|
+
version: 1.17.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mime-types
|
@@ -18,6 +18,9 @@ dependencies:
|
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
20
|
version: '1.23'
|
21
|
+
- - "<"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: '4'
|
21
24
|
type: :runtime
|
22
25
|
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -25,6 +28,9 @@ dependencies:
|
|
25
28
|
- - ">="
|
26
29
|
- !ruby/object:Gem::Version
|
27
30
|
version: '1.23'
|
31
|
+
- - "<"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '4'
|
28
34
|
- !ruby/object:Gem::Dependency
|
29
35
|
name: json
|
30
36
|
requirement: !ruby/object:Gem::Requirement
|
@@ -32,6 +38,9 @@ dependencies:
|
|
32
38
|
- - ">="
|
33
39
|
- !ruby/object:Gem::Version
|
34
40
|
version: '1.8'
|
41
|
+
- - "<"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '3'
|
35
44
|
type: :runtime
|
36
45
|
prerelease: false
|
37
46
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -39,6 +48,9 @@ dependencies:
|
|
39
48
|
- - ">="
|
40
49
|
- !ruby/object:Gem::Version
|
41
50
|
version: '1.8'
|
51
|
+
- - "<"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '3'
|
42
54
|
- !ruby/object:Gem::Dependency
|
43
55
|
name: bundler
|
44
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,46 +69,45 @@ dependencies:
|
|
57
69
|
name: rake
|
58
70
|
requirement: !ruby/object:Gem::Requirement
|
59
71
|
requirements:
|
60
|
-
- - "
|
72
|
+
- - "~>"
|
61
73
|
- !ruby/object:Gem::Version
|
62
|
-
version: '
|
74
|
+
version: '12.3'
|
63
75
|
type: :development
|
64
76
|
prerelease: false
|
65
77
|
version_requirements: !ruby/object:Gem::Requirement
|
66
78
|
requirements:
|
67
|
-
- - "
|
79
|
+
- - "~>"
|
68
80
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
81
|
+
version: '12.3'
|
70
82
|
- !ruby/object:Gem::Dependency
|
71
83
|
name: rspec
|
72
84
|
requirement: !ruby/object:Gem::Requirement
|
73
85
|
requirements:
|
74
86
|
- - "~>"
|
75
87
|
- !ruby/object:Gem::Version
|
76
|
-
version: '3.
|
88
|
+
version: '3.7'
|
77
89
|
type: :development
|
78
90
|
prerelease: false
|
79
91
|
version_requirements: !ruby/object:Gem::Requirement
|
80
92
|
requirements:
|
81
93
|
- - "~>"
|
82
94
|
- !ruby/object:Gem::Version
|
83
|
-
version: '3.
|
95
|
+
version: '3.7'
|
84
96
|
- !ruby/object:Gem::Dependency
|
85
97
|
name: simplecov
|
86
98
|
requirement: !ruby/object:Gem::Requirement
|
87
99
|
requirements:
|
88
|
-
- - "
|
100
|
+
- - "~>"
|
89
101
|
- !ruby/object:Gem::Version
|
90
|
-
version: '0'
|
102
|
+
version: '0.15'
|
91
103
|
type: :development
|
92
104
|
prerelease: false
|
93
105
|
version_requirements: !ruby/object:Gem::Requirement
|
94
106
|
requirements:
|
95
|
-
- - "
|
107
|
+
- - "~>"
|
96
108
|
- !ruby/object:Gem::Version
|
97
|
-
version: '0'
|
98
|
-
description: Read text and metadata from files and documents
|
99
|
-
.odt, .rtf, .pdf) using Apache Tika toolkit
|
109
|
+
version: '0.15'
|
110
|
+
description: Read text and metadata from files and documents using Apache Tika toolkit
|
100
111
|
email:
|
101
112
|
- erol.fornoles@gmail.com
|
102
113
|
- a.bromwich@gmail.com
|
@@ -114,6 +125,7 @@ files:
|
|
114
125
|
- Rakefile
|
115
126
|
- henkei.gemspec
|
116
127
|
- jar/tika-app-1.17.jar
|
128
|
+
- jar/tika-config.xml
|
117
129
|
- lib/henkei.rb
|
118
130
|
- lib/henkei/version.rb
|
119
131
|
- lib/henkei/yomu.rb
|