yomu 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/yomu.rb +93 -12
- data/lib/yomu/version.rb +1 -1
- data/spec/yomu_spec.rb +30 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b10bc01c33add15e33c6dc602f7a511c43ae5d2
|
4
|
+
data.tar.gz: e8a3e57a23dcea65214dee6bc66888ab6ef8edc2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bfcc975fb5fd48b23bfbfc9ea8c22e536e0539e1a2783babdcfb07675c9409be446eb8ed3ee005d34d14df11d83686a72b1a5ae7f310dffffaa09140f3ff2ec5
|
7
|
+
data.tar.gz: 903302d0a7dd19cca22a37cc8311131fb1b7282cef3a9e8be0271bbb5c9f47516431adac606b8e44fd09e67f858daa25445fc35919f11873ce684e81be4f5f61
|
data/lib/yomu.rb
CHANGED
@@ -4,9 +4,16 @@ require 'net/http'
|
|
4
4
|
require 'mime/types'
|
5
5
|
require 'json'
|
6
6
|
|
7
|
+
require 'socket'
|
8
|
+
require 'stringio'
|
9
|
+
|
7
10
|
class Yomu
|
8
11
|
GEMPATH = File.dirname(File.dirname(__FILE__))
|
9
12
|
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar')
|
13
|
+
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
14
|
+
|
15
|
+
@@server_port = nil
|
16
|
+
@@server_pid = nil
|
10
17
|
|
11
18
|
# Read text or metadata from a data buffer.
|
12
19
|
#
|
@@ -15,6 +22,21 @@ class Yomu
|
|
15
22
|
# metadata = Yomu.read :metadata, data
|
16
23
|
|
17
24
|
def self.read(type, data)
|
25
|
+
result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
|
26
|
+
|
27
|
+
case type
|
28
|
+
when :text
|
29
|
+
result
|
30
|
+
when :html
|
31
|
+
result
|
32
|
+
when :metadata
|
33
|
+
JSON.parse(result)
|
34
|
+
when :mimetype
|
35
|
+
MIME::Types[JSON.parse(result)['Content-Type']].first
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def self._client_read(type, data)
|
18
40
|
switch = case type
|
19
41
|
when :text
|
20
42
|
'-t'
|
@@ -25,23 +47,35 @@ class Yomu
|
|
25
47
|
when :mimetype
|
26
48
|
'-m -j'
|
27
49
|
end
|
28
|
-
|
29
|
-
|
50
|
+
|
51
|
+
IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
|
30
52
|
io.write data
|
31
53
|
io.close_write
|
32
54
|
io.read
|
33
55
|
end
|
56
|
+
end
|
34
57
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
58
|
+
|
59
|
+
def self._server_read(_, data)
|
60
|
+
s = TCPSocket.new('localhost', @@server_port)
|
61
|
+
file = StringIO.new(data, 'r')
|
62
|
+
|
63
|
+
while 1
|
64
|
+
chunk = file.read(65536)
|
65
|
+
break unless chunk
|
66
|
+
s.write(chunk)
|
44
67
|
end
|
68
|
+
|
69
|
+
# tell Tika that we're done sending data
|
70
|
+
s.shutdown(Socket::SHUT_WR)
|
71
|
+
|
72
|
+
resp = ''
|
73
|
+
while 1
|
74
|
+
chunk = s.recv(65536)
|
75
|
+
break if chunk.empty? || !chunk
|
76
|
+
resp << chunk
|
77
|
+
end
|
78
|
+
resp
|
45
79
|
end
|
46
80
|
|
47
81
|
# Create a new instance of Yomu with a given document.
|
@@ -137,7 +171,6 @@ class Yomu
|
|
137
171
|
end
|
138
172
|
end
|
139
173
|
|
140
|
-
|
141
174
|
def path?
|
142
175
|
defined? @path
|
143
176
|
end
|
@@ -180,6 +213,54 @@ class Yomu
|
|
180
213
|
@data
|
181
214
|
end
|
182
215
|
|
216
|
+
# Returns pid of Tika server, started as a new spawned process.
|
217
|
+
#
|
218
|
+
# type :html, :text, :metadata or :mimetype
|
219
|
+
# custom_port e.g. 9293
|
220
|
+
#
|
221
|
+
# Yomu.server(:text, 9294)
|
222
|
+
#
|
223
|
+
def self.server(type, custom_port=nil)
|
224
|
+
switch = case type
|
225
|
+
when :text
|
226
|
+
'-t'
|
227
|
+
when :html
|
228
|
+
'-h'
|
229
|
+
when :metadata
|
230
|
+
'-m -j'
|
231
|
+
when :mimetype
|
232
|
+
'-m -j'
|
233
|
+
end
|
234
|
+
|
235
|
+
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
236
|
+
|
237
|
+
@@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
|
238
|
+
sleep(2) # Give the server 2 seconds to spin up.
|
239
|
+
@@server_pid
|
240
|
+
end
|
241
|
+
|
242
|
+
# Kills server started by Yomu.server
|
243
|
+
#
|
244
|
+
# Always run this when you're done, or else Tika might run until you kill it manually
|
245
|
+
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
246
|
+
# putting this method in the ensure block.
|
247
|
+
#
|
248
|
+
# Yomu.server(:text)
|
249
|
+
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
250
|
+
# begin
|
251
|
+
# my_texts = reports.map{|report_path| Yomu.new(report_path).text }
|
252
|
+
# rescue
|
253
|
+
# ensure
|
254
|
+
# Yomu.kill_server!
|
255
|
+
# end
|
256
|
+
def self.kill_server!
|
257
|
+
if @@server_pid
|
258
|
+
Process.kill('INT', @@server_pid)
|
259
|
+
@@server_pid = nil
|
260
|
+
@@server_port = nil
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
183
264
|
def self.java
|
184
265
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
185
266
|
end
|
data/lib/yomu/version.rb
CHANGED
data/spec/yomu_spec.rb
CHANGED
@@ -149,4 +149,34 @@ describe Yomu do
|
|
149
149
|
expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
|
150
150
|
end
|
151
151
|
end
|
152
|
+
|
153
|
+
context 'working as server mode' do
|
154
|
+
specify '#starts and kills server' do
|
155
|
+
begin
|
156
|
+
Yomu.server(:text)
|
157
|
+
expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
|
158
|
+
expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil
|
159
|
+
|
160
|
+
s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
|
161
|
+
expect(s).to be_a TCPSocket
|
162
|
+
s.close
|
163
|
+
ensure
|
164
|
+
port = Yomu.class_variable_get(:@@server_port)
|
165
|
+
Yomu.kill_server!
|
166
|
+
sleep 2
|
167
|
+
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
specify '#runs samples through server mode' do
|
172
|
+
begin
|
173
|
+
Yomu.server(:text)
|
174
|
+
expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
|
175
|
+
expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
|
176
|
+
expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
|
177
|
+
ensure
|
178
|
+
Yomu.kill_server!
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
152
182
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mime-types
|
@@ -126,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
126
|
version: '0'
|
127
127
|
requirements: []
|
128
128
|
rubyforge_project:
|
129
|
-
rubygems_version: 2.2.
|
129
|
+
rubygems_version: 2.2.2
|
130
130
|
signing_key:
|
131
131
|
specification_version: 4
|
132
132
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|