yomu 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 763e0175f0348e0128b7a24344c58ef9580991c2
4
- data.tar.gz: 3ac16148db3fbe1f3f464745156e42ec490228dd
3
+ metadata.gz: 2b10bc01c33add15e33c6dc602f7a511c43ae5d2
4
+ data.tar.gz: e8a3e57a23dcea65214dee6bc66888ab6ef8edc2
5
5
  SHA512:
6
- metadata.gz: c6cbd2370dfa5569e9d5efdc8214c24788d60a3f33981a07e603a45442e48781452004a76a0d5f75484dbb825320caa7a8f7314e7d8a19d807abb4178edb6849
7
- data.tar.gz: c425f26e52562d5690f5bd8a104dd0763101b191983a6b58f2f10f3e1c278fa146d668f4e2dbce5b144a6a76125684b4139cee48550c793bb2b9d3ba53d0f0fd
6
+ metadata.gz: bfcc975fb5fd48b23bfbfc9ea8c22e536e0539e1a2783babdcfb07675c9409be446eb8ed3ee005d34d14df11d83686a72b1a5ae7f310dffffaa09140f3ff2ec5
7
+ data.tar.gz: 903302d0a7dd19cca22a37cc8311131fb1b7282cef3a9e8be0271bbb5c9f47516431adac606b8e44fd09e67f858daa25445fc35919f11873ce684e81be4f5f61
data/lib/yomu.rb CHANGED
@@ -4,9 +4,16 @@ require 'net/http'
4
4
  require 'mime/types'
5
5
  require 'json'
6
6
 
7
+ require 'socket'
8
+ require 'stringio'
9
+
7
10
  class Yomu
8
11
  GEMPATH = File.dirname(File.dirname(__FILE__))
9
12
  JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar')
13
+ DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
14
+
15
+ @@server_port = nil
16
+ @@server_pid = nil
10
17
 
11
18
  # Read text or metadata from a data buffer.
12
19
  #
@@ -15,6 +22,21 @@ class Yomu
15
22
  # metadata = Yomu.read :metadata, data
16
23
 
17
24
  def self.read(type, data)
25
+ result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
26
+
27
+ case type
28
+ when :text
29
+ result
30
+ when :html
31
+ result
32
+ when :metadata
33
+ JSON.parse(result)
34
+ when :mimetype
35
+ MIME::Types[JSON.parse(result)['Content-Type']].first
36
+ end
37
+ end
38
+
39
+ def self._client_read(type, data)
18
40
  switch = case type
19
41
  when :text
20
42
  '-t'
@@ -25,23 +47,35 @@ class Yomu
25
47
  when :mimetype
26
48
  '-m -j'
27
49
  end
28
-
29
- result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
50
+
51
+ IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
30
52
  io.write data
31
53
  io.close_write
32
54
  io.read
33
55
  end
56
+ end
34
57
 
35
- case type
36
- when :text
37
- result
38
- when :html
39
- result
40
- when :metadata
41
- JSON.parse(result)
42
- when :mimetype
43
- MIME::Types[JSON.parse(result)['Content-Type']].first
58
+
59
+ def self._server_read(_, data)
60
+ s = TCPSocket.new('localhost', @@server_port)
61
+ file = StringIO.new(data, 'r')
62
+
63
+ while 1
64
+ chunk = file.read(65536)
65
+ break unless chunk
66
+ s.write(chunk)
44
67
  end
68
+
69
+ # tell Tika that we're done sending data
70
+ s.shutdown(Socket::SHUT_WR)
71
+
72
+ resp = ''
73
+ while 1
74
+ chunk = s.recv(65536)
75
+ break if chunk.empty? || !chunk
76
+ resp << chunk
77
+ end
78
+ resp
45
79
  end
46
80
 
47
81
  # Create a new instance of Yomu with a given document.
@@ -137,7 +171,6 @@ class Yomu
137
171
  end
138
172
  end
139
173
 
140
-
141
174
  def path?
142
175
  defined? @path
143
176
  end
@@ -180,6 +213,54 @@ class Yomu
180
213
  @data
181
214
  end
182
215
 
216
+ # Returns pid of Tika server, started as a new spawned process.
217
+ #
218
+ # type :html, :text or :metadata
219
+ # custom_port e.g. 9293
220
+ #
221
+ # Yomu.server(:text, 9294)
222
+ #
223
+ def self.server(type, custom_port=nil)
224
+ switch = case type
225
+ when :text
226
+ '-t'
227
+ when :html
228
+ '-h'
229
+ when :metadata
230
+ '-m -j'
231
+ when :mimetype
232
+ '-m -j'
233
+ end
234
+
235
+ @@server_port = custom_port || DEFAULT_SERVER_PORT
236
+
237
+ @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
238
+ sleep(2) # Give the server 2 seconds to spin up.
239
+ @@server_pid
240
+ end
241
+
242
+ # Kills server started by Yomu.server
243
+ #
244
+ # Always run this when you're done, or else Tika might run until you kill it manually
245
+ # You might try putting your extraction in a begin..rescue...ensure...end block and
246
+ # putting this method in the ensure block.
247
+ #
248
+ # Yomu.server(:text)
249
+ # reports = ["report1.docx", "report2.doc", "report3.pdf"]
250
+ # begin
251
+ # my_texts = reports.map{|report_path| Yomu.new(report_path).text }
252
+ # rescue
253
+ # ensure
254
+ # Yomu.kill_server!
255
+ # end
256
+ def self.kill_server!
257
+ if @@server_pid
258
+ Process.kill('INT', @@server_pid)
259
+ @@server_pid = nil
260
+ @@server_port = nil
261
+ end
262
+ end
263
+
183
264
  def self.java
184
265
  ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
185
266
  end
data/lib/yomu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Yomu
2
- VERSION = '0.2.1'
2
+ VERSION = '0.2.2'
3
3
  end
data/spec/yomu_spec.rb CHANGED
@@ -149,4 +149,34 @@ describe Yomu do
149
149
  expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
150
150
  end
151
151
  end
152
+
153
+ context 'working as server mode' do
154
+ specify '#starts and kills server' do
155
+ begin
156
+ Yomu.server(:text)
157
+ expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
158
+ expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil
159
+
160
+ s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
161
+ expect(s).to be_a TCPSocket
162
+ s.close
163
+ ensure
164
+ port = Yomu.class_variable_get(:@@server_port)
165
+ Yomu.kill_server!
166
+ sleep 2
167
+ expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
168
+ end
169
+ end
170
+
171
+ specify '#runs samples through server mode' do
172
+ begin
173
+ Yomu.server(:text)
174
+ expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
175
+ expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
176
+ expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
177
+ ensure
178
+ Yomu.kill_server!
179
+ end
180
+ end
181
+ end
152
182
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yomu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-19 00:00:00.000000000 Z
11
+ date: 2014-12-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mime-types
@@ -126,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
126
126
  version: '0'
127
127
  requirements: []
128
128
  rubyforge_project:
129
- rubygems_version: 2.2.0
129
+ rubygems_version: 2.2.2
130
130
  signing_key:
131
131
  specification_version: 4
132
132
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,