yomu 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ef1ce415813b44ff2a36e29a49b59768dd72f7f3
4
- data.tar.gz: 17c69e97c25e156d101e6cef2c5b3f1fb149190d
3
+ metadata.gz: aa980254ce68ce8915308c43f3df4cc9f83e528a
4
+ data.tar.gz: ff59cad4c46193986f83cf29aecadba165b3f8dd
5
5
  SHA512:
6
- metadata.gz: 3ad3d4a04d785d0a31028f91d196d797d98e93f9289693d53987f9f72086c6e1c2a86057bc54055e2eb5db87324d370229cb92e8d2bc1ccccf8ecc09a8764f44
7
- data.tar.gz: f1e70048c8c42b9750e14bb0b85250114e0674599663da6418cdc4d893947220247be46e16e002f909170b8f86526bd96c4610a515f1ee0c14728e2a77705a55
6
+ metadata.gz: 10cd091d61c360f3a9d10a6b94402c362dae43e1c06de1a3b54711d5738b3c89e2b1625f580b783f7eb5960c95bb12fa5cc73dbdff894802f1b63dfc15bcdf25
7
+ data.tar.gz: 329d48756a77ac689f2247d93b656fd8b772ac1ef617be95a81631d4704030bb17b4765d6df128218efea4fec8fd938470045d9fc0208aa455910977c157331a
data/README.md CHANGED
@@ -1,8 +1,11 @@
1
- [![Build Status](https://travis-ci.org/Erol/yomu.png?branch=master)](https://travis-ci.org/Erol/yomu)
2
- [![Code Climate](https://codeclimate.com/github/Erol/yomu.png)](https://codeclimate.com/github/Erol/yomu)
1
+ [![Travis Build Status](http://img.shields.io/travis/Erol/yomu.svg?style=flat)](https://travis-ci.org/Erol/yomu)
2
+ [![Code Climate Score](http://img.shields.io/codeclimate/github/Erol/yomu.svg?style=flat)](https://codeclimate.com/github/Erol/yomu)
3
+ [![Gem Version](http://img.shields.io/gem/v/yomu.svg?style=flat)](#)
3
4
 
4
5
  # Yomu 読む
5
6
 
7
+ [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/Erol/yomu?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
8
+
6
9
  [Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache Tika](http://tika.apache.org/) content analysis toolkit.
7
10
 
8
11
  Here are some of the formats supported:
@@ -80,6 +83,12 @@ yomu.mimetype.extensions #=> ['docx']
80
83
 
81
84
  ## Installation and Dependencies
82
85
 
86
+ ### Java Runtime
87
+
88
+ Yomu packages the Apache Tika application jar and requires a working Java Runtime Environment (JRE) to run it.
89
+
90
+ ### Gem
91
+
83
92
  Add this line to your application's Gemfile:
84
93
 
85
94
  gem 'yomu'
@@ -92,8 +101,6 @@ Or install it yourself as:
92
101
 
93
102
  $ gem install yomu
94
103
 
95
- **Yomu packages the Apache Tika application jar and requires a working JRE for it to work.**
96
-
97
104
  ## Contributing
98
105
 
99
106
  1. Fork it
data/lib/yomu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Yomu
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.4'
3
3
  end
data/lib/yomu.rb CHANGED
@@ -4,9 +4,16 @@ require 'net/http'
4
4
  require 'mime/types'
5
5
  require 'json'
6
6
 
7
+ require 'socket'
8
+ require 'stringio'
9
+
7
10
  class Yomu
8
11
  GEMPATH = File.dirname(File.dirname(__FILE__))
9
- JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.5.jar')
12
+ JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.11.jar')
13
+ DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
14
+
15
+ @@server_port = nil
16
+ @@server_pid = nil
10
17
 
11
18
  # Read text or metadata from a data buffer.
12
19
  #
@@ -15,6 +22,21 @@ class Yomu
15
22
  # metadata = Yomu.read :metadata, data
16
23
 
17
24
  def self.read(type, data)
25
+ result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
26
+
27
+ case type
28
+ when :text
29
+ result
30
+ when :html
31
+ result
32
+ when :metadata
33
+ JSON.parse(result)
34
+ when :mimetype
35
+ MIME::Types[JSON.parse(result)['Content-Type']].first
36
+ end
37
+ end
38
+
39
+ def self._client_read(type, data)
18
40
  switch = case type
19
41
  when :text
20
42
  '-t'
@@ -25,23 +47,35 @@ class Yomu
25
47
  when :mimetype
26
48
  '-m -j'
27
49
  end
28
-
29
- result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
50
+
51
+ IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
30
52
  io.write data
31
53
  io.close_write
32
54
  io.read
33
55
  end
56
+ end
34
57
 
35
- case type
36
- when :text
37
- result
38
- when :html
39
- result
40
- when :metadata
41
- JSON.parse(result)
42
- when :mimetype
43
- MIME::Types[JSON.parse(result)['Content-Type']].first
58
+
59
+ def self._server_read(_, data)
60
+ s = TCPSocket.new('localhost', @@server_port)
61
+ file = StringIO.new(data, 'r')
62
+
63
+ while 1
64
+ chunk = file.read(65536)
65
+ break unless chunk
66
+ s.write(chunk)
44
67
  end
68
+
69
+ # Shut down the write side of the socket to tell Tika that we're done sending data
70
+ s.shutdown(Socket::SHUT_WR)
71
+
72
+ resp = ''
73
+ while 1
74
+ chunk = s.recv(65536)
75
+ break if chunk.empty? || !chunk
76
+ resp << chunk
77
+ end
78
+ resp
45
79
  end
46
80
 
47
81
  # Create a new instance of Yomu with a given document.
@@ -137,7 +171,6 @@ class Yomu
137
171
  end
138
172
  end
139
173
 
140
-
141
174
  def path?
142
175
  defined? @path
143
176
  end
@@ -180,6 +213,54 @@ class Yomu
180
213
  @data
181
214
  end
182
215
 
216
+ # Returns pid of Tika server, started as a new spawned process.
217
+ #
218
+ # type :html, :text, :metadata or :mimetype
219
+ # custom_port e.g. 9293
220
+ #
221
+ # Yomu.server(:text, 9294)
222
+ #
223
+ def self.server(type, custom_port=nil)
224
+ switch = case type
225
+ when :text
226
+ '-t'
227
+ when :html
228
+ '-h'
229
+ when :metadata
230
+ '-m -j'
231
+ when :mimetype
232
+ '-m -j'
233
+ end
234
+
235
+ @@server_port = custom_port || DEFAULT_SERVER_PORT
236
+
237
+ @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
238
+ sleep(2) # Give the server 2 seconds to spin up.
239
+ @@server_pid
240
+ end
241
+
242
+ # Kills server started by Yomu.server
243
+ #
244
+ # Always run this when you're done, or else Tika might run until you kill it manually.
245
+ # You might try putting your extraction in a begin..rescue...ensure...end block and
246
+ # putting this method in the ensure block.
247
+ #
248
+ # Yomu.server(:text)
249
+ # reports = ["report1.docx", "report2.doc", "report3.pdf"]
250
+ # begin
251
+ # my_texts = reports.map{|report_path| Yomu.new(report_path).text }
252
+ # rescue
253
+ # ensure
254
+ # Yomu.kill_server!
255
+ # end
256
+ def self.kill_server!
257
+ if @@server_pid
258
+ Process.kill('INT', @@server_pid)
259
+ @@server_pid = nil
260
+ @@server_port = nil
261
+ end
262
+ end
263
+
183
264
  def self.java
184
265
  ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
185
266
  end
data/spec/yomu_spec.rb CHANGED
@@ -149,4 +149,34 @@ describe Yomu do
149
149
  expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
150
150
  end
151
151
  end
152
+
153
+ context 'working as server mode' do
154
+ specify '#starts and kills server' do
155
+ begin
156
+ Yomu.server(:text)
157
+ expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
158
+ expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil
159
+
160
+ s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
161
+ expect(s).to be_a TCPSocket
162
+ s.close
163
+ ensure
164
+ port = Yomu.class_variable_get(:@@server_port)
165
+ Yomu.kill_server!
166
+ sleep 2
167
+ expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
168
+ end
169
+ end
170
+
171
+ specify '#runs samples through server mode' do
172
+ begin
173
+ Yomu.server(:text)
174
+ expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
175
+ expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
176
+ expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
177
+ ensure
178
+ Yomu.kill_server!
179
+ end
180
+ end
181
+ end
152
182
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yomu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-30 00:00:00.000000000 Z
11
+ date: 2015-12-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mime-types
@@ -96,7 +96,7 @@ files:
96
96
  - NOTICE.txt
97
97
  - README.md
98
98
  - Rakefile
99
- - jar/tika-app-1.5.jar
99
+ - jar/tika-app-1.11.jar
100
100
  - lib/yomu.rb
101
101
  - lib/yomu/version.rb
102
102
  - spec/helper.rb
@@ -126,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
126
126
  version: '0'
127
127
  requirements: []
128
128
  rubyforge_project:
129
- rubygems_version: 2.2.2
129
+ rubygems_version: 2.4.6
130
130
  signing_key:
131
131
  specification_version: 4
132
132
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,