yomu 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -4
- data/jar/{tika-app-1.5.jar → tika-app-1.11.jar} +0 -0
- data/lib/yomu/version.rb +1 -1
- data/lib/yomu.rb +94 -13
- data/spec/yomu_spec.rb +30 -0
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa980254ce68ce8915308c43f3df4cc9f83e528a
|
4
|
+
data.tar.gz: ff59cad4c46193986f83cf29aecadba165b3f8dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10cd091d61c360f3a9d10a6b94402c362dae43e1c06de1a3b54711d5738b3c89e2b1625f580b783f7eb5960c95bb12fa5cc73dbdff894802f1b63dfc15bcdf25
|
7
|
+
data.tar.gz: 329d48756a77ac689f2247d93b656fd8b772ac1ef617be95a81631d4704030bb17b4765d6df128218efea4fec8fd938470045d9fc0208aa455910977c157331a
|
data/README.md
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
[Travis CI build status](https://travis-ci.org/Erol/yomu)
|
2
|
+
[Code Climate](https://codeclimate.com/github/Erol/yomu)
|
3
|
+
[](#)
|
3
4
|
|
4
5
|
# Yomu 読む
|
5
6
|
|
7
|
+
[Gitter chat](https://gitter.im/Erol/yomu?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
|
8
|
+
|
6
9
|
[Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache Tika](http://tika.apache.org/) content analysis toolkit.
|
7
10
|
|
8
11
|
Here are some of the formats supported:
|
@@ -80,6 +83,12 @@ yomu.mimetype.extensions #=> ['docx']
|
|
80
83
|
|
81
84
|
## Installation and Dependencies
|
82
85
|
|
86
|
+
### Java Runtime
|
87
|
+
|
88
|
+
Yomu packages the Apache Tika application jar and requires a working JRE for it to work.
|
89
|
+
|
90
|
+
### Gem
|
91
|
+
|
83
92
|
Add this line to your application's Gemfile:
|
84
93
|
|
85
94
|
gem 'yomu'
|
@@ -92,8 +101,6 @@ Or install it yourself as:
|
|
92
101
|
|
93
102
|
$ gem install yomu
|
94
103
|
|
95
|
-
**Yomu packages the Apache Tika application jar and requires a working JRE for it to work.**
|
96
|
-
|
97
104
|
## Contributing
|
98
105
|
|
99
106
|
1. Fork it
|
Binary file
|
data/lib/yomu/version.rb
CHANGED
data/lib/yomu.rb
CHANGED
@@ -4,9 +4,16 @@ require 'net/http'
|
|
4
4
|
require 'mime/types'
|
5
5
|
require 'json'
|
6
6
|
|
7
|
+
require 'socket'
|
8
|
+
require 'stringio'
|
9
|
+
|
7
10
|
class Yomu
|
8
11
|
GEMPATH = File.dirname(File.dirname(__FILE__))
|
9
|
-
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.5.jar')
|
12
|
+
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.11.jar')
|
13
|
+
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
14
|
+
|
15
|
+
@@server_port = nil
|
16
|
+
@@server_pid = nil
|
10
17
|
|
11
18
|
# Read text or metadata from a data buffer.
|
12
19
|
#
|
@@ -15,6 +22,21 @@ class Yomu
|
|
15
22
|
# metadata = Yomu.read :metadata, data
|
16
23
|
|
17
24
|
def self.read(type, data)
|
25
|
+
result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data)
|
26
|
+
|
27
|
+
case type
|
28
|
+
when :text
|
29
|
+
result
|
30
|
+
when :html
|
31
|
+
result
|
32
|
+
when :metadata
|
33
|
+
JSON.parse(result)
|
34
|
+
when :mimetype
|
35
|
+
MIME::Types[JSON.parse(result)['Content-Type']].first
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def self._client_read(type, data)
|
18
40
|
switch = case type
|
19
41
|
when :text
|
20
42
|
'-t'
|
@@ -25,23 +47,35 @@ class Yomu
|
|
25
47
|
when :mimetype
|
26
48
|
'-m -j'
|
27
49
|
end
|
28
|
-
|
29
|
-
|
50
|
+
|
51
|
+
IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
|
30
52
|
io.write data
|
31
53
|
io.close_write
|
32
54
|
io.read
|
33
55
|
end
|
56
|
+
end
|
34
57
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
58
|
+
|
59
|
+
def self._server_read(_, data)
|
60
|
+
s = TCPSocket.new('localhost', @@server_port)
|
61
|
+
file = StringIO.new(data, 'r')
|
62
|
+
|
63
|
+
while 1
|
64
|
+
chunk = file.read(65536)
|
65
|
+
break unless chunk
|
66
|
+
s.write(chunk)
|
44
67
|
end
|
68
|
+
|
69
|
+
# tell Tika that we're done sending data
|
70
|
+
s.shutdown(Socket::SHUT_WR)
|
71
|
+
|
72
|
+
resp = ''
|
73
|
+
while 1
|
74
|
+
chunk = s.recv(65536)
|
75
|
+
break if chunk.empty? || !chunk
|
76
|
+
resp << chunk
|
77
|
+
end
|
78
|
+
resp
|
45
79
|
end
|
46
80
|
|
47
81
|
# Create a new instance of Yomu with a given document.
|
@@ -137,7 +171,6 @@ class Yomu
|
|
137
171
|
end
|
138
172
|
end
|
139
173
|
|
140
|
-
|
141
174
|
def path?
|
142
175
|
defined? @path
|
143
176
|
end
|
@@ -180,6 +213,54 @@ class Yomu
|
|
180
213
|
@data
|
181
214
|
end
|
182
215
|
|
216
|
+
# Returns pid of Tika server, started as a new spawned process.
|
217
|
+
#
|
218
|
+
# type :html, :text or :metadata
|
219
|
+
# custom_port e.g. 9293
|
220
|
+
#
|
221
|
+
# Yomu.server(:text, 9294)
|
222
|
+
#
|
223
|
+
def self.server(type, custom_port=nil)
|
224
|
+
switch = case type
|
225
|
+
when :text
|
226
|
+
'-t'
|
227
|
+
when :html
|
228
|
+
'-h'
|
229
|
+
when :metadata
|
230
|
+
'-m -j'
|
231
|
+
when :mimetype
|
232
|
+
'-m -j'
|
233
|
+
end
|
234
|
+
|
235
|
+
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
236
|
+
|
237
|
+
@@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
|
238
|
+
sleep(2) # Give the server 2 seconds to spin up.
|
239
|
+
@@server_pid
|
240
|
+
end
|
241
|
+
|
242
|
+
# Kills server started by Yomu.server
|
243
|
+
#
|
244
|
+
# Always run this when you're done, or else Tika might run until you kill it manually
|
245
|
+
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
246
|
+
# putting this method in the ensure block.
|
247
|
+
#
|
248
|
+
# Yomu.server(:text)
|
249
|
+
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
250
|
+
# begin
|
251
|
+
# my_texts = reports.map{|report_path| Yomu.new(report_path).text }
|
252
|
+
# rescue
|
253
|
+
# ensure
|
254
|
+
# Yomu.kill_server!
|
255
|
+
# end
|
256
|
+
def self.kill_server!
|
257
|
+
if @@server_pid
|
258
|
+
Process.kill('INT', @@server_pid)
|
259
|
+
@@server_pid = nil
|
260
|
+
@@server_port = nil
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
183
264
|
def self.java
|
184
265
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
185
266
|
end
|
data/spec/yomu_spec.rb
CHANGED
@@ -149,4 +149,34 @@ describe Yomu do
|
|
149
149
|
expect( yomu.metadata['Content-Type'] ).to eql ["application/vnd.apple.pages", "application/vnd.apple.pages"]
|
150
150
|
end
|
151
151
|
end
|
152
|
+
|
153
|
+
context 'working as server mode' do
|
154
|
+
specify '#starts and kills server' do
|
155
|
+
begin
|
156
|
+
Yomu.server(:text)
|
157
|
+
expect(Yomu.class_variable_get(:@@server_pid)).not_to be_nil
|
158
|
+
expect(Yomu.class_variable_get(:@@server_port)).not_to be_nil
|
159
|
+
|
160
|
+
s = TCPSocket.new('localhost', Yomu.class_variable_get(:@@server_port))
|
161
|
+
expect(s).to be_a TCPSocket
|
162
|
+
s.close
|
163
|
+
ensure
|
164
|
+
port = Yomu.class_variable_get(:@@server_port)
|
165
|
+
Yomu.kill_server!
|
166
|
+
sleep 2
|
167
|
+
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
specify '#runs samples through server mode' do
|
172
|
+
begin
|
173
|
+
Yomu.server(:text)
|
174
|
+
expect(Yomu.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
|
175
|
+
expect(Yomu.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
|
176
|
+
expect(Yomu.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
|
177
|
+
ensure
|
178
|
+
Yomu.kill_server!
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
152
182
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-12-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mime-types
|
@@ -96,7 +96,7 @@ files:
|
|
96
96
|
- NOTICE.txt
|
97
97
|
- README.md
|
98
98
|
- Rakefile
|
99
|
-
- jar/tika-app-1.5.jar
|
99
|
+
- jar/tika-app-1.11.jar
|
100
100
|
- lib/yomu.rb
|
101
101
|
- lib/yomu/version.rb
|
102
102
|
- spec/helper.rb
|
@@ -126,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
126
|
version: '0'
|
127
127
|
requirements: []
|
128
128
|
rubyforge_project:
|
129
|
-
rubygems_version: 2.
|
129
|
+
rubygems_version: 2.4.6
|
130
130
|
signing_key:
|
131
131
|
specification_version: 4
|
132
132
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|