henkei 1.14.3 → 1.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/henkei.rb +72 -58
- data/lib/henkei/version.rb +1 -1
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3ab53035e222c201453eaf44efc1186a9c9ae2f
|
4
|
+
data.tar.gz: 9e527cde254131285d90ccd69f7c1ef57e8eaa53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 757063d9530cf4d88988bb2494e4f3f198465ea27c5603c2aed9bd6cfbbeb008a89d766f79fcb522f769ce49c078f7d4fe34b1256b4cbe241ae5dbe4543f5010
|
7
|
+
data.tar.gz: 7dd520e273bd2808871af156f53b5b8a39476fb8f969fee52e6424c5782c50122b4a0f3644199fdc9213973331ffe4a265fbd36c6a3b0d97a4990f8c8e9da98f
|
data/lib/henkei.rb
CHANGED
@@ -22,9 +22,9 @@ class Henkei
|
|
22
22
|
# data = File.read 'sample.pages'
|
23
23
|
# text = Henkei.read :text, data
|
24
24
|
# metadata = Henkei.read :metadata, data
|
25
|
-
|
25
|
+
#
|
26
26
|
def self.read(type, data)
|
27
|
-
result = @@server_pid ? _server_read(type, data) : _client_read(type, data)
|
27
|
+
result = @@server_pid ? server_read(type, data) : client_read(type, data)
|
28
28
|
|
29
29
|
case type
|
30
30
|
when :text
|
@@ -38,49 +38,6 @@ class Henkei
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
-
def self._client_read(type, data)
|
42
|
-
switch =
|
43
|
-
case type
|
44
|
-
when :text
|
45
|
-
'-t'
|
46
|
-
when :html
|
47
|
-
'-h'
|
48
|
-
when :metadata
|
49
|
-
'-m -j'
|
50
|
-
when :mimetype
|
51
|
-
'-m -j'
|
52
|
-
end
|
53
|
-
|
54
|
-
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
55
|
-
io.write data
|
56
|
-
io.close_write
|
57
|
-
io.read
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
|
62
|
-
def self._server_read(_, data)
|
63
|
-
s = TCPSocket.new('localhost', @@server_port)
|
64
|
-
file = StringIO.new(data, 'r')
|
65
|
-
|
66
|
-
while 1
|
67
|
-
chunk = file.read(65536)
|
68
|
-
break unless chunk
|
69
|
-
s.write(chunk)
|
70
|
-
end
|
71
|
-
|
72
|
-
# tell Tika that we're done sending data
|
73
|
-
s.shutdown(Socket::SHUT_WR)
|
74
|
-
|
75
|
-
resp = ''
|
76
|
-
while 1
|
77
|
-
chunk = s.recv(65536)
|
78
|
-
break if chunk.empty? || !chunk
|
79
|
-
resp << chunk
|
80
|
-
end
|
81
|
-
resp
|
82
|
-
end
|
83
|
-
|
84
41
|
# Create a new instance of Henkei with a given document.
|
85
42
|
#
|
86
43
|
# Using a file path:
|
@@ -94,7 +51,7 @@ class Henkei
|
|
94
51
|
# From a stream or an object which responds to +read+
|
95
52
|
#
|
96
53
|
# Henkei.new File.open('sample.pages')
|
97
|
-
|
54
|
+
#
|
98
55
|
def initialize(input)
|
99
56
|
if input.is_a? String
|
100
57
|
if File.exists? input
|
@@ -115,7 +72,7 @@ class Henkei
|
|
115
72
|
#
|
116
73
|
# henkei = Henkei.new 'sample.pages'
|
117
74
|
# henkei.text
|
118
|
-
|
75
|
+
#
|
119
76
|
def text
|
120
77
|
return @text if defined? @text
|
121
78
|
|
@@ -126,7 +83,7 @@ class Henkei
|
|
126
83
|
#
|
127
84
|
# henkei = Henkei.new 'sample.pages'
|
128
85
|
# henkei.html
|
129
|
-
|
86
|
+
#
|
130
87
|
def html
|
131
88
|
return @html if defined? @html
|
132
89
|
|
@@ -137,7 +94,7 @@ class Henkei
|
|
137
94
|
#
|
138
95
|
# henkei = Henkei.new 'sample.pages'
|
139
96
|
# henkei.metadata['Content-Type']
|
140
|
-
|
97
|
+
#
|
141
98
|
def metadata
|
142
99
|
return @metadata if defined? @metadata
|
143
100
|
|
@@ -149,7 +106,7 @@ class Henkei
|
|
149
106
|
# henkei = Henkei.new 'sample.docx'
|
150
107
|
# henkei.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
151
108
|
# henkei.mimetype.extensions #=> ['docx']
|
152
|
-
|
109
|
+
#
|
153
110
|
def mimetype
|
154
111
|
return @mimetype if defined? @mimetype
|
155
112
|
|
@@ -162,8 +119,7 @@ class Henkei
|
|
162
119
|
#
|
163
120
|
# henkei = Henkei.new 'sample.pages'
|
164
121
|
# henkei.path? #=> true
|
165
|
-
|
166
|
-
|
122
|
+
#
|
167
123
|
def creation_date
|
168
124
|
return @creation_date if defined? @creation_date
|
169
125
|
|
@@ -174,17 +130,22 @@ class Henkei
|
|
174
130
|
end
|
175
131
|
end
|
176
132
|
|
133
|
+
# Returns +true+ if the Henkei document was specified using a file path.
|
134
|
+
#
|
135
|
+
# henkei = Henkei.new '/my/document/path/sample.docx'
|
136
|
+
# henkei.path? #=> true
|
137
|
+
#
|
177
138
|
def path?
|
178
|
-
|
139
|
+
!!@path
|
179
140
|
end
|
180
141
|
|
181
142
|
# Returns +true+ if the Henkei document was specified using a URI.
|
182
143
|
#
|
183
144
|
# henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
184
145
|
# henkei.uri? #=> true
|
185
|
-
|
146
|
+
#
|
186
147
|
def uri?
|
187
|
-
|
148
|
+
!!@uri
|
188
149
|
end
|
189
150
|
|
190
151
|
# Returns +true+ if the Henkei document was specified from a stream or an object which responds to +read+.
|
@@ -192,16 +153,16 @@ class Henkei
|
|
192
153
|
# file = File.open('sample.pages')
|
193
154
|
# henkei = Henkei.new file
|
194
155
|
# henkei.stream? #=> true
|
195
|
-
|
156
|
+
#
|
196
157
|
def stream?
|
197
|
-
|
158
|
+
!!@stream
|
198
159
|
end
|
199
160
|
|
200
161
|
# Returns the raw/unparsed content of the Henkei document.
|
201
162
|
#
|
202
163
|
# henkei = Henkei.new 'sample.pages'
|
203
164
|
# henkei.data
|
204
|
-
|
165
|
+
#
|
205
166
|
def data
|
206
167
|
return @data if defined? @data
|
207
168
|
|
@@ -257,6 +218,7 @@ class Henkei
|
|
257
218
|
# ensure
|
258
219
|
# Henkei.kill_server!
|
259
220
|
# end
|
221
|
+
#
|
260
222
|
def self.kill_server!
|
261
223
|
if @@server_pid
|
262
224
|
Process.kill('INT', @@server_pid)
|
@@ -265,8 +227,60 @@ class Henkei
|
|
265
227
|
end
|
266
228
|
end
|
267
229
|
|
230
|
+
### Private class methods
|
231
|
+
|
232
|
+
# Provide the path to the Java binary
|
233
|
+
#
|
268
234
|
def self.java
|
269
235
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
270
236
|
end
|
271
237
|
private_class_method :java
|
238
|
+
|
239
|
+
# Internal helper for calling to Tika library directly
|
240
|
+
#
|
241
|
+
def self.client_read(type, data)
|
242
|
+
switch =
|
243
|
+
case type
|
244
|
+
when :text
|
245
|
+
'-t'
|
246
|
+
when :html
|
247
|
+
'-h'
|
248
|
+
when :metadata
|
249
|
+
'-m -j'
|
250
|
+
when :mimetype
|
251
|
+
'-m -j'
|
252
|
+
end
|
253
|
+
|
254
|
+
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
255
|
+
io.write data
|
256
|
+
io.close_write
|
257
|
+
io.read
|
258
|
+
end
|
259
|
+
end
|
260
|
+
private_class_method :client_read
|
261
|
+
|
262
|
+
# Internal helper for calling to running Tika server
|
263
|
+
#
|
264
|
+
def self.server_read(_, data)
|
265
|
+
s = TCPSocket.new('localhost', @@server_port)
|
266
|
+
file = StringIO.new(data, 'r')
|
267
|
+
|
268
|
+
while 1
|
269
|
+
chunk = file.read(65536)
|
270
|
+
break unless chunk
|
271
|
+
s.write(chunk)
|
272
|
+
end
|
273
|
+
|
274
|
+
# tell Tika that we're done sending data
|
275
|
+
s.shutdown(Socket::SHUT_WR)
|
276
|
+
|
277
|
+
resp = ''
|
278
|
+
while 1
|
279
|
+
chunk = s.recv(65536)
|
280
|
+
break if chunk.empty? || !chunk
|
281
|
+
resp << chunk
|
282
|
+
end
|
283
|
+
resp
|
284
|
+
end
|
285
|
+
private_class_method :server_read
|
272
286
|
end
|
data/lib/henkei/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.3
|
4
|
+
version: 1.14.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,76 +9,76 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mime-types
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
18
|
+
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
20
|
version: '1.23'
|
21
21
|
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: '1.23'
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: json
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
|
-
- -
|
32
|
+
- - ">="
|
33
33
|
- !ruby/object:Gem::Version
|
34
34
|
version: '1.8'
|
35
35
|
type: :runtime
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
41
41
|
version: '1.8'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
43
|
name: bundler
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|
45
45
|
requirements:
|
46
|
-
- - ~>
|
46
|
+
- - "~>"
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '1.3'
|
49
49
|
type: :development
|
50
50
|
prerelease: false
|
51
51
|
version_requirements: !ruby/object:Gem::Requirement
|
52
52
|
requirements:
|
53
|
-
- - ~>
|
53
|
+
- - "~>"
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '1.3'
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
57
|
name: rake
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
|
-
- -
|
60
|
+
- - ">="
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: '0'
|
63
63
|
type: :development
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- -
|
67
|
+
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: rspec
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
|
-
- - ~>
|
74
|
+
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '3.5'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
|
-
- - ~>
|
81
|
+
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
83
|
version: '3.5'
|
84
84
|
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
@@ -90,9 +90,9 @@ executables: []
|
|
90
90
|
extensions: []
|
91
91
|
extra_rdoc_files: []
|
92
92
|
files:
|
93
|
-
- .gitignore
|
94
|
-
- .rspec
|
95
|
-
- .travis.yml
|
93
|
+
- ".gitignore"
|
94
|
+
- ".rspec"
|
95
|
+
- ".travis.yml"
|
96
96
|
- Gemfile
|
97
97
|
- LICENSE
|
98
98
|
- NOTICE.txt
|
@@ -119,17 +119,17 @@ require_paths:
|
|
119
119
|
- lib
|
120
120
|
required_ruby_version: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- -
|
122
|
+
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- -
|
127
|
+
- - ">="
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0'
|
130
130
|
requirements: []
|
131
131
|
rubyforge_project:
|
132
|
-
rubygems_version: 2.4.
|
132
|
+
rubygems_version: 2.4.8
|
133
133
|
signing_key:
|
134
134
|
specification_version: 4
|
135
135
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|