henkei 1.14.3 → 1.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/henkei.rb +72 -58
- data/lib/henkei/version.rb +1 -1
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3ab53035e222c201453eaf44efc1186a9c9ae2f
|
4
|
+
data.tar.gz: 9e527cde254131285d90ccd69f7c1ef57e8eaa53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 757063d9530cf4d88988bb2494e4f3f198465ea27c5603c2aed9bd6cfbbeb008a89d766f79fcb522f769ce49c078f7d4fe34b1256b4cbe241ae5dbe4543f5010
|
7
|
+
data.tar.gz: 7dd520e273bd2808871af156f53b5b8a39476fb8f969fee52e6424c5782c50122b4a0f3644199fdc9213973331ffe4a265fbd36c6a3b0d97a4990f8c8e9da98f
|
data/lib/henkei.rb
CHANGED
@@ -22,9 +22,9 @@ class Henkei
|
|
22
22
|
# data = File.read 'sample.pages'
|
23
23
|
# text = Henkei.read :text, data
|
24
24
|
# metadata = Henkei.read :metadata, data
|
25
|
-
|
25
|
+
#
|
26
26
|
def self.read(type, data)
|
27
|
-
result = @@server_pid ?
|
27
|
+
result = @@server_pid ? server_read(type, data) : client_read(type, data)
|
28
28
|
|
29
29
|
case type
|
30
30
|
when :text
|
@@ -38,49 +38,6 @@ class Henkei
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
-
def self._client_read(type, data)
|
42
|
-
switch =
|
43
|
-
case type
|
44
|
-
when :text
|
45
|
-
'-t'
|
46
|
-
when :html
|
47
|
-
'-h'
|
48
|
-
when :metadata
|
49
|
-
'-m -j'
|
50
|
-
when :mimetype
|
51
|
-
'-m -j'
|
52
|
-
end
|
53
|
-
|
54
|
-
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
55
|
-
io.write data
|
56
|
-
io.close_write
|
57
|
-
io.read
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
|
62
|
-
def self._server_read(_, data)
|
63
|
-
s = TCPSocket.new('localhost', @@server_port)
|
64
|
-
file = StringIO.new(data, 'r')
|
65
|
-
|
66
|
-
while 1
|
67
|
-
chunk = file.read(65536)
|
68
|
-
break unless chunk
|
69
|
-
s.write(chunk)
|
70
|
-
end
|
71
|
-
|
72
|
-
# tell Tika that we're done sending data
|
73
|
-
s.shutdown(Socket::SHUT_WR)
|
74
|
-
|
75
|
-
resp = ''
|
76
|
-
while 1
|
77
|
-
chunk = s.recv(65536)
|
78
|
-
break if chunk.empty? || !chunk
|
79
|
-
resp << chunk
|
80
|
-
end
|
81
|
-
resp
|
82
|
-
end
|
83
|
-
|
84
41
|
# Create a new instance of Henkei with a given document.
|
85
42
|
#
|
86
43
|
# Using a file path:
|
@@ -94,7 +51,7 @@ class Henkei
|
|
94
51
|
# From a stream or an object which responds to +read+
|
95
52
|
#
|
96
53
|
# Henkei.new File.open('sample.pages')
|
97
|
-
|
54
|
+
#
|
98
55
|
def initialize(input)
|
99
56
|
if input.is_a? String
|
100
57
|
if File.exists? input
|
@@ -115,7 +72,7 @@ class Henkei
|
|
115
72
|
#
|
116
73
|
# henkei = Henkei.new 'sample.pages'
|
117
74
|
# henkei.text
|
118
|
-
|
75
|
+
#
|
119
76
|
def text
|
120
77
|
return @text if defined? @text
|
121
78
|
|
@@ -126,7 +83,7 @@ class Henkei
|
|
126
83
|
#
|
127
84
|
# henkei = Henkei.new 'sample.pages'
|
128
85
|
# henkei.html
|
129
|
-
|
86
|
+
#
|
130
87
|
def html
|
131
88
|
return @html if defined? @html
|
132
89
|
|
@@ -137,7 +94,7 @@ class Henkei
|
|
137
94
|
#
|
138
95
|
# henkei = Henkei.new 'sample.pages'
|
139
96
|
# henkei.metadata['Content-Type']
|
140
|
-
|
97
|
+
#
|
141
98
|
def metadata
|
142
99
|
return @metadata if defined? @metadata
|
143
100
|
|
@@ -149,7 +106,7 @@ class Henkei
|
|
149
106
|
# henkei = Henkei.new 'sample.docx'
|
150
107
|
# henkei.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
151
108
|
# henkei.mimetype.extensions #=> ['docx']
|
152
|
-
|
109
|
+
#
|
153
110
|
def mimetype
|
154
111
|
return @mimetype if defined? @mimetype
|
155
112
|
|
@@ -162,8 +119,7 @@ class Henkei
|
|
162
119
|
#
|
163
120
|
# henkei = Henkei.new 'sample.pages'
|
164
121
|
# henkei.path? #=> true
|
165
|
-
|
166
|
-
|
122
|
+
#
|
167
123
|
def creation_date
|
168
124
|
return @creation_date if defined? @creation_date
|
169
125
|
|
@@ -174,17 +130,22 @@ class Henkei
|
|
174
130
|
end
|
175
131
|
end
|
176
132
|
|
133
|
+
# Returns +true+ if the Henkei document was specified using a file path.
|
134
|
+
#
|
135
|
+
# henkei = Henkei.new '/my/document/path/sample.docx'
|
136
|
+
# henkei.path? #=> true
|
137
|
+
#
|
177
138
|
def path?
|
178
|
-
|
139
|
+
!!@path
|
179
140
|
end
|
180
141
|
|
181
142
|
# Returns +true+ if the Henkei document was specified using a URI.
|
182
143
|
#
|
183
144
|
# henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
184
145
|
# henkei.uri? #=> true
|
185
|
-
|
146
|
+
#
|
186
147
|
def uri?
|
187
|
-
|
148
|
+
!!@uri
|
188
149
|
end
|
189
150
|
|
190
151
|
# Returns +true+ if the Henkei document was specified from a stream or an object which responds to +read+.
|
@@ -192,16 +153,16 @@ class Henkei
|
|
192
153
|
# file = File.open('sample.pages')
|
193
154
|
# henkei = Henkei.new file
|
194
155
|
# henkei.stream? #=> true
|
195
|
-
|
156
|
+
#
|
196
157
|
def stream?
|
197
|
-
|
158
|
+
!!@stream
|
198
159
|
end
|
199
160
|
|
200
161
|
# Returns the raw/unparsed content of the Henkei document.
|
201
162
|
#
|
202
163
|
# henkei = Henkei.new 'sample.pages'
|
203
164
|
# henkei.data
|
204
|
-
|
165
|
+
#
|
205
166
|
def data
|
206
167
|
return @data if defined? @data
|
207
168
|
|
@@ -257,6 +218,7 @@ class Henkei
|
|
257
218
|
# ensure
|
258
219
|
# Henkei.kill_server!
|
259
220
|
# end
|
221
|
+
#
|
260
222
|
def self.kill_server!
|
261
223
|
if @@server_pid
|
262
224
|
Process.kill('INT', @@server_pid)
|
@@ -265,8 +227,60 @@ class Henkei
|
|
265
227
|
end
|
266
228
|
end
|
267
229
|
|
230
|
+
### Private class methods
|
231
|
+
|
232
|
+
# Provide the path to the Java binary
|
233
|
+
#
|
268
234
|
def self.java
|
269
235
|
ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
|
270
236
|
end
|
271
237
|
private_class_method :java
|
238
|
+
|
239
|
+
# Internal helper for calling to Tika library directly
|
240
|
+
#
|
241
|
+
def self.client_read(type, data)
|
242
|
+
switch =
|
243
|
+
case type
|
244
|
+
when :text
|
245
|
+
'-t'
|
246
|
+
when :html
|
247
|
+
'-h'
|
248
|
+
when :metadata
|
249
|
+
'-m -j'
|
250
|
+
when :mimetype
|
251
|
+
'-m -j'
|
252
|
+
end
|
253
|
+
|
254
|
+
IO.popen "#{java} -Djava.awt.headless=true -jar #{Henkei::JARPATH} #{switch}", 'r+' do |io|
|
255
|
+
io.write data
|
256
|
+
io.close_write
|
257
|
+
io.read
|
258
|
+
end
|
259
|
+
end
|
260
|
+
private_class_method :client_read
|
261
|
+
|
262
|
+
# Internal helper for calling to running Tika server
|
263
|
+
#
|
264
|
+
def self.server_read(_, data)
|
265
|
+
s = TCPSocket.new('localhost', @@server_port)
|
266
|
+
file = StringIO.new(data, 'r')
|
267
|
+
|
268
|
+
while 1
|
269
|
+
chunk = file.read(65536)
|
270
|
+
break unless chunk
|
271
|
+
s.write(chunk)
|
272
|
+
end
|
273
|
+
|
274
|
+
# tell Tika that we're done sending data
|
275
|
+
s.shutdown(Socket::SHUT_WR)
|
276
|
+
|
277
|
+
resp = ''
|
278
|
+
while 1
|
279
|
+
chunk = s.recv(65536)
|
280
|
+
break if chunk.empty? || !chunk
|
281
|
+
resp << chunk
|
282
|
+
end
|
283
|
+
resp
|
284
|
+
end
|
285
|
+
private_class_method :server_read
|
272
286
|
end
|
data/lib/henkei/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.3
|
4
|
+
version: 1.14.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,76 +9,76 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mime-types
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
18
|
+
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
20
|
version: '1.23'
|
21
21
|
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: '1.23'
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: json
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
|
-
- -
|
32
|
+
- - ">="
|
33
33
|
- !ruby/object:Gem::Version
|
34
34
|
version: '1.8'
|
35
35
|
type: :runtime
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
41
41
|
version: '1.8'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
43
|
name: bundler
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|
45
45
|
requirements:
|
46
|
-
- - ~>
|
46
|
+
- - "~>"
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '1.3'
|
49
49
|
type: :development
|
50
50
|
prerelease: false
|
51
51
|
version_requirements: !ruby/object:Gem::Requirement
|
52
52
|
requirements:
|
53
|
-
- - ~>
|
53
|
+
- - "~>"
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '1.3'
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
57
|
name: rake
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
|
-
- -
|
60
|
+
- - ">="
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: '0'
|
63
63
|
type: :development
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- -
|
67
|
+
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: rspec
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
|
-
- - ~>
|
74
|
+
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '3.5'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
|
-
- - ~>
|
81
|
+
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
83
|
version: '3.5'
|
84
84
|
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
@@ -90,9 +90,9 @@ executables: []
|
|
90
90
|
extensions: []
|
91
91
|
extra_rdoc_files: []
|
92
92
|
files:
|
93
|
-
- .gitignore
|
94
|
-
- .rspec
|
95
|
-
- .travis.yml
|
93
|
+
- ".gitignore"
|
94
|
+
- ".rspec"
|
95
|
+
- ".travis.yml"
|
96
96
|
- Gemfile
|
97
97
|
- LICENSE
|
98
98
|
- NOTICE.txt
|
@@ -119,17 +119,17 @@ require_paths:
|
|
119
119
|
- lib
|
120
120
|
required_ruby_version: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- -
|
122
|
+
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- -
|
127
|
+
- - ">="
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0'
|
130
130
|
requirements: []
|
131
131
|
rubyforge_project:
|
132
|
-
rubygems_version: 2.4.
|
132
|
+
rubygems_version: 2.4.8
|
133
133
|
signing_key:
|
134
134
|
specification_version: 4
|
135
135
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|