chupa-text 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +31 -0
- data/lib/chupa-text/command/chupa-text.rb +2 -1
- data/lib/chupa-text/data.rb +26 -4
- data/lib/chupa-text/decomposers/http-server.rb +160 -0
- data/lib/chupa-text/external-command.rb +46 -9
- data/lib/chupa-text/extractor.rb +4 -5
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-csv.rb +1 -1
- data/test/decomposers/test-http-server.rb +175 -0
- data/test/helper.rb +6 -0
- data/test/test-external-command.rb +319 -6
- data/test/test-extractor.rb +2 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6d2f05926206d3da67e157161c9e4c35f036af80e80e5177fbc4edf9006e039
|
4
|
+
data.tar.gz: 5ed29a55f62d7a44cbbdbfc690b39005d35b354c2ba65629436b90d1ef32cef1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd5e2b4b04d2572bb90ec832b618bc855e5c038d0a46f84a970f8e1ae1011609356387c30a71c6d476362fb27391094a25d0759d62bd56772a630914f543e644
|
7
|
+
data.tar.gz: c1c22bb010320fe5f48eb0c8b500e54b4557ca0f24461b832c3ae278c8dd9b0c6dc93f5682cf9b52d15b05673cd013c5546543e2f46e095b72fe46d64deb0aba
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,36 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.2.2: 2019-03-28
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Added `http-server` decomposer.
|
8
|
+
|
9
|
+
* `ChupaText::Data#max_body_size`: Added.
|
10
|
+
|
11
|
+
* `ChupaText::Data#max_body_size=`: Added.
|
12
|
+
|
13
|
+
* `ChupaText::Data#timeout`: Added.
|
14
|
+
|
15
|
+
* `ChupaText::Data#timeout=`: Added.
|
16
|
+
|
17
|
+
* `ChupaText::Data#limit_cpu`: Added.
|
18
|
+
|
19
|
+
* `ChupaText::Data#limit_cpu=`: Added.
|
20
|
+
|
21
|
+
* `ChupaText::Data#limit_as`: Added.
|
22
|
+
|
23
|
+
* `ChupaText::Data#limit_as=`: Added.
|
24
|
+
|
25
|
+
* `ChupaText::ExternalCommand`: Added support for soft timeout and limits.
|
26
|
+
|
27
|
+
* `ChupaText::Extractor`: Stopped receiving the max body size as an
|
28
|
+
option. Use `ChupaText::Data#max_body_size=` instead.
|
29
|
+
|
30
|
+
### Fixes
|
31
|
+
|
32
|
+
* Fixed decomposer choose logic.
|
33
|
+
|
3
34
|
## 1.2.1: 2019-03-04
|
4
35
|
|
5
36
|
### Improvements
|
@@ -196,7 +196,7 @@ module ChupaText
|
|
196
196
|
end
|
197
197
|
|
198
198
|
def create_extractor
|
199
|
-
extractor = Extractor.new
|
199
|
+
extractor = Extractor.new
|
200
200
|
extractor.apply_configuration(@configuration)
|
201
201
|
extractor
|
202
202
|
end
|
@@ -222,6 +222,7 @@ module ChupaText
|
|
222
222
|
data.mime_type = @mime_type if @mime_type
|
223
223
|
data.need_screenshot = @need_screenshot
|
224
224
|
data.expected_screenshot_size = @expected_screenshot_size
|
225
|
+
data.max_body_size = @max_body_size
|
225
226
|
data
|
226
227
|
end
|
227
228
|
|
data/lib/chupa-text/data.rb
CHANGED
@@ -65,6 +65,20 @@ module ChupaText
|
|
65
65
|
# @return [Array<Integer, Integer>] the expected screenshot size.
|
66
66
|
attr_accessor :expected_screenshot_size
|
67
67
|
|
68
|
+
# @return [Integer, nil] the max body size in bytes.
|
69
|
+
attr_accessor :max_body_size
|
70
|
+
|
71
|
+
# @return [Numeric, String, nil] the timeout on extraction.
|
72
|
+
attr_accessor :timeout
|
73
|
+
|
74
|
+
# @return [Numeric, String, nil] the max CPU time on extraction by
|
75
|
+
# external command.
|
76
|
+
attr_accessor :limit_cpu
|
77
|
+
|
78
|
+
# @return [Numeric, String, nil] the max memory on extraction by
|
79
|
+
# external command.
|
80
|
+
attr_accessor :limit_as
|
81
|
+
|
68
82
|
def initialize(options={})
|
69
83
|
@uri = nil
|
70
84
|
@body = nil
|
@@ -76,6 +90,10 @@ module ChupaText
|
|
76
90
|
@screenshot = nil
|
77
91
|
@need_screenshot = true
|
78
92
|
@expected_screenshot_size = [200, 200]
|
93
|
+
@max_body_size = nil
|
94
|
+
@timeout = nil
|
95
|
+
@limit_cpu = nil
|
96
|
+
@limit_as = nil
|
79
97
|
@options = options || {}
|
80
98
|
source_data = @options[:source_data]
|
81
99
|
if source_data
|
@@ -107,6 +125,10 @@ module ChupaText
|
|
107
125
|
end
|
108
126
|
self.need_screenshot = data.need_screenshot?
|
109
127
|
self.expected_screenshot_size = data.expected_screenshot_size
|
128
|
+
self.max_body_size = data.max_body_size
|
129
|
+
self.timeout = data.timeout
|
130
|
+
self.limit_cpu = data.limit_cpu
|
131
|
+
self.limit_as = data.limit_as
|
110
132
|
end
|
111
133
|
|
112
134
|
# @param [String, URI, nil] uri The URI for the data. If `uri` is
|
@@ -198,11 +220,11 @@ module ChupaText
|
|
198
220
|
@need_screenshot
|
199
221
|
end
|
200
222
|
|
201
|
-
def to_utf8_body_data
|
223
|
+
def to_utf8_body_data
|
202
224
|
b = nil
|
203
|
-
if max_body_size
|
225
|
+
if @max_body_size
|
204
226
|
open do |input|
|
205
|
-
b = input.read(max_body_size)
|
227
|
+
b = input.read(@max_body_size)
|
206
228
|
end
|
207
229
|
else
|
208
230
|
b = body
|
@@ -211,7 +233,7 @@ module ChupaText
|
|
211
233
|
|
212
234
|
converter = UTF8Converter.new(b)
|
213
235
|
utf8_body = converter.convert
|
214
|
-
if max_body_size.nil? and b.equal?(utf8_body)
|
236
|
+
if @max_body_size.nil? and b.equal?(utf8_body)
|
215
237
|
self
|
216
238
|
else
|
217
239
|
TextData.new(utf8_body, source_data: self)
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "net/http"
|
18
|
+
require "pp"
|
19
|
+
require "uri"
|
20
|
+
|
21
|
+
module ChupaText
|
22
|
+
module Decomposers
|
23
|
+
class HTTPServer < Decomposer
|
24
|
+
include Loggable
|
25
|
+
|
26
|
+
registry.register("http-server", self)
|
27
|
+
|
28
|
+
@@default_url = nil
|
29
|
+
class << self
|
30
|
+
def default_url
|
31
|
+
@@default_url
|
32
|
+
end
|
33
|
+
|
34
|
+
def default_url=(url)
|
35
|
+
@@default_url = url
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def initialize(options)
|
40
|
+
super
|
41
|
+
@url = @options[:url] ||
|
42
|
+
self.class.default_url ||
|
43
|
+
ENV["CHUPA_TEXT_HTTP_SERVER_URL"]
|
44
|
+
@url = URI(@url) if @url
|
45
|
+
end
|
46
|
+
|
47
|
+
def target?(data)
|
48
|
+
return false unless @url
|
49
|
+
return false if data.text_plain?
|
50
|
+
true
|
51
|
+
end
|
52
|
+
|
53
|
+
def target_score(data)
|
54
|
+
if target?(data)
|
55
|
+
100
|
56
|
+
else
|
57
|
+
nil
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def decompose(data, &block)
|
62
|
+
http = Net::HTTP.new(@url.host, @url.port)
|
63
|
+
http.use_ssl = true if @url.is_a?(URI::HTTPS)
|
64
|
+
if data.timeout.is_a?(Numeric)
|
65
|
+
http.open_timeout = data.timeout * 1.5
|
66
|
+
http.read_timeout = data.timeout * 1.5
|
67
|
+
http.write_timeout = data.timeout * 1.5
|
68
|
+
end
|
69
|
+
begin
|
70
|
+
http.start do
|
71
|
+
process_request(http, data, &block)
|
72
|
+
end
|
73
|
+
rescue SystemCallError => error
|
74
|
+
error do
|
75
|
+
message = "#{log_tag}[connection] "
|
76
|
+
message << "Failed to process data in server: "
|
77
|
+
message << "#{@url}: "
|
78
|
+
message << "#{error.class}: #{error.message}\n"
|
79
|
+
message << error.backtrace.join("\n")
|
80
|
+
message
|
81
|
+
end
|
82
|
+
rescue Net::ReadTimeout => error
|
83
|
+
error do
|
84
|
+
message = "#{log_tag}[timeout] "
|
85
|
+
message << "Failed to process data in server: "
|
86
|
+
message << "#{@url}: "
|
87
|
+
message << "#{error.class}: #{error.message}\n"
|
88
|
+
message << error.backtrace.join("\n")
|
89
|
+
message
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
private
|
95
|
+
def process_request(http, data)
|
96
|
+
request = Net::HTTP::Post.new(@url)
|
97
|
+
request["transfer-encoding"] = "chunked"
|
98
|
+
data.open do |input|
|
99
|
+
request.set_form(build_parameters(data, input),
|
100
|
+
"multipart/form-data")
|
101
|
+
response = http.request(request)
|
102
|
+
case response
|
103
|
+
when Net::HTTPOK
|
104
|
+
extracted = JSON.parse(response.body)
|
105
|
+
(extracted["texts"] || []).each do |text|
|
106
|
+
text_data = TextData.new(text["body"], source_data: data)
|
107
|
+
text.each do |key, value|
|
108
|
+
next if key == "body"
|
109
|
+
text_data[key] = value
|
110
|
+
end
|
111
|
+
yield(text_data)
|
112
|
+
end
|
113
|
+
else
|
114
|
+
error do
|
115
|
+
message = "#{log_tag} Failed to process data in server: "
|
116
|
+
message << "#{@url}: "
|
117
|
+
message << "#{response.code}: #{response.message.strip}\n"
|
118
|
+
case response.content_type
|
119
|
+
when "application/json"
|
120
|
+
PP.pp(JSON.parse(response.body), message)
|
121
|
+
else
|
122
|
+
message << response.body
|
123
|
+
end
|
124
|
+
message
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def build_parameters(data, input)
|
131
|
+
parameters = []
|
132
|
+
[
|
133
|
+
["timeout",
|
134
|
+
data.timeout || ChupaText::ExternalCommand.default_timeout],
|
135
|
+
["limit_cpu",
|
136
|
+
data.limit_cpu || ChupaText::ExternalCommand.default_limit_cpu],
|
137
|
+
["limit_as",
|
138
|
+
data.limit_as || ChupaText::ExternalCommand.default_limit_as],
|
139
|
+
["max_body_size", data.max_body_size],
|
140
|
+
].each do |key, value|
|
141
|
+
next if value.nil?
|
142
|
+
parameters << [key, StringIO.new(value.to_s)]
|
143
|
+
end
|
144
|
+
parameters << [
|
145
|
+
"data",
|
146
|
+
input,
|
147
|
+
{
|
148
|
+
filename: data.path.to_s,
|
149
|
+
content_type: data.mime_type,
|
150
|
+
},
|
151
|
+
]
|
152
|
+
parameters
|
153
|
+
end
|
154
|
+
|
155
|
+
def log_tag
|
156
|
+
"[decomposer][http-server]"
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
@@ -69,13 +69,19 @@ module ChupaText
|
|
69
69
|
else
|
70
70
|
options = {}
|
71
71
|
end
|
72
|
+
data = options[:data]
|
72
73
|
pid = spawn(options[:env] || {},
|
73
74
|
@path.to_s,
|
74
75
|
*arguments,
|
75
|
-
spawn_options(options[:spawn_options]))
|
76
|
+
spawn_options(options[:spawn_options], data))
|
77
|
+
if data
|
78
|
+
soft_timeout = data.timeout
|
79
|
+
else
|
80
|
+
soft_timeout = nil
|
81
|
+
end
|
76
82
|
status = nil
|
77
83
|
begin
|
78
|
-
status = wait_process(pid, options[:timeout])
|
84
|
+
status = wait_process(pid, options[:timeout], soft_timeout)
|
79
85
|
ensure
|
80
86
|
unless status
|
81
87
|
begin
|
@@ -99,28 +105,44 @@ module ChupaText
|
|
99
105
|
end
|
100
106
|
|
101
107
|
private
|
102
|
-
def spawn_options(user_options)
|
108
|
+
def spawn_options(user_options, data)
|
103
109
|
options = (user_options || {}).dup
|
104
|
-
|
105
|
-
|
110
|
+
if data
|
111
|
+
soft_limit_cpu = data.limit_cpu
|
112
|
+
soft_limit_as = data.limit_as
|
113
|
+
else
|
114
|
+
soft_limit_cpu = nil
|
115
|
+
soft_limit_as = nil
|
116
|
+
end
|
117
|
+
apply_default_spawn_limit(options, soft_limit_cpu, :cpu, :time)
|
118
|
+
apply_default_spawn_limit(options, soft_limit_as, :as, :size)
|
106
119
|
options
|
107
120
|
end
|
108
121
|
|
109
|
-
def apply_default_spawn_limit(options, key, type)
|
122
|
+
def apply_default_spawn_limit(options, soft_value, key, type)
|
110
123
|
# TODO: Workaround for Ruby 2.3.3p222
|
111
124
|
case key
|
112
125
|
when :cpu
|
113
126
|
option_key = :rlimit_cpu
|
127
|
+
unit = "s"
|
114
128
|
when :as
|
115
129
|
option_key = :rlimit_as
|
130
|
+
unit = ""
|
116
131
|
else
|
117
132
|
option_key = :"rlimit_#{key}"
|
133
|
+
unit = ""
|
118
134
|
end
|
119
135
|
return if options[option_key]
|
120
136
|
|
121
137
|
tag = "[limit][#{key}]"
|
122
138
|
value = self.class.__send__("default_limit_#{key}")
|
123
139
|
value = __send__("parse_#{type}", tag, value)
|
140
|
+
soft_value = __send__("parse_#{type}", tag, soft_value)
|
141
|
+
if value
|
142
|
+
value = soft_value if soft_value and soft_value < value
|
143
|
+
else
|
144
|
+
value = soft_value
|
145
|
+
end
|
124
146
|
return if value.nil?
|
125
147
|
rlimit_number = Process.const_get("RLIMIT_#{key.to_s.upcase}")
|
126
148
|
soft_limit, hard_limit = Process.getrlimit(rlimit_number)
|
@@ -129,7 +151,7 @@ module ChupaText
|
|
129
151
|
return nil
|
130
152
|
end
|
131
153
|
limit_info = "soft-limit:#{soft_limit}, hard-limit:#{hard_limit}"
|
132
|
-
info("#{log_tag}#{tag}[set] <#{value}>(#{limit_info})")
|
154
|
+
info("#{log_tag}#{tag}[set] <#{value}#{unit}>(#{limit_info})")
|
133
155
|
|
134
156
|
options[option_key] = value
|
135
157
|
end
|
@@ -169,12 +191,21 @@ module ChupaText
|
|
169
191
|
scale = 1
|
170
192
|
case value
|
171
193
|
when /GB?\z/i
|
194
|
+
scale = 1000 ** 3
|
195
|
+
number = $PREMATCH
|
196
|
+
when /GiB?\z/i
|
172
197
|
scale = 1024 ** 3
|
173
198
|
number = $PREMATCH
|
174
199
|
when /MB?\z/i
|
200
|
+
scale = 1000 ** 2
|
201
|
+
number = $PREMATCH
|
202
|
+
when /MiB?\z/i
|
175
203
|
scale = 1024 ** 2
|
176
204
|
number = $PREMATCH
|
177
|
-
when /
|
205
|
+
when /[kK]B?\z/i
|
206
|
+
scale = 1000 ** 1
|
207
|
+
number = $PREMATCH
|
208
|
+
when /KiB?\z/i
|
178
209
|
scale = 1024 ** 1
|
179
210
|
number = $PREMATCH
|
180
211
|
when /B?\z/i
|
@@ -227,9 +258,15 @@ module ChupaText
|
|
227
258
|
warn("#{log_tag}#{tag}[invalid] <#{value}>(#{type})")
|
228
259
|
end
|
229
260
|
|
230
|
-
def wait_process(pid, timeout)
|
261
|
+
def wait_process(pid, timeout, soft_timeout)
|
231
262
|
tag = "[timeout]"
|
232
263
|
timeout = parse_time(tag, timeout || self.class.default_timeout)
|
264
|
+
soft_timeout = parse_time(tag, soft_timeout)
|
265
|
+
if timeout
|
266
|
+
timeout = soft_timeout if soft_timeout and soft_timeout < timeout
|
267
|
+
else
|
268
|
+
timeout = soft_timeout
|
269
|
+
end
|
233
270
|
if timeout
|
234
271
|
info("#{log_tag}#{tag}[use] <#{timeout}s>: <#{pid}>")
|
235
272
|
status = wait_process_timeout(pid, timeout)
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -21,9 +21,8 @@ module ChupaText
|
|
21
21
|
class Extractor
|
22
22
|
include Loggable
|
23
23
|
|
24
|
-
def initialize
|
24
|
+
def initialize
|
25
25
|
@decomposers = []
|
26
|
-
@max_body_size = max_body_size
|
27
26
|
end
|
28
27
|
|
29
28
|
# Sets the extractor up by the configuration. It adds decomposers
|
@@ -79,7 +78,7 @@ module ChupaText
|
|
79
78
|
candidates << [score, decomposer]
|
80
79
|
end
|
81
80
|
return nil if candidates.empty?
|
82
|
-
candidate = candidates.sort_by {|score, _| score}.first
|
81
|
+
candidate = candidates.sort_by {|score, _| -score}.first
|
83
82
|
candidate[1]
|
84
83
|
end
|
85
84
|
|
@@ -91,11 +90,11 @@ module ChupaText
|
|
91
90
|
if decomposer.nil?
|
92
91
|
if target.text_plain?
|
93
92
|
debug {"#{log_tag}[extract][text-plain]"}
|
94
|
-
yield(target.to_utf8_body_data
|
93
|
+
yield(target.to_utf8_body_data)
|
95
94
|
else
|
96
95
|
debug {"#{log_tag}[extract][decomposer] not found"}
|
97
96
|
if target.text?
|
98
|
-
yield(target.to_utf8_body_data
|
97
|
+
yield(target.to_utf8_body_data)
|
99
98
|
end
|
100
99
|
end
|
101
100
|
else
|
data/lib/chupa-text/version.rb
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
class TestDecomposersCSV< Test::Unit::TestCase
|
17
|
+
class TestDecomposersCSV < Test::Unit::TestCase
|
18
18
|
include Helper
|
19
19
|
|
20
20
|
def setup
|
@@ -0,0 +1,175 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
class TestDecomposersHTTPServer < Test::Unit::TestCase
|
18
|
+
include Helper
|
19
|
+
|
20
|
+
def setup
|
21
|
+
ChupaText::Decomposers::HTTPServer.default_url = nil
|
22
|
+
setup_server
|
23
|
+
setup_data
|
24
|
+
setup_decomposer
|
25
|
+
end
|
26
|
+
|
27
|
+
def setup_server
|
28
|
+
@port = 40080
|
29
|
+
@path = "/extraction.json"
|
30
|
+
@server_url = "http://127.0.0.1:#{@port}#{@path}"
|
31
|
+
logger = WEBrick::Log.new
|
32
|
+
logger.level = logger.class::ERROR
|
33
|
+
@server = WEBrick::HTTPServer.new(Port: @port,
|
34
|
+
Logger: logger,
|
35
|
+
AccessLog: [])
|
36
|
+
@response_status = 200
|
37
|
+
@server.mount_proc(@path) do |request, response|
|
38
|
+
sleep(@timeout * 2) if @timeout
|
39
|
+
response.status = @response_status
|
40
|
+
response.content_type = "application/json"
|
41
|
+
response.body = JSON.generate(@response)
|
42
|
+
end
|
43
|
+
@server_thread = Thread.new do
|
44
|
+
@server.start
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def setup_data
|
49
|
+
@input_data = <<-CSV
|
50
|
+
Hello,World
|
51
|
+
Ruby,ChupaText
|
52
|
+
CSV
|
53
|
+
@input_mime_type = "text/csv"
|
54
|
+
@input_path = "/tmp/hello.csv"
|
55
|
+
@timeout = nil
|
56
|
+
@extracted_text = @input_data.gsub(/,/, "\t")
|
57
|
+
@extracted_path = @input_path.gsub(/\.csv\z/, ".txt")
|
58
|
+
@response = {
|
59
|
+
"mime-type" => @input_mime_type,
|
60
|
+
"uri" => "file://#{@input_path}",
|
61
|
+
"path" => @input_path,
|
62
|
+
"size" => @input_data.bytesize,
|
63
|
+
"texts" => [
|
64
|
+
{
|
65
|
+
"mime-type" => "text/plain",
|
66
|
+
"uri" => "file://#{@extracted_path}",
|
67
|
+
"path" => @extracted_path,
|
68
|
+
"size" => @extracted_text.bytesize,
|
69
|
+
"source-mime-types" => [
|
70
|
+
@input_mime_type,
|
71
|
+
],
|
72
|
+
"body" => @extracted_text,
|
73
|
+
},
|
74
|
+
],
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
def setup_decomposer
|
79
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new(:url => @server_url)
|
80
|
+
end
|
81
|
+
|
82
|
+
def teardown
|
83
|
+
teardown_server
|
84
|
+
end
|
85
|
+
|
86
|
+
def teardown_server
|
87
|
+
@server.shutdown
|
88
|
+
@server_thread.join
|
89
|
+
end
|
90
|
+
|
91
|
+
sub_test_case("decompose") do
|
92
|
+
def test_success
|
93
|
+
assert_equal([@extracted_text],
|
94
|
+
decompose.collect(&:body))
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_not_ok
|
98
|
+
@response_status = 404
|
99
|
+
messages = capture_log do
|
100
|
+
assert_equal([], decompose.collect(&:body))
|
101
|
+
end
|
102
|
+
assert_equal([
|
103
|
+
[
|
104
|
+
:error,
|
105
|
+
"[decomposer][http-server] " +
|
106
|
+
"Failed to process data in server: " +
|
107
|
+
"#{@server_url}: " +
|
108
|
+
"#{@response_status}: Not Found",
|
109
|
+
],
|
110
|
+
],
|
111
|
+
messages)
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_no_server
|
115
|
+
no_server_url = "http://127.0.0.1:2929/extraction.json"
|
116
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new(:url => no_server_url)
|
117
|
+
messages = capture_log do
|
118
|
+
assert_equal([], decompose.collect(&:body))
|
119
|
+
end
|
120
|
+
messages = messages.collect do |level, message|
|
121
|
+
[level, message.gsub(/Errno::.*\z/, "")]
|
122
|
+
end
|
123
|
+
assert_equal([
|
124
|
+
[
|
125
|
+
:error,
|
126
|
+
"[decomposer][http-server][connection] " +
|
127
|
+
"Failed to process data in server: " +
|
128
|
+
"#{no_server_url}: ",
|
129
|
+
],
|
130
|
+
],
|
131
|
+
messages)
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_read_timeout
|
135
|
+
@timeout = 0.1
|
136
|
+
messages = capture_log do
|
137
|
+
assert_equal([], decompose.collect(&:body))
|
138
|
+
end
|
139
|
+
messages = messages.collect do |level, message|
|
140
|
+
[level, message.gsub(/Net::.*\z/, "")]
|
141
|
+
end
|
142
|
+
assert_equal([
|
143
|
+
[
|
144
|
+
:error,
|
145
|
+
"[decomposer][http-server][timeout] " +
|
146
|
+
"Failed to process data in server: " +
|
147
|
+
"#{@server_url}: ",
|
148
|
+
],
|
149
|
+
],
|
150
|
+
messages)
|
151
|
+
end
|
152
|
+
|
153
|
+
def test_default_url
|
154
|
+
ChupaText::Decomposers::HTTPServer.default_url = @server_url
|
155
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new({})
|
156
|
+
assert_equal([@extracted_text],
|
157
|
+
decompose.collect(&:body))
|
158
|
+
end
|
159
|
+
|
160
|
+
private
|
161
|
+
def decompose
|
162
|
+
data = ChupaText::Data.new
|
163
|
+
data.path = @input_path
|
164
|
+
data.mime_type = @input_mime_type
|
165
|
+
data.body = @input_data
|
166
|
+
data.timeout = @timeout
|
167
|
+
|
168
|
+
decomposed = []
|
169
|
+
@decomposer.decompose(data) do |decomposed_data|
|
170
|
+
decomposed << decomposed_data
|
171
|
+
end
|
172
|
+
decomposed
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
data/test/helper.rb
CHANGED
@@ -15,8 +15,10 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
require "pathname"
|
18
|
+
require "rbconfig"
|
18
19
|
require "tempfile"
|
19
20
|
require "uri"
|
21
|
+
require "webrick"
|
20
22
|
|
21
23
|
module Helper
|
22
24
|
def fixture_path(*components)
|
@@ -39,4 +41,8 @@ module Helper
|
|
39
41
|
[level, message]
|
40
42
|
end
|
41
43
|
end
|
44
|
+
|
45
|
+
def ruby
|
46
|
+
RbConfig.ruby
|
47
|
+
end
|
42
48
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2014-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,12 +14,8 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "rbconfig"
|
18
|
-
|
19
17
|
class TestExternalCommand < Test::Unit::TestCase
|
20
|
-
|
21
|
-
RbConfig.ruby
|
22
|
-
end
|
18
|
+
include Helper
|
23
19
|
|
24
20
|
def create_command(command)
|
25
21
|
ChupaText::ExternalCommand.new(command)
|
@@ -76,4 +72,321 @@ class TestExternalCommand < Test::Unit::TestCase
|
|
76
72
|
assert_false(exist?("nonexistent"))
|
77
73
|
end
|
78
74
|
end
|
75
|
+
|
76
|
+
class TestTimeout < self
|
77
|
+
def setup
|
78
|
+
@data = ChupaText::TextData.new("Hello")
|
79
|
+
timeout = ChupaText::ExternalCommand.default_timeout
|
80
|
+
begin
|
81
|
+
yield
|
82
|
+
ensure
|
83
|
+
ChupaText::ExternalCommand.default_timeout = timeout
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def run_command(options={})
|
88
|
+
IO.pipe do |input, output|
|
89
|
+
command = create_command(ruby)
|
90
|
+
command.run("-e", "puts(Process.pid)",
|
91
|
+
options.merge(data: @data,
|
92
|
+
spawn_options: {out: output}))
|
93
|
+
input.gets.chomp
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_option
|
98
|
+
pid = nil
|
99
|
+
messages = capture_log do
|
100
|
+
pid = run_command(timeout: "60s")
|
101
|
+
end
|
102
|
+
assert_equal([
|
103
|
+
[
|
104
|
+
:info,
|
105
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
106
|
+
]
|
107
|
+
],
|
108
|
+
messages)
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_data_not_use
|
112
|
+
@data.timeout = "90s"
|
113
|
+
pid = nil
|
114
|
+
messages = capture_log do
|
115
|
+
pid = run_command(timeout: "60s")
|
116
|
+
end
|
117
|
+
assert_equal([
|
118
|
+
[
|
119
|
+
:info,
|
120
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
121
|
+
]
|
122
|
+
],
|
123
|
+
messages)
|
124
|
+
end
|
125
|
+
|
126
|
+
def test_data_use
|
127
|
+
@data.timeout = "30s"
|
128
|
+
pid = nil
|
129
|
+
messages = capture_log do
|
130
|
+
pid = run_command(timeout: "60s")
|
131
|
+
end
|
132
|
+
assert_equal([
|
133
|
+
[
|
134
|
+
:info,
|
135
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
136
|
+
]
|
137
|
+
],
|
138
|
+
messages)
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_data_only
|
142
|
+
@data.timeout = "30s"
|
143
|
+
pid = nil
|
144
|
+
messages = capture_log do
|
145
|
+
pid = run_command
|
146
|
+
end
|
147
|
+
assert_equal([
|
148
|
+
[
|
149
|
+
:info,
|
150
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
151
|
+
]
|
152
|
+
],
|
153
|
+
messages)
|
154
|
+
end
|
155
|
+
|
156
|
+
def test_default
|
157
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
158
|
+
pid = nil
|
159
|
+
messages = capture_log do
|
160
|
+
pid = run_command
|
161
|
+
end
|
162
|
+
assert_equal([
|
163
|
+
[
|
164
|
+
:info,
|
165
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
166
|
+
]
|
167
|
+
],
|
168
|
+
messages)
|
169
|
+
end
|
170
|
+
|
171
|
+
def test_default_data_not_use
|
172
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
173
|
+
@data.timeout = "90s"
|
174
|
+
pid = nil
|
175
|
+
messages = capture_log do
|
176
|
+
pid = run_command
|
177
|
+
end
|
178
|
+
assert_equal([
|
179
|
+
[
|
180
|
+
:info,
|
181
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
182
|
+
]
|
183
|
+
],
|
184
|
+
messages)
|
185
|
+
end
|
186
|
+
|
187
|
+
def test_default_data_use
|
188
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
189
|
+
@data.timeout = "30s"
|
190
|
+
pid = nil
|
191
|
+
messages = capture_log do
|
192
|
+
pid = run_command
|
193
|
+
end
|
194
|
+
assert_equal([
|
195
|
+
[
|
196
|
+
:info,
|
197
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
198
|
+
]
|
199
|
+
],
|
200
|
+
messages)
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_default_data_only
|
204
|
+
@data.timeout = "30s"
|
205
|
+
pid = nil
|
206
|
+
messages = capture_log do
|
207
|
+
pid = run_command
|
208
|
+
end
|
209
|
+
assert_equal([
|
210
|
+
[
|
211
|
+
:info,
|
212
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
213
|
+
]
|
214
|
+
],
|
215
|
+
messages)
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
class TestLimitCPU < self
|
220
|
+
def setup
|
221
|
+
@data = ChupaText::TextData.new("Hello")
|
222
|
+
limit_cpu = ChupaText::ExternalCommand.default_limit_cpu
|
223
|
+
begin
|
224
|
+
yield
|
225
|
+
ensure
|
226
|
+
ChupaText::ExternalCommand.default_limit_cpu = limit_cpu
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
def run_command(spawn_options={})
|
231
|
+
command = create_command(ruby)
|
232
|
+
command.run("-e", "true",
|
233
|
+
data: @data,
|
234
|
+
spawn_options: spawn_options)
|
235
|
+
end
|
236
|
+
|
237
|
+
def test_default
|
238
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
239
|
+
messages = capture_log do
|
240
|
+
run_command
|
241
|
+
end
|
242
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
243
|
+
assert_equal([
|
244
|
+
[
|
245
|
+
:info,
|
246
|
+
"[external-command][limit][cpu][set] <60.0s>" +
|
247
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
248
|
+
]
|
249
|
+
],
|
250
|
+
messages)
|
251
|
+
end
|
252
|
+
|
253
|
+
def test_default_data_not_use
|
254
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
255
|
+
@data.limit_cpu = "90s"
|
256
|
+
messages = capture_log do
|
257
|
+
run_command
|
258
|
+
end
|
259
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
260
|
+
assert_equal([
|
261
|
+
[
|
262
|
+
:info,
|
263
|
+
"[external-command][limit][cpu][set] <60.0s>" +
|
264
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
265
|
+
]
|
266
|
+
],
|
267
|
+
messages)
|
268
|
+
end
|
269
|
+
|
270
|
+
def test_default_data_use
|
271
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
272
|
+
@data.limit_cpu = "30s"
|
273
|
+
messages = capture_log do
|
274
|
+
run_command
|
275
|
+
end
|
276
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
277
|
+
assert_equal([
|
278
|
+
[
|
279
|
+
:info,
|
280
|
+
"[external-command][limit][cpu][set] <30.0s>" +
|
281
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
282
|
+
]
|
283
|
+
],
|
284
|
+
messages)
|
285
|
+
end
|
286
|
+
|
287
|
+
def test_default_data_only
|
288
|
+
@data.limit_cpu = "30s"
|
289
|
+
messages = capture_log do
|
290
|
+
run_command
|
291
|
+
end
|
292
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
293
|
+
assert_equal([
|
294
|
+
[
|
295
|
+
:info,
|
296
|
+
"[external-command][limit][cpu][set] <30.0s>" +
|
297
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
298
|
+
]
|
299
|
+
],
|
300
|
+
messages)
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
class TestLimitAS < self
|
305
|
+
def setup
|
306
|
+
@data = ChupaText::TextData.new("Hello")
|
307
|
+
limit_as = ChupaText::ExternalCommand.default_limit_as
|
308
|
+
begin
|
309
|
+
yield
|
310
|
+
ensure
|
311
|
+
ChupaText::ExternalCommand.default_limit_as = limit_as
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
def run_command(spawn_options={})
|
316
|
+
command = create_command(ruby)
|
317
|
+
command.run("-e", "true",
|
318
|
+
data: @data,
|
319
|
+
spawn_options: spawn_options)
|
320
|
+
end
|
321
|
+
|
322
|
+
def test_default
|
323
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
324
|
+
messages = capture_log do
|
325
|
+
run_command
|
326
|
+
end
|
327
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
328
|
+
assert_equal([
|
329
|
+
[
|
330
|
+
:info,
|
331
|
+
"[external-command][limit][as][set] " +
|
332
|
+
"<#{100 * 1024 * 1024}>" +
|
333
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
334
|
+
]
|
335
|
+
],
|
336
|
+
messages)
|
337
|
+
end
|
338
|
+
|
339
|
+
def test_default_data_not_use
|
340
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
341
|
+
@data.limit_as = "150MiB"
|
342
|
+
messages = capture_log do
|
343
|
+
run_command
|
344
|
+
end
|
345
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
346
|
+
assert_equal([
|
347
|
+
[
|
348
|
+
:info,
|
349
|
+
"[external-command][limit][as][set] " +
|
350
|
+
"<#{100 * 1024 * 1024}>" +
|
351
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
352
|
+
]
|
353
|
+
],
|
354
|
+
messages)
|
355
|
+
end
|
356
|
+
|
357
|
+
def test_default_soft_use
|
358
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
359
|
+
@data.limit_as = "50MiB"
|
360
|
+
messages = capture_log do
|
361
|
+
run_command
|
362
|
+
end
|
363
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
364
|
+
assert_equal([
|
365
|
+
[
|
366
|
+
:info,
|
367
|
+
"[external-command][limit][as][set] " +
|
368
|
+
"<#{50 * 1024 * 1024}>" +
|
369
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
370
|
+
]
|
371
|
+
],
|
372
|
+
messages)
|
373
|
+
end
|
374
|
+
|
375
|
+
def test_default_soft_only
|
376
|
+
@data.limit_as = "50MiB"
|
377
|
+
messages = capture_log do
|
378
|
+
run_command
|
379
|
+
end
|
380
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
381
|
+
assert_equal([
|
382
|
+
[
|
383
|
+
:info,
|
384
|
+
"[external-command][limit][as][set] " +
|
385
|
+
"<#{50 * 1024 * 1024}>" +
|
386
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
387
|
+
]
|
388
|
+
],
|
389
|
+
messages)
|
390
|
+
end
|
391
|
+
end
|
79
392
|
end
|
data/test/test-extractor.rb
CHANGED
@@ -231,10 +231,11 @@ class TestExtractor < Test::Unit::TestCase
|
|
231
231
|
|
232
232
|
sub_test_case("max body size") do
|
233
233
|
def test_last_invalid
|
234
|
-
@extractor = ChupaText::Extractor.new
|
234
|
+
@extractor = ChupaText::Extractor.new
|
235
235
|
data = ChupaText::Data.new
|
236
236
|
data.mime_type = "text/plain"
|
237
237
|
data.body = "こん"
|
238
|
+
data.max_body_size = 5
|
238
239
|
assert_equal(["こ"], extract(data))
|
239
240
|
end
|
240
241
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-03-
|
11
|
+
date: 2019-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- lib/chupa-text/decomposers.rb
|
160
160
|
- lib/chupa-text/decomposers/csv.rb
|
161
161
|
- lib/chupa-text/decomposers/gzip.rb
|
162
|
+
- lib/chupa-text/decomposers/http-server.rb
|
162
163
|
- lib/chupa-text/decomposers/office-open-xml-document.rb
|
163
164
|
- lib/chupa-text/decomposers/office-open-xml-presentation.rb
|
164
165
|
- lib/chupa-text/decomposers/office-open-xml-workbook.rb
|
@@ -198,6 +199,7 @@ files:
|
|
198
199
|
- test/command/test-chupa-text.rb
|
199
200
|
- test/decomposers/test-csv.rb
|
200
201
|
- test/decomposers/test-gzip.rb
|
202
|
+
- test/decomposers/test-http-server.rb
|
201
203
|
- test/decomposers/test-office-open-xml-document.rb
|
202
204
|
- test/decomposers/test-office-open-xml-presentation.rb
|
203
205
|
- test/decomposers/test-office-open-xml-workbook.rb
|