chupa-text 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +31 -0
- data/lib/chupa-text/command/chupa-text.rb +2 -1
- data/lib/chupa-text/data.rb +26 -4
- data/lib/chupa-text/decomposers/http-server.rb +160 -0
- data/lib/chupa-text/external-command.rb +46 -9
- data/lib/chupa-text/extractor.rb +4 -5
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-csv.rb +1 -1
- data/test/decomposers/test-http-server.rb +175 -0
- data/test/helper.rb +6 -0
- data/test/test-external-command.rb +319 -6
- data/test/test-extractor.rb +2 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6d2f05926206d3da67e157161c9e4c35f036af80e80e5177fbc4edf9006e039
|
4
|
+
data.tar.gz: 5ed29a55f62d7a44cbbdbfc690b39005d35b354c2ba65629436b90d1ef32cef1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd5e2b4b04d2572bb90ec832b618bc855e5c038d0a46f84a970f8e1ae1011609356387c30a71c6d476362fb27391094a25d0759d62bd56772a630914f543e644
|
7
|
+
data.tar.gz: c1c22bb010320fe5f48eb0c8b500e54b4557ca0f24461b832c3ae278c8dd9b0c6dc93f5682cf9b52d15b05673cd013c5546543e2f46e095b72fe46d64deb0aba
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,36 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.2.2: 2019-03-28
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Added `http-server` decomposer.
|
8
|
+
|
9
|
+
* `ChupaText::Data#max_body_size`: Added.
|
10
|
+
|
11
|
+
* `ChupaText::Data#max_body_size=`: Added.
|
12
|
+
|
13
|
+
* `ChupaText::Data#timeout`: Added.
|
14
|
+
|
15
|
+
* `ChupaText::Data#timeout=`: Added.
|
16
|
+
|
17
|
+
* `ChupaText::Data#limit_cpu`: Added.
|
18
|
+
|
19
|
+
* `ChupaText::Data#limit_cpu=`: Added.
|
20
|
+
|
21
|
+
* `ChupaText::Data#limit_as`: Added.
|
22
|
+
|
23
|
+
* `ChupaText::Data#limit_as=`: Added.
|
24
|
+
|
25
|
+
* `ChupaText::ExternalCommand`: Added support for soft timeout and limits.
|
26
|
+
|
27
|
+
* `ChupaText::Extractor`: Stopped receiving the max body size as an
|
28
|
+
option. Use `ChupaText::Data#max_body_size=` instead.
|
29
|
+
|
30
|
+
### Fixes
|
31
|
+
|
32
|
+
* Fixed decomposer choose logic.
|
33
|
+
|
3
34
|
## 1.2.1: 2019-03-04
|
4
35
|
|
5
36
|
### Improvements
|
data/lib/chupa-text/command/chupa-text.rb
CHANGED
@@ -196,7 +196,7 @@ module ChupaText
|
|
196
196
|
end
|
197
197
|
|
198
198
|
def create_extractor
|
199
|
-
extractor = Extractor.new
|
199
|
+
extractor = Extractor.new
|
200
200
|
extractor.apply_configuration(@configuration)
|
201
201
|
extractor
|
202
202
|
end
|
@@ -222,6 +222,7 @@ module ChupaText
|
|
222
222
|
data.mime_type = @mime_type if @mime_type
|
223
223
|
data.need_screenshot = @need_screenshot
|
224
224
|
data.expected_screenshot_size = @expected_screenshot_size
|
225
|
+
data.max_body_size = @max_body_size
|
225
226
|
data
|
226
227
|
end
|
227
228
|
|
data/lib/chupa-text/data.rb
CHANGED
@@ -65,6 +65,20 @@ module ChupaText
|
|
65
65
|
# @return [Array<Integer, Integer>] the expected screenshot size.
|
66
66
|
attr_accessor :expected_screenshot_size
|
67
67
|
|
68
|
+
# @return [Integer, nil] the max body size in bytes.
|
69
|
+
attr_accessor :max_body_size
|
70
|
+
|
71
|
+
# @return [Numeric, String, nil] the timeout on extraction.
|
72
|
+
attr_accessor :timeout
|
73
|
+
|
74
|
+
# @return [Numeric, String, nil] the max CPU time on extraction by
|
75
|
+
# external command.
|
76
|
+
attr_accessor :limit_cpu
|
77
|
+
|
78
|
+
# @return [Numeric, String, nil] the max memory on extraction by
|
79
|
+
# external command.
|
80
|
+
attr_accessor :limit_as
|
81
|
+
|
68
82
|
def initialize(options={})
|
69
83
|
@uri = nil
|
70
84
|
@body = nil
|
@@ -76,6 +90,10 @@ module ChupaText
|
|
76
90
|
@screenshot = nil
|
77
91
|
@need_screenshot = true
|
78
92
|
@expected_screenshot_size = [200, 200]
|
93
|
+
@max_body_size = nil
|
94
|
+
@timeout = nil
|
95
|
+
@limit_cpu = nil
|
96
|
+
@limit_as = nil
|
79
97
|
@options = options || {}
|
80
98
|
source_data = @options[:source_data]
|
81
99
|
if source_data
|
@@ -107,6 +125,10 @@ module ChupaText
|
|
107
125
|
end
|
108
126
|
self.need_screenshot = data.need_screenshot?
|
109
127
|
self.expected_screenshot_size = data.expected_screenshot_size
|
128
|
+
self.max_body_size = data.max_body_size
|
129
|
+
self.timeout = data.timeout
|
130
|
+
self.limit_cpu = data.limit_cpu
|
131
|
+
self.limit_as = data.limit_as
|
110
132
|
end
|
111
133
|
|
112
134
|
# @param [String, URI, nil] uri The URI for the data. If `uri` is
|
@@ -198,11 +220,11 @@ module ChupaText
|
|
198
220
|
@need_screenshot
|
199
221
|
end
|
200
222
|
|
201
|
-
def to_utf8_body_data
|
223
|
+
def to_utf8_body_data
|
202
224
|
b = nil
|
203
|
-
if max_body_size
|
225
|
+
if @max_body_size
|
204
226
|
open do |input|
|
205
|
-
b = input.read(max_body_size)
|
227
|
+
b = input.read(@max_body_size)
|
206
228
|
end
|
207
229
|
else
|
208
230
|
b = body
|
@@ -211,7 +233,7 @@ module ChupaText
|
|
211
233
|
|
212
234
|
converter = UTF8Converter.new(b)
|
213
235
|
utf8_body = converter.convert
|
214
|
-
if max_body_size.nil? and b.equal?(utf8_body)
|
236
|
+
if @max_body_size.nil? and b.equal?(utf8_body)
|
215
237
|
self
|
216
238
|
else
|
217
239
|
TextData.new(utf8_body, source_data: self)
|
data/lib/chupa-text/decomposers/http-server.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "net/http"
|
18
|
+
require "pp"
|
19
|
+
require "uri"
|
20
|
+
|
21
|
+
module ChupaText
|
22
|
+
module Decomposers
|
23
|
+
class HTTPServer < Decomposer
|
24
|
+
include Loggable
|
25
|
+
|
26
|
+
registry.register("http-server", self)
|
27
|
+
|
28
|
+
@@default_url = nil
|
29
|
+
class << self
|
30
|
+
def default_url
|
31
|
+
@@default_url
|
32
|
+
end
|
33
|
+
|
34
|
+
def default_url=(url)
|
35
|
+
@@default_url = url
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def initialize(options)
|
40
|
+
super
|
41
|
+
@url = @options[:url] ||
|
42
|
+
self.class.default_url ||
|
43
|
+
ENV["CHUPA_TEXT_HTTP_SERVER_URL"]
|
44
|
+
@url = URI(@url) if @url
|
45
|
+
end
|
46
|
+
|
47
|
+
def target?(data)
|
48
|
+
return false unless @url
|
49
|
+
return false if data.text_plain?
|
50
|
+
true
|
51
|
+
end
|
52
|
+
|
53
|
+
def target_score(data)
|
54
|
+
if target?(data)
|
55
|
+
100
|
56
|
+
else
|
57
|
+
nil
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def decompose(data, &block)
|
62
|
+
http = Net::HTTP.new(@url.host, @url.port)
|
63
|
+
http.use_ssl = true if @url.is_a?(URI::HTTPS)
|
64
|
+
if data.timeout.is_a?(Numeric)
|
65
|
+
http.open_timeout = data.timeout * 1.5
|
66
|
+
http.read_timeout = data.timeout * 1.5
|
67
|
+
http.write_timeout = data.timeout * 1.5
|
68
|
+
end
|
69
|
+
begin
|
70
|
+
http.start do
|
71
|
+
process_request(http, data, &block)
|
72
|
+
end
|
73
|
+
rescue SystemCallError => error
|
74
|
+
error do
|
75
|
+
message = "#{log_tag}[connection] "
|
76
|
+
message << "Failed to process data in server: "
|
77
|
+
message << "#{@url}: "
|
78
|
+
message << "#{error.class}: #{error.message}\n"
|
79
|
+
message << error.backtrace.join("\n")
|
80
|
+
message
|
81
|
+
end
|
82
|
+
rescue Net::ReadTimeout => error
|
83
|
+
error do
|
84
|
+
message = "#{log_tag}[timeout] "
|
85
|
+
message << "Failed to process data in server: "
|
86
|
+
message << "#{@url}: "
|
87
|
+
message << "#{error.class}: #{error.message}\n"
|
88
|
+
message << error.backtrace.join("\n")
|
89
|
+
message
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
private
|
95
|
+
def process_request(http, data)
|
96
|
+
request = Net::HTTP::Post.new(@url)
|
97
|
+
request["transfer-encoding"] = "chunked"
|
98
|
+
data.open do |input|
|
99
|
+
request.set_form(build_parameters(data, input),
|
100
|
+
"multipart/form-data")
|
101
|
+
response = http.request(request)
|
102
|
+
case response
|
103
|
+
when Net::HTTPOK
|
104
|
+
extracted = JSON.parse(response.body)
|
105
|
+
(extracted["texts"] || []).each do |text|
|
106
|
+
text_data = TextData.new(text["body"], source_data: data)
|
107
|
+
text.each do |key, value|
|
108
|
+
next if key == "body"
|
109
|
+
text_data[key] = value
|
110
|
+
end
|
111
|
+
yield(text_data)
|
112
|
+
end
|
113
|
+
else
|
114
|
+
error do
|
115
|
+
message = "#{log_tag} Failed to process data in server: "
|
116
|
+
message << "#{@url}: "
|
117
|
+
message << "#{response.code}: #{response.message.strip}\n"
|
118
|
+
case response.content_type
|
119
|
+
when "application/json"
|
120
|
+
PP.pp(JSON.parse(response.body), message)
|
121
|
+
else
|
122
|
+
message << response.body
|
123
|
+
end
|
124
|
+
message
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def build_parameters(data, input)
|
131
|
+
parameters = []
|
132
|
+
[
|
133
|
+
["timeout",
|
134
|
+
data.timeout || ChupaText::ExternalCommand.default_timeout],
|
135
|
+
["limit_cpu",
|
136
|
+
data.limit_cpu || ChupaText::ExternalCommand.default_limit_cpu],
|
137
|
+
["limit_as",
|
138
|
+
data.limit_as || ChupaText::ExternalCommand.default_limit_as],
|
139
|
+
["max_body_size", data.max_body_size],
|
140
|
+
].each do |key, value|
|
141
|
+
next if value.nil?
|
142
|
+
parameters << [key, StringIO.new(value.to_s)]
|
143
|
+
end
|
144
|
+
parameters << [
|
145
|
+
"data",
|
146
|
+
input,
|
147
|
+
{
|
148
|
+
filename: data.path.to_s,
|
149
|
+
content_type: data.mime_type,
|
150
|
+
},
|
151
|
+
]
|
152
|
+
parameters
|
153
|
+
end
|
154
|
+
|
155
|
+
def log_tag
|
156
|
+
"[decomposer][http-server]"
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
data/lib/chupa-text/external-command.rb
CHANGED
@@ -69,13 +69,19 @@ module ChupaText
|
|
69
69
|
else
|
70
70
|
options = {}
|
71
71
|
end
|
72
|
+
data = options[:data]
|
72
73
|
pid = spawn(options[:env] || {},
|
73
74
|
@path.to_s,
|
74
75
|
*arguments,
|
75
|
-
spawn_options(options[:spawn_options]))
|
76
|
+
spawn_options(options[:spawn_options], data))
|
77
|
+
if data
|
78
|
+
soft_timeout = data.timeout
|
79
|
+
else
|
80
|
+
soft_timeout = nil
|
81
|
+
end
|
76
82
|
status = nil
|
77
83
|
begin
|
78
|
-
status = wait_process(pid, options[:timeout])
|
84
|
+
status = wait_process(pid, options[:timeout], soft_timeout)
|
79
85
|
ensure
|
80
86
|
unless status
|
81
87
|
begin
|
@@ -99,28 +105,44 @@ module ChupaText
|
|
99
105
|
end
|
100
106
|
|
101
107
|
private
|
102
|
-
def spawn_options(user_options)
|
108
|
+
def spawn_options(user_options, data)
|
103
109
|
options = (user_options || {}).dup
|
104
|
-
apply_default_spawn_limit(options, :cpu, :time)
|
105
|
-
apply_default_spawn_limit(options, :as, :size)
|
110
|
+
if data
|
111
|
+
soft_limit_cpu = data.limit_cpu
|
112
|
+
soft_limit_as = data.limit_as
|
113
|
+
else
|
114
|
+
soft_limit_cpu = nil
|
115
|
+
soft_limit_as = nil
|
116
|
+
end
|
117
|
+
apply_default_spawn_limit(options, soft_limit_cpu, :cpu, :time)
|
118
|
+
apply_default_spawn_limit(options, soft_limit_as, :as, :size)
|
106
119
|
options
|
107
120
|
end
|
108
121
|
|
109
|
-
def apply_default_spawn_limit(options, key, type)
|
122
|
+
def apply_default_spawn_limit(options, soft_value, key, type)
|
110
123
|
# TODO: Workaround for Ruby 2.3.3p222
|
111
124
|
case key
|
112
125
|
when :cpu
|
113
126
|
option_key = :rlimit_cpu
|
127
|
+
unit = "s"
|
114
128
|
when :as
|
115
129
|
option_key = :rlimit_as
|
130
|
+
unit = ""
|
116
131
|
else
|
117
132
|
option_key = :"rlimit_#{key}"
|
133
|
+
unit = ""
|
118
134
|
end
|
119
135
|
return if options[option_key]
|
120
136
|
|
121
137
|
tag = "[limit][#{key}]"
|
122
138
|
value = self.class.__send__("default_limit_#{key}")
|
123
139
|
value = __send__("parse_#{type}", tag, value)
|
140
|
+
soft_value = __send__("parse_#{type}", tag, soft_value)
|
141
|
+
if value
|
142
|
+
value = soft_value if soft_value and soft_value < value
|
143
|
+
else
|
144
|
+
value = soft_value
|
145
|
+
end
|
124
146
|
return if value.nil?
|
125
147
|
rlimit_number = Process.const_get("RLIMIT_#{key.to_s.upcase}")
|
126
148
|
soft_limit, hard_limit = Process.getrlimit(rlimit_number)
|
@@ -129,7 +151,7 @@ module ChupaText
|
|
129
151
|
return nil
|
130
152
|
end
|
131
153
|
limit_info = "soft-limit:#{soft_limit}, hard-limit:#{hard_limit}"
|
132
|
-
info("#{log_tag}#{tag}[set] <#{value}>(#{limit_info})")
|
154
|
+
info("#{log_tag}#{tag}[set] <#{value}#{unit}>(#{limit_info})")
|
133
155
|
|
134
156
|
options[option_key] = value
|
135
157
|
end
|
@@ -169,12 +191,21 @@ module ChupaText
|
|
169
191
|
scale = 1
|
170
192
|
case value
|
171
193
|
when /GB?\z/i
|
194
|
+
scale = 1000 ** 3
|
195
|
+
number = $PREMATCH
|
196
|
+
when /GiB?\z/i
|
172
197
|
scale = 1024 ** 3
|
173
198
|
number = $PREMATCH
|
174
199
|
when /MB?\z/i
|
200
|
+
scale = 1000 ** 2
|
201
|
+
number = $PREMATCH
|
202
|
+
when /MiB?\z/i
|
175
203
|
scale = 1024 ** 2
|
176
204
|
number = $PREMATCH
|
177
|
-
when /KB?\z/i
|
205
|
+
when /[kK]B?\z/i
|
206
|
+
scale = 1000 ** 1
|
207
|
+
number = $PREMATCH
|
208
|
+
when /KiB?\z/i
|
178
209
|
scale = 1024 ** 1
|
179
210
|
number = $PREMATCH
|
180
211
|
when /B?\z/i
|
@@ -227,9 +258,15 @@ module ChupaText
|
|
227
258
|
warn("#{log_tag}#{tag}[invalid] <#{value}>(#{type})")
|
228
259
|
end
|
229
260
|
|
230
|
-
def wait_process(pid, timeout)
|
261
|
+
def wait_process(pid, timeout, soft_timeout)
|
231
262
|
tag = "[timeout]"
|
232
263
|
timeout = parse_time(tag, timeout || self.class.default_timeout)
|
264
|
+
soft_timeout = parse_time(tag, soft_timeout)
|
265
|
+
if timeout
|
266
|
+
timeout = soft_timeout if soft_timeout and soft_timeout < timeout
|
267
|
+
else
|
268
|
+
timeout = soft_timeout
|
269
|
+
end
|
233
270
|
if timeout
|
234
271
|
info("#{log_tag}#{tag}[use] <#{timeout}s>: <#{pid}>")
|
235
272
|
status = wait_process_timeout(pid, timeout)
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -21,9 +21,8 @@ module ChupaText
|
|
21
21
|
class Extractor
|
22
22
|
include Loggable
|
23
23
|
|
24
|
-
def initialize
|
24
|
+
def initialize
|
25
25
|
@decomposers = []
|
26
|
-
@max_body_size = max_body_size
|
27
26
|
end
|
28
27
|
|
29
28
|
# Sets the extractor up by the configuration. It adds decomposers
|
@@ -79,7 +78,7 @@ module ChupaText
|
|
79
78
|
candidates << [score, decomposer]
|
80
79
|
end
|
81
80
|
return nil if candidates.empty?
|
82
|
-
candidate = candidates.sort_by {|score, _| score}.first
|
81
|
+
candidate = candidates.sort_by {|score, _| -score}.first
|
83
82
|
candidate[1]
|
84
83
|
end
|
85
84
|
|
@@ -91,11 +90,11 @@ module ChupaText
|
|
91
90
|
if decomposer.nil?
|
92
91
|
if target.text_plain?
|
93
92
|
debug {"#{log_tag}[extract][text-plain]"}
|
94
|
-
yield(target.to_utf8_body_data
|
93
|
+
yield(target.to_utf8_body_data)
|
95
94
|
else
|
96
95
|
debug {"#{log_tag}[extract][decomposer] not found"}
|
97
96
|
if target.text?
|
98
|
-
yield(target.to_utf8_body_data
|
97
|
+
yield(target.to_utf8_body_data)
|
99
98
|
end
|
100
99
|
end
|
101
100
|
else
|
data/lib/chupa-text/version.rb
CHANGED
data/test/decomposers/test-csv.rb
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
class TestDecomposersCSV< Test::Unit::TestCase
|
17
|
+
class TestDecomposersCSV < Test::Unit::TestCase
|
18
18
|
include Helper
|
19
19
|
|
20
20
|
def setup
|
data/test/decomposers/test-http-server.rb
ADDED
@@ -0,0 +1,175 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
class TestDecomposersHTTPServer < Test::Unit::TestCase
|
18
|
+
include Helper
|
19
|
+
|
20
|
+
def setup
|
21
|
+
ChupaText::Decomposers::HTTPServer.default_url = nil
|
22
|
+
setup_server
|
23
|
+
setup_data
|
24
|
+
setup_decomposer
|
25
|
+
end
|
26
|
+
|
27
|
+
def setup_server
|
28
|
+
@port = 40080
|
29
|
+
@path = "/extraction.json"
|
30
|
+
@server_url = "http://127.0.0.1:#{@port}#{@path}"
|
31
|
+
logger = WEBrick::Log.new
|
32
|
+
logger.level = logger.class::ERROR
|
33
|
+
@server = WEBrick::HTTPServer.new(Port: @port,
|
34
|
+
Logger: logger,
|
35
|
+
AccessLog: [])
|
36
|
+
@response_status = 200
|
37
|
+
@server.mount_proc(@path) do |request, response|
|
38
|
+
sleep(@timeout * 2) if @timeout
|
39
|
+
response.status = @response_status
|
40
|
+
response.content_type = "application/json"
|
41
|
+
response.body = JSON.generate(@response)
|
42
|
+
end
|
43
|
+
@server_thread = Thread.new do
|
44
|
+
@server.start
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def setup_data
|
49
|
+
@input_data = <<-CSV
|
50
|
+
Hello,World
|
51
|
+
Ruby,ChupaText
|
52
|
+
CSV
|
53
|
+
@input_mime_type = "text/csv"
|
54
|
+
@input_path = "/tmp/hello.csv"
|
55
|
+
@timeout = nil
|
56
|
+
@extracted_text = @input_data.gsub(/,/, "\t")
|
57
|
+
@extracted_path = @input_path.gsub(/\.csv\z/, ".txt")
|
58
|
+
@response = {
|
59
|
+
"mime-type" => @input_mime_type,
|
60
|
+
"uri" => "file://#{@input_path}",
|
61
|
+
"path" => @input_path,
|
62
|
+
"size" => @input_data.bytesize,
|
63
|
+
"texts" => [
|
64
|
+
{
|
65
|
+
"mime-type" => "text/plain",
|
66
|
+
"uri" => "file://#{@extracted_path}",
|
67
|
+
"path" => @extracted_path,
|
68
|
+
"size" => @extracted_text.bytesize,
|
69
|
+
"source-mime-types" => [
|
70
|
+
@input_mime_type,
|
71
|
+
],
|
72
|
+
"body" => @extracted_text,
|
73
|
+
},
|
74
|
+
],
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
def setup_decomposer
|
79
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new(:url => @server_url)
|
80
|
+
end
|
81
|
+
|
82
|
+
def teardown
|
83
|
+
teardown_server
|
84
|
+
end
|
85
|
+
|
86
|
+
def teardown_server
|
87
|
+
@server.shutdown
|
88
|
+
@server_thread.join
|
89
|
+
end
|
90
|
+
|
91
|
+
sub_test_case("decompose") do
|
92
|
+
def test_success
|
93
|
+
assert_equal([@extracted_text],
|
94
|
+
decompose.collect(&:body))
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_not_ok
|
98
|
+
@response_status = 404
|
99
|
+
messages = capture_log do
|
100
|
+
assert_equal([], decompose.collect(&:body))
|
101
|
+
end
|
102
|
+
assert_equal([
|
103
|
+
[
|
104
|
+
:error,
|
105
|
+
"[decomposer][http-server] " +
|
106
|
+
"Failed to process data in server: " +
|
107
|
+
"#{@server_url}: " +
|
108
|
+
"#{@response_status}: Not Found",
|
109
|
+
],
|
110
|
+
],
|
111
|
+
messages)
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_no_server
|
115
|
+
no_server_url = "http://127.0.0.1:2929/extraction.json"
|
116
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new(:url => no_server_url)
|
117
|
+
messages = capture_log do
|
118
|
+
assert_equal([], decompose.collect(&:body))
|
119
|
+
end
|
120
|
+
messages = messages.collect do |level, message|
|
121
|
+
[level, message.gsub(/Errno::.*\z/, "")]
|
122
|
+
end
|
123
|
+
assert_equal([
|
124
|
+
[
|
125
|
+
:error,
|
126
|
+
"[decomposer][http-server][connection] " +
|
127
|
+
"Failed to process data in server: " +
|
128
|
+
"#{no_server_url}: ",
|
129
|
+
],
|
130
|
+
],
|
131
|
+
messages)
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_read_timeout
|
135
|
+
@timeout = 0.1
|
136
|
+
messages = capture_log do
|
137
|
+
assert_equal([], decompose.collect(&:body))
|
138
|
+
end
|
139
|
+
messages = messages.collect do |level, message|
|
140
|
+
[level, message.gsub(/Net::.*\z/, "")]
|
141
|
+
end
|
142
|
+
assert_equal([
|
143
|
+
[
|
144
|
+
:error,
|
145
|
+
"[decomposer][http-server][timeout] " +
|
146
|
+
"Failed to process data in server: " +
|
147
|
+
"#{@server_url}: ",
|
148
|
+
],
|
149
|
+
],
|
150
|
+
messages)
|
151
|
+
end
|
152
|
+
|
153
|
+
def test_default_url
|
154
|
+
ChupaText::Decomposers::HTTPServer.default_url = @server_url
|
155
|
+
@decomposer = ChupaText::Decomposers::HTTPServer.new({})
|
156
|
+
assert_equal([@extracted_text],
|
157
|
+
decompose.collect(&:body))
|
158
|
+
end
|
159
|
+
|
160
|
+
private
|
161
|
+
def decompose
|
162
|
+
data = ChupaText::Data.new
|
163
|
+
data.path = @input_path
|
164
|
+
data.mime_type = @input_mime_type
|
165
|
+
data.body = @input_data
|
166
|
+
data.timeout = @timeout
|
167
|
+
|
168
|
+
decomposed = []
|
169
|
+
@decomposer.decompose(data) do |decomposed_data|
|
170
|
+
decomposed << decomposed_data
|
171
|
+
end
|
172
|
+
decomposed
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
data/test/helper.rb
CHANGED
@@ -15,8 +15,10 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
require "pathname"
|
18
|
+
require "rbconfig"
|
18
19
|
require "tempfile"
|
19
20
|
require "uri"
|
21
|
+
require "webrick"
|
20
22
|
|
21
23
|
module Helper
|
22
24
|
def fixture_path(*components)
|
@@ -39,4 +41,8 @@ module Helper
|
|
39
41
|
[level, message]
|
40
42
|
end
|
41
43
|
end
|
44
|
+
|
45
|
+
def ruby
|
46
|
+
RbConfig.ruby
|
47
|
+
end
|
42
48
|
end
|
data/test/test-external-command.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2014 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2014-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,12 +14,8 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "rbconfig"
|
18
|
-
|
19
17
|
class TestExternalCommand < Test::Unit::TestCase
|
20
|
-
def ruby
|
21
|
-
RbConfig.ruby
|
22
|
-
end
|
18
|
+
include Helper
|
23
19
|
|
24
20
|
def create_command(command)
|
25
21
|
ChupaText::ExternalCommand.new(command)
|
@@ -76,4 +72,321 @@ class TestExternalCommand < Test::Unit::TestCase
|
|
76
72
|
assert_false(exist?("nonexistent"))
|
77
73
|
end
|
78
74
|
end
|
75
|
+
|
76
|
+
class TestTimeout < self
|
77
|
+
def setup
|
78
|
+
@data = ChupaText::TextData.new("Hello")
|
79
|
+
timeout = ChupaText::ExternalCommand.default_timeout
|
80
|
+
begin
|
81
|
+
yield
|
82
|
+
ensure
|
83
|
+
ChupaText::ExternalCommand.default_timeout = timeout
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def run_command(options={})
|
88
|
+
IO.pipe do |input, output|
|
89
|
+
command = create_command(ruby)
|
90
|
+
command.run("-e", "puts(Process.pid)",
|
91
|
+
options.merge(data: @data,
|
92
|
+
spawn_options: {out: output}))
|
93
|
+
input.gets.chomp
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_option
|
98
|
+
pid = nil
|
99
|
+
messages = capture_log do
|
100
|
+
pid = run_command(timeout: "60s")
|
101
|
+
end
|
102
|
+
assert_equal([
|
103
|
+
[
|
104
|
+
:info,
|
105
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
106
|
+
]
|
107
|
+
],
|
108
|
+
messages)
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_data_not_use
|
112
|
+
@data.timeout = "90s"
|
113
|
+
pid = nil
|
114
|
+
messages = capture_log do
|
115
|
+
pid = run_command(timeout: "60s")
|
116
|
+
end
|
117
|
+
assert_equal([
|
118
|
+
[
|
119
|
+
:info,
|
120
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
121
|
+
]
|
122
|
+
],
|
123
|
+
messages)
|
124
|
+
end
|
125
|
+
|
126
|
+
def test_data_use
|
127
|
+
@data.timeout = "30s"
|
128
|
+
pid = nil
|
129
|
+
messages = capture_log do
|
130
|
+
pid = run_command(timeout: "60s")
|
131
|
+
end
|
132
|
+
assert_equal([
|
133
|
+
[
|
134
|
+
:info,
|
135
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
136
|
+
]
|
137
|
+
],
|
138
|
+
messages)
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_data_only
|
142
|
+
@data.timeout = "30s"
|
143
|
+
pid = nil
|
144
|
+
messages = capture_log do
|
145
|
+
pid = run_command
|
146
|
+
end
|
147
|
+
assert_equal([
|
148
|
+
[
|
149
|
+
:info,
|
150
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
151
|
+
]
|
152
|
+
],
|
153
|
+
messages)
|
154
|
+
end
|
155
|
+
|
156
|
+
def test_default
|
157
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
158
|
+
pid = nil
|
159
|
+
messages = capture_log do
|
160
|
+
pid = run_command
|
161
|
+
end
|
162
|
+
assert_equal([
|
163
|
+
[
|
164
|
+
:info,
|
165
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
166
|
+
]
|
167
|
+
],
|
168
|
+
messages)
|
169
|
+
end
|
170
|
+
|
171
|
+
def test_default_data_not_use
|
172
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
173
|
+
@data.timeout = "90s"
|
174
|
+
pid = nil
|
175
|
+
messages = capture_log do
|
176
|
+
pid = run_command
|
177
|
+
end
|
178
|
+
assert_equal([
|
179
|
+
[
|
180
|
+
:info,
|
181
|
+
"[external-command][timeout][use] <60.0s>: <#{pid}>",
|
182
|
+
]
|
183
|
+
],
|
184
|
+
messages)
|
185
|
+
end
|
186
|
+
|
187
|
+
def test_default_data_use
|
188
|
+
ChupaText::ExternalCommand.default_timeout = "60s"
|
189
|
+
@data.timeout = "30s"
|
190
|
+
pid = nil
|
191
|
+
messages = capture_log do
|
192
|
+
pid = run_command
|
193
|
+
end
|
194
|
+
assert_equal([
|
195
|
+
[
|
196
|
+
:info,
|
197
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
198
|
+
]
|
199
|
+
],
|
200
|
+
messages)
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_default_data_only
|
204
|
+
@data.timeout = "30s"
|
205
|
+
pid = nil
|
206
|
+
messages = capture_log do
|
207
|
+
pid = run_command
|
208
|
+
end
|
209
|
+
assert_equal([
|
210
|
+
[
|
211
|
+
:info,
|
212
|
+
"[external-command][timeout][use] <30.0s>: <#{pid}>",
|
213
|
+
]
|
214
|
+
],
|
215
|
+
messages)
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
class TestLimitCPU < self
|
220
|
+
def setup
|
221
|
+
@data = ChupaText::TextData.new("Hello")
|
222
|
+
limit_cpu = ChupaText::ExternalCommand.default_limit_cpu
|
223
|
+
begin
|
224
|
+
yield
|
225
|
+
ensure
|
226
|
+
ChupaText::ExternalCommand.default_limit_cpu = limit_cpu
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
def run_command(spawn_options={})
|
231
|
+
command = create_command(ruby)
|
232
|
+
command.run("-e", "true",
|
233
|
+
data: @data,
|
234
|
+
spawn_options: spawn_options)
|
235
|
+
end
|
236
|
+
|
237
|
+
def test_default
|
238
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
239
|
+
messages = capture_log do
|
240
|
+
run_command
|
241
|
+
end
|
242
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
243
|
+
assert_equal([
|
244
|
+
[
|
245
|
+
:info,
|
246
|
+
"[external-command][limit][cpu][set] <60.0s>" +
|
247
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
248
|
+
]
|
249
|
+
],
|
250
|
+
messages)
|
251
|
+
end
|
252
|
+
|
253
|
+
def test_default_data_not_use
|
254
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
255
|
+
@data.limit_cpu = "90s"
|
256
|
+
messages = capture_log do
|
257
|
+
run_command
|
258
|
+
end
|
259
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
260
|
+
assert_equal([
|
261
|
+
[
|
262
|
+
:info,
|
263
|
+
"[external-command][limit][cpu][set] <60.0s>" +
|
264
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
265
|
+
]
|
266
|
+
],
|
267
|
+
messages)
|
268
|
+
end
|
269
|
+
|
270
|
+
def test_default_data_use
|
271
|
+
ChupaText::ExternalCommand.default_limit_cpu = "60s"
|
272
|
+
@data.limit_cpu = "30s"
|
273
|
+
messages = capture_log do
|
274
|
+
run_command
|
275
|
+
end
|
276
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
277
|
+
assert_equal([
|
278
|
+
[
|
279
|
+
:info,
|
280
|
+
"[external-command][limit][cpu][set] <30.0s>" +
|
281
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
282
|
+
]
|
283
|
+
],
|
284
|
+
messages)
|
285
|
+
end
|
286
|
+
|
287
|
+
def test_default_data_only
|
288
|
+
@data.limit_cpu = "30s"
|
289
|
+
messages = capture_log do
|
290
|
+
run_command
|
291
|
+
end
|
292
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_CPU)
|
293
|
+
assert_equal([
|
294
|
+
[
|
295
|
+
:info,
|
296
|
+
"[external-command][limit][cpu][set] <30.0s>" +
|
297
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
298
|
+
]
|
299
|
+
],
|
300
|
+
messages)
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
class TestLimitAS < self
|
305
|
+
def setup
|
306
|
+
@data = ChupaText::TextData.new("Hello")
|
307
|
+
limit_as = ChupaText::ExternalCommand.default_limit_as
|
308
|
+
begin
|
309
|
+
yield
|
310
|
+
ensure
|
311
|
+
ChupaText::ExternalCommand.default_limit_as = limit_as
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
def run_command(spawn_options={})
|
316
|
+
command = create_command(ruby)
|
317
|
+
command.run("-e", "true",
|
318
|
+
data: @data,
|
319
|
+
spawn_options: spawn_options)
|
320
|
+
end
|
321
|
+
|
322
|
+
def test_default
|
323
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
324
|
+
messages = capture_log do
|
325
|
+
run_command
|
326
|
+
end
|
327
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
328
|
+
assert_equal([
|
329
|
+
[
|
330
|
+
:info,
|
331
|
+
"[external-command][limit][as][set] " +
|
332
|
+
"<#{100 * 1024 * 1024}>" +
|
333
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
334
|
+
]
|
335
|
+
],
|
336
|
+
messages)
|
337
|
+
end
|
338
|
+
|
339
|
+
def test_default_data_not_use
|
340
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
341
|
+
@data.limit_as = "150MiB"
|
342
|
+
messages = capture_log do
|
343
|
+
run_command
|
344
|
+
end
|
345
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
346
|
+
assert_equal([
|
347
|
+
[
|
348
|
+
:info,
|
349
|
+
"[external-command][limit][as][set] " +
|
350
|
+
"<#{100 * 1024 * 1024}>" +
|
351
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
352
|
+
]
|
353
|
+
],
|
354
|
+
messages)
|
355
|
+
end
|
356
|
+
|
357
|
+
def test_default_soft_use
|
358
|
+
ChupaText::ExternalCommand.default_limit_as = "100MiB"
|
359
|
+
@data.limit_as = "50MiB"
|
360
|
+
messages = capture_log do
|
361
|
+
run_command
|
362
|
+
end
|
363
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
364
|
+
assert_equal([
|
365
|
+
[
|
366
|
+
:info,
|
367
|
+
"[external-command][limit][as][set] " +
|
368
|
+
"<#{50 * 1024 * 1024}>" +
|
369
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
370
|
+
]
|
371
|
+
],
|
372
|
+
messages)
|
373
|
+
end
|
374
|
+
|
375
|
+
def test_default_soft_only
|
376
|
+
@data.limit_as = "50MiB"
|
377
|
+
messages = capture_log do
|
378
|
+
run_command
|
379
|
+
end
|
380
|
+
soft_limit, hard_limit = Process.getrlimit(Process::RLIMIT_AS)
|
381
|
+
assert_equal([
|
382
|
+
[
|
383
|
+
:info,
|
384
|
+
"[external-command][limit][as][set] " +
|
385
|
+
"<#{50 * 1024 * 1024}>" +
|
386
|
+
"(soft-limit:#{soft_limit}, hard-limit:#{hard_limit})",
|
387
|
+
]
|
388
|
+
],
|
389
|
+
messages)
|
390
|
+
end
|
391
|
+
end
|
79
392
|
end
|
data/test/test-extractor.rb
CHANGED
@@ -231,10 +231,11 @@ class TestExtractor < Test::Unit::TestCase
|
|
231
231
|
|
232
232
|
sub_test_case("max body size") do
|
233
233
|
def test_last_invalid
|
234
|
-
@extractor = ChupaText::Extractor.new
|
234
|
+
@extractor = ChupaText::Extractor.new
|
235
235
|
data = ChupaText::Data.new
|
236
236
|
data.mime_type = "text/plain"
|
237
237
|
data.body = "こん"
|
238
|
+
data.max_body_size = 5
|
238
239
|
assert_equal(["こ"], extract(data))
|
239
240
|
end
|
240
241
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-03-04 00:00:00.000000000 Z
|
11
|
+
date: 2019-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- lib/chupa-text/decomposers.rb
|
160
160
|
- lib/chupa-text/decomposers/csv.rb
|
161
161
|
- lib/chupa-text/decomposers/gzip.rb
|
162
|
+
- lib/chupa-text/decomposers/http-server.rb
|
162
163
|
- lib/chupa-text/decomposers/office-open-xml-document.rb
|
163
164
|
- lib/chupa-text/decomposers/office-open-xml-presentation.rb
|
164
165
|
- lib/chupa-text/decomposers/office-open-xml-workbook.rb
|
@@ -198,6 +199,7 @@ files:
|
|
198
199
|
- test/command/test-chupa-text.rb
|
199
200
|
- test/decomposers/test-csv.rb
|
200
201
|
- test/decomposers/test-gzip.rb
|
202
|
+
- test/decomposers/test-http-server.rb
|
201
203
|
- test/decomposers/test-office-open-xml-document.rb
|
202
204
|
- test/decomposers/test-office-open-xml-presentation.rb
|
203
205
|
- test/decomposers/test-office-open-xml-workbook.rb
|