chupa-text 1.2.6 → 1.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/error.rb +9 -0
- data/lib/chupa-text/extractor.rb +27 -8
- data/lib/chupa-text/version.rb +1 -1
- data/test/test-extractor.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 415bf5c173d68ce1e887dc97e014823cdc9f747bb3d191a714b74e87ee4a6fc7
|
4
|
+
data.tar.gz: 41da0888b006b66e3134bb77b3478a9c5b8fb99958ac29ecc08d5a8c4d79829a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 63097bace8113b4ed2d3634cc09e9d670d08918109f8fe27cdf26f4c4b05a2ec7d28e200b6ed6ec5417d43d9fbbb908a3eb807591fd36d6bc232ec90961c0253
|
7
|
+
data.tar.gz: 59644ec76de0529616ffe7e3f31e7b3c5792ee31e62a7a0ece851a77b2e8d8add52aafbefa0401005201e8780608e349a00cd43de08f04cb7bf3a4eeea115cea
|
data/doc/text/news.md
CHANGED
data/lib/chupa-text/error.rb
CHANGED
@@ -53,4 +53,13 @@ module ChupaText
|
|
53
53
|
super("Unknown encoding data: <#{data.uri}>(#{data.mime_type}): <#{encoding}>")
|
54
54
|
end
|
55
55
|
end
|
56
|
+
|
57
|
+
class TimeoutError < Error
|
58
|
+
attr_reader :data, :timeout
|
59
|
+
def initialize(data, timeout)
|
60
|
+
@data = data
|
61
|
+
@timeout = timeout
|
62
|
+
super("Timeout error: <#{data.uri}>(#{data.mime_type}): <#{timeout}>")
|
63
|
+
end
|
64
|
+
end
|
56
65
|
end
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -16,6 +16,7 @@
|
|
16
16
|
|
17
17
|
require "pathname"
|
18
18
|
require "uri"
|
19
|
+
require "timeout"
|
19
20
|
|
20
21
|
module ChupaText
|
21
22
|
class Extractor
|
@@ -103,19 +104,37 @@ module ChupaText
|
|
103
104
|
end
|
104
105
|
else
|
105
106
|
debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
107
|
+
with_timeout(target) do
|
108
|
+
decomposer.decompose(target) do |decomposed|
|
109
|
+
begin
|
110
|
+
debug do
|
111
|
+
"#{log_tag}[extract][decomposed] " +
|
112
|
+
"#{decomposer.class}: " +
|
113
|
+
"<#{target.uri}>: " +
|
114
|
+
"<#{target.mime_type}> -> <#{decomposed.mime_type}>"
|
115
|
+
end
|
116
|
+
extract_recursive(decomposed, &block)
|
117
|
+
ensure
|
118
|
+
decomposed.release
|
119
|
+
end
|
112
120
|
end
|
113
|
-
extract_recursive(decomposed, &block)
|
114
|
-
decomposed.release
|
115
121
|
end
|
116
122
|
end
|
117
123
|
end
|
118
124
|
|
125
|
+
def with_timeout(data, &block)
|
126
|
+
timeout = data.timeout
|
127
|
+
if timeout
|
128
|
+
begin
|
129
|
+
Timeout.timeout(timeout, &block)
|
130
|
+
rescue Timeout::Error
|
131
|
+
raise TimeoutError.new(data, timeout)
|
132
|
+
end
|
133
|
+
else
|
134
|
+
yield
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
119
138
|
def log_tag
|
120
139
|
"[extractor]"
|
121
140
|
end
|
data/lib/chupa-text/version.rb
CHANGED
data/test/test-extractor.rb
CHANGED
@@ -76,6 +76,7 @@ class TestExtractor < Test::Unit::TestCase
|
|
76
76
|
extracted = ChupaText::Data.new
|
77
77
|
extracted.mime_type = "text/plain"
|
78
78
|
extracted.body = data.body.gsub(/<.+?>/, "")
|
79
|
+
sleep(data.timeout * 2) if data.timeout
|
79
80
|
yield(extracted)
|
80
81
|
end
|
81
82
|
end
|
@@ -92,6 +93,17 @@ class TestExtractor < Test::Unit::TestCase
|
|
92
93
|
data.body = "<html><body>Hello</body></html>"
|
93
94
|
assert_equal(["Hello"], extract(data))
|
94
95
|
end
|
96
|
+
|
97
|
+
def test_timeout
|
98
|
+
data = ChupaText::Data.new
|
99
|
+
data.mime_type = "text/html"
|
100
|
+
data.body = "<html><body>Hello</body></html>"
|
101
|
+
data.timeout = 0.0001
|
102
|
+
error = ChupaText::TimeoutError.new(data, data.timeout)
|
103
|
+
assert_raise(error) do
|
104
|
+
extract(data)
|
105
|
+
end
|
106
|
+
end
|
95
107
|
end
|
96
108
|
|
97
109
|
sub_test_case("multi decomposed") do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.6
|
4
|
+
version: 1.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-06-
|
11
|
+
date: 2019-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|