chupa-text 1.2.6 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/error.rb +9 -0
- data/lib/chupa-text/extractor.rb +27 -8
- data/lib/chupa-text/version.rb +1 -1
- data/test/test-extractor.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 415bf5c173d68ce1e887dc97e014823cdc9f747bb3d191a714b74e87ee4a6fc7
|
|
4
|
+
data.tar.gz: 41da0888b006b66e3134bb77b3478a9c5b8fb99958ac29ecc08d5a8c4d79829a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 63097bace8113b4ed2d3634cc09e9d670d08918109f8fe27cdf26f4c4b05a2ec7d28e200b6ed6ec5417d43d9fbbb908a3eb807591fd36d6bc232ec90961c0253
|
|
7
|
+
data.tar.gz: 59644ec76de0529616ffe7e3f31e7b3c5792ee31e62a7a0ece851a77b2e8d8add52aafbefa0401005201e8780608e349a00cd43de08f04cb7bf3a4eeea115cea
|
data/doc/text/news.md
CHANGED
data/lib/chupa-text/error.rb
CHANGED
|
@@ -53,4 +53,13 @@ module ChupaText
|
|
|
53
53
|
super("Unknown encoding data: <#{data.uri}>(#{data.mime_type}): <#{encoding}>")
|
|
54
54
|
end
|
|
55
55
|
end
|
|
56
|
+
|
|
57
|
+
class TimeoutError < Error
|
|
58
|
+
attr_reader :data, :timeout
|
|
59
|
+
def initialize(data, timeout)
|
|
60
|
+
@data = data
|
|
61
|
+
@timeout = timeout
|
|
62
|
+
super("Timeout error: <#{data.uri}>(#{data.mime_type}): <#{timeout}>")
|
|
63
|
+
end
|
|
64
|
+
end
|
|
56
65
|
end
|
data/lib/chupa-text/extractor.rb
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
require "pathname"
|
|
18
18
|
require "uri"
|
|
19
|
+
require "timeout"
|
|
19
20
|
|
|
20
21
|
module ChupaText
|
|
21
22
|
class Extractor
|
|
@@ -103,19 +104,37 @@ module ChupaText
|
|
|
103
104
|
end
|
|
104
105
|
else
|
|
105
106
|
debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
107
|
+
with_timeout(target) do
|
|
108
|
+
decomposer.decompose(target) do |decomposed|
|
|
109
|
+
begin
|
|
110
|
+
debug do
|
|
111
|
+
"#{log_tag}[extract][decomposed] " +
|
|
112
|
+
"#{decomposer.class}: " +
|
|
113
|
+
"<#{target.uri}>: " +
|
|
114
|
+
"<#{target.mime_type}> -> <#{decomposed.mime_type}>"
|
|
115
|
+
end
|
|
116
|
+
extract_recursive(decomposed, &block)
|
|
117
|
+
ensure
|
|
118
|
+
decomposed.release
|
|
119
|
+
end
|
|
112
120
|
end
|
|
113
|
-
extract_recursive(decomposed, &block)
|
|
114
|
-
decomposed.release
|
|
115
121
|
end
|
|
116
122
|
end
|
|
117
123
|
end
|
|
118
124
|
|
|
125
|
+
def with_timeout(data, &block)
|
|
126
|
+
timeout = data.timeout
|
|
127
|
+
if timeout
|
|
128
|
+
begin
|
|
129
|
+
Timeout.timeout(timeout, &block)
|
|
130
|
+
rescue Timeout::Error
|
|
131
|
+
raise TimeoutError.new(data, timeout)
|
|
132
|
+
end
|
|
133
|
+
else
|
|
134
|
+
yield
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
|
|
119
138
|
def log_tag
|
|
120
139
|
"[extractor]"
|
|
121
140
|
end
|
data/lib/chupa-text/version.rb
CHANGED
data/test/test-extractor.rb
CHANGED
|
@@ -76,6 +76,7 @@ class TestExtractor < Test::Unit::TestCase
|
|
|
76
76
|
extracted = ChupaText::Data.new
|
|
77
77
|
extracted.mime_type = "text/plain"
|
|
78
78
|
extracted.body = data.body.gsub(/<.+?>/, "")
|
|
79
|
+
sleep(data.timeout * 2) if data.timeout
|
|
79
80
|
yield(extracted)
|
|
80
81
|
end
|
|
81
82
|
end
|
|
@@ -92,6 +93,17 @@ class TestExtractor < Test::Unit::TestCase
|
|
|
92
93
|
data.body = "<html><body>Hello</body></html>"
|
|
93
94
|
assert_equal(["Hello"], extract(data))
|
|
94
95
|
end
|
|
96
|
+
|
|
97
|
+
def test_timeout
|
|
98
|
+
data = ChupaText::Data.new
|
|
99
|
+
data.mime_type = "text/html"
|
|
100
|
+
data.body = "<html><body>Hello</body></html>"
|
|
101
|
+
data.timeout = 0.0001
|
|
102
|
+
error = ChupaText::TimeoutError.new(data, data.timeout)
|
|
103
|
+
assert_raise(error) do
|
|
104
|
+
extract(data)
|
|
105
|
+
end
|
|
106
|
+
end
|
|
95
107
|
end
|
|
96
108
|
|
|
97
109
|
sub_test_case("multi decomposed") do
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: chupa-text
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.2.6
|
|
4
|
+
version: 1.2.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kouhei Sutou
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-06-
|
|
11
|
+
date: 2019-06-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: archive-zip
|