yasuri 2.0.12 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
@@ -0,0 +1,75 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@uri = uri
|
8
|
+
end
|
9
|
+
|
10
|
+
describe '::MapNode' do
|
11
|
+
it "multi scrape in singe page" do
|
12
|
+
map = Yasuri.map_sample do
|
13
|
+
text_title '/html/head/title'
|
14
|
+
text_body_p '/html/body/p[1]'
|
15
|
+
end
|
16
|
+
actual = map.scrape(@uri)
|
17
|
+
|
18
|
+
expected = {
|
19
|
+
"title" => "Yasuri Test",
|
20
|
+
"body_p" => "Hello,Yasuri"
|
21
|
+
}
|
22
|
+
expect(actual).to include expected
|
23
|
+
end
|
24
|
+
|
25
|
+
it "nested multi scrape in singe page" do
|
26
|
+
map = Yasuri.map_sample do
|
27
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
28
|
+
map_group2 do
|
29
|
+
text_child01 '/html/body/a[1]'
|
30
|
+
text_child03 '/html/body/a[3]'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
actual = map.scrape(@uri)
|
34
|
+
|
35
|
+
expected = {
|
36
|
+
"group1" => {
|
37
|
+
"child01" => "child01"
|
38
|
+
},
|
39
|
+
"group2" => {
|
40
|
+
"child01" => "child01",
|
41
|
+
"child03" => "child03"
|
42
|
+
}
|
43
|
+
}
|
44
|
+
expect(actual).to include expected
|
45
|
+
end
|
46
|
+
|
47
|
+
it "scrape with links node" do
|
48
|
+
map = Yasuri.map_sample do
|
49
|
+
map_group1 do
|
50
|
+
links_a '/html/body/a' do
|
51
|
+
text_content '/html/body/p'
|
52
|
+
end
|
53
|
+
text_child01 '/html/body/a[1]'
|
54
|
+
end
|
55
|
+
map_group2 do
|
56
|
+
text_child03 '/html/body/a[3]'
|
57
|
+
end
|
58
|
+
end
|
59
|
+
actual = map.scrape(@uri)
|
60
|
+
|
61
|
+
expected = {
|
62
|
+
"group1" => {
|
63
|
+
"a" => [
|
64
|
+
{"content" => "Child 01 page."},
|
65
|
+
{"content" => "Child 02 page."},
|
66
|
+
{"content" => "Child 03 page."},
|
67
|
+
],
|
68
|
+
"child01" => "child01"
|
69
|
+
},
|
70
|
+
"group2" => { "child03" => "child03" }
|
71
|
+
}
|
72
|
+
expect(actual).to include expected
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -11,16 +11,14 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::PaginateNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
14
|
@uri = uri + "/pagination/page01.html"
|
16
|
-
@page = @agent.get(@uri)
|
17
15
|
end
|
18
16
|
|
19
17
|
it "scrape each paginated pages" do
|
20
18
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
21
19
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
20
|
])
|
23
|
-
actual = root_node.
|
21
|
+
actual = root_node.scrape(@uri)
|
24
22
|
expected = [
|
25
23
|
{"content" => "PaginationTest01"},
|
26
24
|
{"content" => "PaginationTest02"},
|
@@ -37,7 +35,7 @@ describe 'Yasuri' do
|
|
37
35
|
Yasuri::TextNode.new('./a', "text"),
|
38
36
|
]),
|
39
37
|
], flatten: true)
|
40
|
-
actual = root_node.
|
38
|
+
actual = root_node.scrape(@uri)
|
41
39
|
expected = [
|
42
40
|
"PaginationTest01",
|
43
41
|
{"text"=>""},
|
@@ -77,7 +75,7 @@ describe 'Yasuri' do
|
|
77
75
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
78
76
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
79
77
|
], limit:3)
|
80
|
-
actual = root_node.
|
78
|
+
actual = root_node.scrape(@uri)
|
81
79
|
expected = [
|
82
80
|
{"content" => "PaginationTest01"},
|
83
81
|
{"content" => "PaginationTest02"},
|
@@ -91,7 +89,7 @@ describe 'Yasuri' do
|
|
91
89
|
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
92
90
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
93
91
|
])
|
94
|
-
actual = root_node.
|
92
|
+
actual = root_node.scrape(@uri)
|
95
93
|
expected = [ {"content" => "PaginationTest01"}, ]
|
96
94
|
expect(actual).to match_array expected
|
97
95
|
end
|
@@ -100,7 +98,7 @@ describe 'Yasuri' do
|
|
100
98
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
101
99
|
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
102
100
|
])
|
103
|
-
actual = root_node.
|
101
|
+
actual = root_node.scrape(@uri)
|
104
102
|
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
105
103
|
expect(actual).to match_array expected
|
106
104
|
end
|
@@ -112,7 +110,7 @@ describe 'Yasuri' do
|
|
112
110
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
113
111
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
114
112
|
])
|
115
|
-
compare_generated_vs_original(generated, original, @
|
113
|
+
compare_generated_vs_original(generated, original, @uri)
|
116
114
|
end
|
117
115
|
|
118
116
|
it 'can be defined by DSL, return single PaginateNode content limited' do
|
@@ -122,14 +120,14 @@ describe 'Yasuri' do
|
|
122
120
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
123
121
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
124
122
|
], limit: 2)
|
125
|
-
compare_generated_vs_original(generated, original, @
|
123
|
+
compare_generated_vs_original(generated, original, @uri)
|
126
124
|
end
|
127
125
|
|
128
126
|
it "return child node as symbol" do
|
129
127
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
130
128
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
131
129
|
])
|
132
|
-
actual = root_node.
|
130
|
+
actual = root_node.scrape(@uri, symbolize_names:true)
|
133
131
|
expected = [
|
134
132
|
{:content => "PaginationTest01"},
|
135
133
|
{:content => "PaginationTest02"},
|
@@ -138,5 +136,19 @@ describe 'Yasuri' do
|
|
138
136
|
]
|
139
137
|
expect(actual).to match expected
|
140
138
|
end
|
139
|
+
|
140
|
+
it "scrape with interval for each request" do
|
141
|
+
allow(Kernel).to receive(:sleep)
|
142
|
+
|
143
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
144
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
145
|
+
])
|
146
|
+
actual = root_node.scrape(@uri, interval_ms: 1000)
|
147
|
+
expect(actual.size).to match 4
|
148
|
+
|
149
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
150
|
+
expect(interval_sec).to match 1.0
|
151
|
+
end
|
152
|
+
end
|
141
153
|
end
|
142
154
|
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -8,11 +8,78 @@ describe 'Yasuri' do
|
|
8
8
|
include_context 'httpserver'
|
9
9
|
|
10
10
|
before do
|
11
|
-
@agent = Mechanize.new
|
12
11
|
@uri = uri
|
13
|
-
@index_page = @agent.get(@uri)
|
14
12
|
end
|
15
13
|
|
14
|
+
|
15
|
+
#############
|
16
|
+
# yaml2tree #
|
17
|
+
#############
|
18
|
+
describe '.yaml2tree' do
|
19
|
+
it "fail if empty yaml" do
|
20
|
+
expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
|
21
|
+
end
|
22
|
+
|
23
|
+
it "return text node" do
|
24
|
+
src = <<-EOB
|
25
|
+
text_content: "/html/body/p[1]"
|
26
|
+
EOB
|
27
|
+
generated = Yasuri.yaml2tree(src)
|
28
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
29
|
+
|
30
|
+
compare_generated_vs_original(generated, original, @uri)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "return text node as symbol" do
|
34
|
+
src = <<-EOB
|
35
|
+
:text_content:
|
36
|
+
:path: "/html/body/p[1]"
|
37
|
+
EOB
|
38
|
+
generated = Yasuri.yaml2tree(src)
|
39
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
40
|
+
|
41
|
+
compare_generated_vs_original(generated, original, @uri)
|
42
|
+
end
|
43
|
+
|
44
|
+
it "return LinksNode/TextNode" do
|
45
|
+
|
46
|
+
src = <<-EOB
|
47
|
+
links_root:
|
48
|
+
path: "/html/body/a"
|
49
|
+
text_content: "/html/body/p"
|
50
|
+
EOB
|
51
|
+
generated = Yasuri.yaml2tree(src)
|
52
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
53
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
54
|
+
])
|
55
|
+
|
56
|
+
compare_generated_vs_original(generated, original, @uri)
|
57
|
+
end
|
58
|
+
|
59
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
60
|
+
src = <<-EOB
|
61
|
+
struct_tables:
|
62
|
+
path: "/html/body/table"
|
63
|
+
struct_table:
|
64
|
+
path: "./tr"
|
65
|
+
text_title: "./td[1]"
|
66
|
+
text_pub_date: "./td[2]"
|
67
|
+
EOB
|
68
|
+
|
69
|
+
generated = Yasuri.yaml2tree(src)
|
70
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
71
|
+
Yasuri::StructNode.new('./tr', "table", [
|
72
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
73
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
74
|
+
])
|
75
|
+
])
|
76
|
+
uri = @uri + "/struct/structual_text.html"
|
77
|
+
compare_generated_vs_original(generated, original, uri)
|
78
|
+
end
|
79
|
+
|
80
|
+
end # end of describe '.yaml2tree'
|
81
|
+
|
82
|
+
|
16
83
|
#############
|
17
84
|
# json2tree #
|
18
85
|
#############
|
@@ -22,103 +89,107 @@ describe 'Yasuri' do
|
|
22
89
|
end
|
23
90
|
|
24
91
|
it "return TextNode" do
|
25
|
-
src = %q|
|
26
|
-
|
27
|
-
|
28
|
-
|
92
|
+
src = %q|
|
93
|
+
{
|
94
|
+
"text_content": "/html/body/p[1]"
|
95
|
+
}|
|
29
96
|
generated = Yasuri.json2tree(src)
|
30
97
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
98
|
|
32
|
-
compare_generated_vs_original(generated, original, @
|
99
|
+
compare_generated_vs_original(generated, original, @uri)
|
33
100
|
end
|
34
101
|
|
35
102
|
it "return TextNode with truncate_regexp" do
|
36
|
-
src = %q|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
103
|
+
src = %q|
|
104
|
+
{
|
105
|
+
"text_content": {
|
106
|
+
"path": "/html/body/p[1]",
|
107
|
+
"truncate" : "^[^,]+"
|
108
|
+
}
|
109
|
+
}|
|
41
110
|
generated = Yasuri.json2tree(src)
|
42
111
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
|
-
compare_generated_vs_original(generated, original, @
|
112
|
+
compare_generated_vs_original(generated, original, @uri)
|
44
113
|
end
|
45
114
|
|
115
|
+
it "return MapNode with TextNodes" do
|
116
|
+
src = %q|
|
117
|
+
{
|
118
|
+
"text_content01": "/html/body/p[1]",
|
119
|
+
"text_content02": "/html/body/p[2]"
|
120
|
+
}|
|
121
|
+
generated = Yasuri.json2tree(src)
|
122
|
+
original = Yasuri::MapNode.new('parent', [
|
123
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
124
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
125
|
+
])
|
126
|
+
compare_generated_vs_original(generated, original, @uri)
|
127
|
+
end
|
46
128
|
|
47
129
|
it "return LinksNode/TextNode" do
|
48
|
-
src = %q|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
130
|
+
src = %q|
|
131
|
+
{
|
132
|
+
"links_root": {
|
133
|
+
"path": "/html/body/a",
|
134
|
+
"text_content": "/html/body/p"
|
135
|
+
}
|
136
|
+
}|
|
137
|
+
|
56
138
|
generated = Yasuri.json2tree(src)
|
57
139
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
58
140
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
59
141
|
])
|
60
142
|
|
61
|
-
compare_generated_vs_original(generated, original, @
|
143
|
+
compare_generated_vs_original(generated, original, @uri)
|
62
144
|
end
|
63
145
|
|
64
146
|
it "return PaginateNode/TextNode" do
|
65
|
-
src = %q|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
}|
|
147
|
+
src = %q|
|
148
|
+
{
|
149
|
+
"pages_root": {
|
150
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
151
|
+
"text_content": "/html/body/p"
|
152
|
+
}
|
153
|
+
}|
|
73
154
|
generated = Yasuri.json2tree(src)
|
74
155
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
75
156
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
76
157
|
])
|
77
158
|
|
78
|
-
|
79
|
-
|
80
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
159
|
+
uri = @uri + "/pagination/page01.html"
|
160
|
+
compare_generated_vs_original(generated, original, uri)
|
81
161
|
end
|
82
162
|
|
83
163
|
it "return PaginateNode/TextNode with limit" do
|
84
|
-
src = %q|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
}|
|
164
|
+
src = %q|
|
165
|
+
{
|
166
|
+
"pages_root": {
|
167
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
168
|
+
"limit": 2,
|
169
|
+
"text_content": "/html/body/p"
|
170
|
+
}
|
171
|
+
}|
|
93
172
|
generated = Yasuri.json2tree(src)
|
94
173
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
95
174
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
96
175
|
], limit:2)
|
97
176
|
|
98
|
-
|
99
|
-
|
100
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
177
|
+
uri = @uri + "/pagination/page01.html"
|
178
|
+
compare_generated_vs_original(generated, original, uri)
|
101
179
|
end
|
102
180
|
|
103
181
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
},
|
116
|
-
{ "node" : "text",
|
117
|
-
"name" : "pub_date",
|
118
|
-
"path" : "./td[2]"
|
119
|
-
}]
|
120
|
-
}]
|
121
|
-
}|
|
182
|
+
src = %q|
|
183
|
+
{
|
184
|
+
"struct_tables": {
|
185
|
+
"path": "/html/body/table",
|
186
|
+
"struct_table": {
|
187
|
+
"path": "./tr",
|
188
|
+
"text_title": "./td[1]",
|
189
|
+
"text_pub_date": "./td[2]"
|
190
|
+
}
|
191
|
+
}
|
192
|
+
}|
|
122
193
|
generated = Yasuri.json2tree(src)
|
123
194
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
124
195
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -126,27 +197,27 @@ describe 'Yasuri' do
|
|
126
197
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
198
|
])
|
128
199
|
])
|
129
|
-
|
130
|
-
compare_generated_vs_original(generated, original,
|
200
|
+
uri = @uri + "/struct/structual_text.html"
|
201
|
+
compare_generated_vs_original(generated, original, uri)
|
131
202
|
end
|
132
203
|
end
|
133
204
|
|
205
|
+
|
134
206
|
#############
|
135
207
|
# tree2json #
|
136
208
|
#############
|
137
209
|
describe '.tree2json' do
|
138
210
|
it "return empty json" do
|
139
|
-
|
140
|
-
expect(json).to match "{}"
|
211
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
141
212
|
end
|
142
213
|
|
143
214
|
it "return text node" do
|
144
215
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
145
216
|
json = Yasuri.tree2json(node)
|
146
|
-
expected_str = %q|
|
147
|
-
|
148
|
-
|
149
|
-
|
217
|
+
expected_str = %q|
|
218
|
+
{
|
219
|
+
"text_title": "/html/head/title"
|
220
|
+
}|
|
150
221
|
expected = JSON.parse(expected_str)
|
151
222
|
actual = JSON.parse(json)
|
152
223
|
expect(actual).to match expected
|
@@ -155,29 +226,49 @@ describe 'Yasuri' do
|
|
155
226
|
it "return text node with truncate_regexp" do
|
156
227
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
228
|
json = Yasuri.tree2json(node)
|
158
|
-
expected_str = %q|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
229
|
+
expected_str = %q|
|
230
|
+
{
|
231
|
+
"text_title": {
|
232
|
+
"path": "/html/head/title",
|
233
|
+
"truncate": "^[^,]+"
|
234
|
+
}
|
235
|
+
}|
|
163
236
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
164
237
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
165
238
|
expect(actual).to match expected
|
166
239
|
end
|
167
240
|
|
241
|
+
it "return map node with text nodes" do
|
242
|
+
tree = Yasuri::MapNode.new('parent', [
|
243
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
244
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
245
|
+
])
|
246
|
+
actual_json = Yasuri.tree2json(tree)
|
247
|
+
|
248
|
+
expected_json = %q|
|
249
|
+
{
|
250
|
+
"text_content01": "/html/body/p[1]",
|
251
|
+
"text_content02": "/html/body/p[2]"
|
252
|
+
}|
|
253
|
+
|
254
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
255
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
256
|
+
expect(actual).to match expected
|
257
|
+
end
|
258
|
+
|
168
259
|
it "return LinksNode/TextNode" do
|
169
260
|
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
170
261
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
171
262
|
])
|
172
263
|
json = Yasuri.tree2json(tree)
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
264
|
+
|
265
|
+
expected_src = %q|
|
266
|
+
{
|
267
|
+
"links_root": {
|
268
|
+
"path": "/html/body/a",
|
269
|
+
"text_content":"/html/body/p"
|
270
|
+
}
|
271
|
+
}|
|
181
272
|
expected = JSON.parse(expected_src)
|
182
273
|
actual = JSON.parse(json)
|
183
274
|
expect(actual).to match expected
|
@@ -189,25 +280,84 @@ describe 'Yasuri' do
|
|
189
280
|
], limit:10)
|
190
281
|
|
191
282
|
json = Yasuri.tree2json(tree)
|
192
|
-
expected_src = %q|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
}|
|
283
|
+
expected_src = %q|
|
284
|
+
{
|
285
|
+
"pages_root": {
|
286
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
287
|
+
"limit": 10,
|
288
|
+
"flatten": false,
|
289
|
+
"text_content": "/html/body/p"
|
290
|
+
}
|
291
|
+
}|
|
202
292
|
expected = JSON.parse(expected_src)
|
203
293
|
actual = JSON.parse(json)
|
204
294
|
expect(actual).to match expected
|
205
295
|
end
|
296
|
+
end
|
297
|
+
|
298
|
+
describe '.with_retry' do
|
299
|
+
it "call once if success" do
|
300
|
+
actual = Yasuri.with_retry(0){ 42 }
|
301
|
+
expect(actual).to match 42
|
302
|
+
end
|
303
|
+
|
304
|
+
it "call untile success" do
|
305
|
+
i = [1,1,0,0]
|
306
|
+
actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
307
|
+
expect(actual).to match 42/1
|
308
|
+
end
|
309
|
+
|
310
|
+
it "raise error when exceed retry count" do
|
311
|
+
i = [1,0,0,0]
|
312
|
+
expect {
|
313
|
+
Yasuri.with_retry(2){42 / i.pop } # do this 3 times
|
314
|
+
}.to raise_error(Exception)
|
315
|
+
end
|
316
|
+
|
317
|
+
it "wait interval before run" do
|
318
|
+
allow(Kernel).to receive(:sleep)
|
319
|
+
Yasuri.with_retry(0){ 42 }
|
320
|
+
expect(Kernel).to have_received(:sleep).once
|
321
|
+
end
|
206
322
|
|
323
|
+
it "wait interval before run" do
|
324
|
+
allow(Kernel).to receive(:sleep)
|
325
|
+
Yasuri.with_retry(0){ 42 }
|
326
|
+
expect(Kernel).to have_received(:sleep).once
|
327
|
+
end
|
207
328
|
|
329
|
+
it "wait interval for each runs" do
|
330
|
+
allow(Kernel).to receive(:sleep)
|
208
331
|
|
332
|
+
i = [1,1,0,0]
|
333
|
+
Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
334
|
+
expect(Kernel).to have_received(:sleep).exactly(3).times
|
335
|
+
end
|
209
336
|
end
|
210
337
|
|
338
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
339
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
340
|
+
Yasuri::StructNode.new('./tr', "table", [
|
341
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
342
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
343
|
+
])
|
344
|
+
])
|
345
|
+
json = Yasuri.tree2json(tree)
|
346
|
+
expected_src = %q|
|
347
|
+
{
|
348
|
+
"struct_tables": {
|
349
|
+
"path": "/html/body/table",
|
350
|
+
"struct_table": {
|
351
|
+
"path": "./tr",
|
352
|
+
"text_title": "./td[1]",
|
353
|
+
"text_pub_date": "./td[2]"
|
354
|
+
}
|
355
|
+
}
|
356
|
+
}|
|
357
|
+
expected = JSON.parse(expected_src)
|
358
|
+
actual = JSON.parse(json)
|
359
|
+
expect(actual).to match expected
|
360
|
+
end
|
211
361
|
|
212
362
|
it 'has a version number' do
|
213
363
|
expect(Yasuri::VERSION).not_to be nil
|