yasuri 3.3.0 → 3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
data/spec/yasuri_map_spec.rb
CHANGED
@@ -3,20 +3,16 @@ require_relative 'spec_helper'
|
|
3
3
|
describe 'Yasuri' do
|
4
4
|
include_context 'httpserver'
|
5
5
|
|
6
|
-
before do
|
7
|
-
@uri = uri
|
8
|
-
end
|
9
|
-
|
10
6
|
describe '::MapNode' do
|
11
7
|
it "multi scrape in singe page" do
|
12
8
|
map = Yasuri.map_sample do
|
13
9
|
text_title '/html/head/title'
|
14
10
|
text_body_p '/html/body/p[1]'
|
15
11
|
end
|
16
|
-
actual = map.scrape(
|
12
|
+
actual = map.scrape(uri)
|
17
13
|
|
18
14
|
expected = {
|
19
|
-
"title"
|
15
|
+
"title" => "Yasuri Test",
|
20
16
|
"body_p" => "Hello,Yasuri"
|
21
17
|
}
|
22
18
|
expect(actual).to include expected
|
@@ -24,13 +20,13 @@ describe 'Yasuri' do
|
|
24
20
|
|
25
21
|
it "nested multi scrape in singe page" do
|
26
22
|
map = Yasuri.map_sample do
|
27
|
-
map_group1 { text_child01
|
23
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
28
24
|
map_group2 do
|
29
25
|
text_child01 '/html/body/a[1]'
|
30
26
|
text_child03 '/html/body/a[3]'
|
31
27
|
end
|
32
28
|
end
|
33
|
-
actual = map.scrape(
|
29
|
+
actual = map.scrape(uri)
|
34
30
|
|
35
31
|
expected = {
|
36
32
|
"group1" => {
|
@@ -50,20 +46,20 @@ describe 'Yasuri' do
|
|
50
46
|
links_a '/html/body/a' do
|
51
47
|
text_content '/html/body/p'
|
52
48
|
end
|
53
|
-
text_child01
|
49
|
+
text_child01 '/html/body/a[1]'
|
54
50
|
end
|
55
51
|
map_group2 do
|
56
52
|
text_child03 '/html/body/a[3]'
|
57
53
|
end
|
58
54
|
end
|
59
|
-
actual = map.scrape(
|
55
|
+
actual = map.scrape(uri)
|
60
56
|
|
61
57
|
expected = {
|
62
58
|
"group1" => {
|
63
59
|
"a" => [
|
64
|
-
{"content" => "Child 01 page."},
|
65
|
-
{"content" => "Child 02 page."},
|
66
|
-
{"content" => "Child 03 page."}
|
60
|
+
{ "content" => "Child 01 page." },
|
61
|
+
{ "content" => "Child 02 page." },
|
62
|
+
{ "content" => "Child 03 page." }
|
67
63
|
],
|
68
64
|
"child01" => "child01"
|
69
65
|
},
|
@@ -72,4 +68,4 @@ describe 'Yasuri' do
|
|
72
68
|
expect(actual).to include expected
|
73
69
|
end
|
74
70
|
end
|
75
|
-
end
|
71
|
+
end
|
@@ -1,105 +1,96 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
############
|
7
|
-
# Paginate #
|
8
|
-
############
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::PaginateNode' do
|
13
|
-
|
14
|
-
@uri = uri + "/pagination/page01.html"
|
15
|
-
end
|
8
|
+
let(:uri_paginate) { "#{uri}/pagination/page01.html" }
|
16
9
|
|
17
10
|
it "scrape each paginated pages" do
|
18
|
-
root_node = Yasuri::PaginateNode.new(
|
19
|
-
|
20
|
-
|
21
|
-
|
11
|
+
root_node = Yasuri::PaginateNode.new(
|
12
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
13
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
14
|
+
]
|
15
|
+
)
|
16
|
+
actual = root_node.scrape(uri_paginate)
|
22
17
|
expected = [
|
23
|
-
{"content" => "PaginationTest01"},
|
24
|
-
{"content" => "PaginationTest02"},
|
25
|
-
{"content" => "PaginationTest03"},
|
26
|
-
{"content" => "PaginationTest04"}
|
18
|
+
{ "content" => "PaginationTest01" },
|
19
|
+
{ "content" => "PaginationTest02" },
|
20
|
+
{ "content" => "PaginationTest03" },
|
21
|
+
{ "content" => "PaginationTest04" }
|
27
22
|
]
|
28
23
|
expect(actual).to match expected
|
29
24
|
end
|
30
25
|
|
31
26
|
it "scrape each paginated pages with flatten" do
|
32
|
-
root_node = Yasuri::PaginateNode.new(
|
33
|
-
|
34
|
-
|
35
|
-
Yasuri::
|
36
|
-
|
37
|
-
|
38
|
-
|
27
|
+
root_node = Yasuri::PaginateNode.new(
|
28
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
29
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
30
|
+
Yasuri::StructNode.new(
|
31
|
+
'/html/body/nav/span', "span", [
|
32
|
+
Yasuri::TextNode.new('./a', "text")
|
33
|
+
]
|
34
|
+
)
|
35
|
+
], flatten: true
|
36
|
+
)
|
37
|
+
actual = root_node.scrape(uri_paginate)
|
39
38
|
expected = [
|
40
|
-
"PaginationTest01",
|
41
|
-
{"text"=>""},
|
42
|
-
{"text"=>""},
|
43
|
-
|
44
|
-
{"text" => "
|
45
|
-
{"text" => "4"},
|
46
|
-
{"text"=>"NextPage »"},
|
47
|
-
|
48
|
-
{"text"=>"« PreviousPage"},
|
49
|
-
{"text" => "1"},
|
50
|
-
{"text"=>""},
|
51
|
-
|
52
|
-
{"text" => "
|
53
|
-
{"text"=>"
|
54
|
-
"
|
55
|
-
{"text"=>"« PreviousPage"},
|
56
|
-
{"text" => "1"},
|
57
|
-
{"text" => "2"},
|
58
|
-
{"text"=>""},
|
59
|
-
{"text" => "4"},
|
60
|
-
{"text"=>"NextPage »"},
|
61
|
-
"PaginationTest04",
|
62
|
-
{"text"=>"« PreviousPage"},
|
63
|
-
{"text" => "1"},
|
64
|
-
{"text" => "2"},
|
65
|
-
{"text" => "3"},
|
66
|
-
{"text"=>""},
|
67
|
-
{"text"=>""},
|
39
|
+
"PaginationTest01", { "text" => "" },
|
40
|
+
{ "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
|
41
|
+
{ "text" => "NextPage »" },
|
42
|
+
|
43
|
+
"PaginationTest02", { "text" => "« PreviousPage" },
|
44
|
+
{ "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
|
45
|
+
{ "text" => "NextPage »" },
|
46
|
+
|
47
|
+
"PaginationTest03", { "text" => "« PreviousPage" },
|
48
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
|
49
|
+
{ "text" => "NextPage »" },
|
50
|
+
|
51
|
+
"PaginationTest04", { "text" => "« PreviousPage" },
|
52
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
|
53
|
+
{ "text" => "" }
|
68
54
|
]
|
69
55
|
|
70
56
|
expect(actual).to match expected
|
71
57
|
end
|
72
58
|
|
73
|
-
|
74
59
|
it "scrape each paginated pages limited" do
|
75
|
-
root_node = Yasuri::PaginateNode.new(
|
76
|
-
|
77
|
-
|
78
|
-
|
60
|
+
root_node = Yasuri::PaginateNode.new(
|
61
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
62
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
63
|
+
], limit: 3
|
64
|
+
)
|
65
|
+
actual = root_node.scrape(uri_paginate)
|
79
66
|
expected = [
|
80
|
-
{"content" => "PaginationTest01"},
|
81
|
-
{"content" => "PaginationTest02"},
|
82
|
-
{"content" => "PaginationTest03"}
|
67
|
+
{ "content" => "PaginationTest01" },
|
68
|
+
{ "content" => "PaginationTest02" },
|
69
|
+
{ "content" => "PaginationTest03" }
|
83
70
|
]
|
84
71
|
expect(actual).to match expected
|
85
72
|
end
|
86
73
|
|
87
74
|
it 'return first content if paginate link node is not found' do
|
88
75
|
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
89
|
-
root_node = Yasuri::PaginateNode.new(
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
76
|
+
root_node = Yasuri::PaginateNode.new(
|
77
|
+
missing_xpath, "root", [
|
78
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = root_node.scrape(uri_paginate)
|
82
|
+
expected = [{ "content" => "PaginationTest01" }]
|
94
83
|
expect(actual).to match_array expected
|
95
84
|
end
|
96
85
|
|
97
86
|
it 'return empty hashes if content node is not found' do
|
98
|
-
root_node = Yasuri::PaginateNode.new(
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
87
|
+
root_node = Yasuri::PaginateNode.new(
|
88
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
89
|
+
Yasuri::TextNode.new('/html/body/hoge', "content")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
actual = root_node.scrape(uri_paginate)
|
93
|
+
expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
|
103
94
|
expect(actual).to match_array expected
|
104
95
|
end
|
105
96
|
|
@@ -107,32 +98,38 @@ describe 'Yasuri' do
|
|
107
98
|
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
108
99
|
text_content '/html/body/p'
|
109
100
|
end
|
110
|
-
original = Yasuri::PaginateNode.new(
|
111
|
-
|
112
|
-
|
113
|
-
|
101
|
+
original = Yasuri::PaginateNode.new(
|
102
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
103
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
104
|
+
]
|
105
|
+
)
|
106
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
114
107
|
end
|
115
108
|
|
116
109
|
it 'can be defined by DSL, return single PaginateNode content limited' do
|
117
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
|
110
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
|
118
111
|
text_content '/html/body/p'
|
119
112
|
end
|
120
|
-
original = Yasuri::PaginateNode.new(
|
121
|
-
|
122
|
-
|
123
|
-
|
113
|
+
original = Yasuri::PaginateNode.new(
|
114
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
115
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
116
|
+
], limit: 2
|
117
|
+
)
|
118
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
124
119
|
end
|
125
120
|
|
126
121
|
it "return child node as symbol" do
|
127
|
-
root_node = Yasuri::PaginateNode.new(
|
128
|
-
|
129
|
-
|
130
|
-
|
122
|
+
root_node = Yasuri::PaginateNode.new(
|
123
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
124
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
125
|
+
]
|
126
|
+
)
|
127
|
+
actual = root_node.scrape(uri_paginate, symbolize_names: true)
|
131
128
|
expected = [
|
132
|
-
{:
|
133
|
-
{:
|
134
|
-
{:
|
135
|
-
{:
|
129
|
+
{ content: "PaginationTest01" },
|
130
|
+
{ content: "PaginationTest02" },
|
131
|
+
{ content: "PaginationTest03" },
|
132
|
+
{ content: "PaginationTest04" }
|
136
133
|
]
|
137
134
|
expect(actual).to match expected
|
138
135
|
end
|
@@ -140,10 +137,12 @@ describe 'Yasuri' do
|
|
140
137
|
it "scrape with interval for each request" do
|
141
138
|
allow(Kernel).to receive(:sleep)
|
142
139
|
|
143
|
-
root_node = Yasuri::PaginateNode.new(
|
144
|
-
|
145
|
-
|
146
|
-
|
140
|
+
root_node = Yasuri::PaginateNode.new(
|
141
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
142
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
143
|
+
]
|
144
|
+
)
|
145
|
+
actual = root_node.scrape(uri_paginate, interval_ms: 1000)
|
147
146
|
expect(actual.size).to match 4
|
148
147
|
|
149
148
|
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
data/spec/yasuri_spec.rb
CHANGED
@@ -1,17 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
|
-
|
5
1
|
require_relative 'spec_helper'
|
6
2
|
|
7
3
|
describe 'Yasuri' do
|
8
4
|
include_context 'httpserver'
|
9
5
|
|
10
|
-
before do
|
11
|
-
@uri = uri
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
6
|
############
|
16
7
|
# yam2tree #
|
17
8
|
############
|
@@ -27,7 +18,7 @@ describe 'Yasuri' do
|
|
27
18
|
generated = Yasuri.yaml2tree(src)
|
28
19
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
29
20
|
|
30
|
-
compare_generated_vs_original(generated, original,
|
21
|
+
compare_generated_vs_original(generated, original, uri)
|
31
22
|
end
|
32
23
|
|
33
24
|
it "return text node as symbol" do
|
@@ -38,7 +29,7 @@ describe 'Yasuri' do
|
|
38
29
|
generated = Yasuri.yaml2tree(src)
|
39
30
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
40
31
|
|
41
|
-
compare_generated_vs_original(generated, original,
|
32
|
+
compare_generated_vs_original(generated, original, uri)
|
42
33
|
end
|
43
34
|
|
44
35
|
it "return LinksNode/TextNode" do
|
@@ -53,7 +44,7 @@ describe 'Yasuri' do
|
|
53
44
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
54
45
|
])
|
55
46
|
|
56
|
-
compare_generated_vs_original(generated, original,
|
47
|
+
compare_generated_vs_original(generated, original, uri)
|
57
48
|
end
|
58
49
|
|
59
50
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -73,8 +64,8 @@ describe 'Yasuri' do
|
|
73
64
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
74
65
|
])
|
75
66
|
])
|
76
|
-
|
77
|
-
compare_generated_vs_original(generated, original,
|
67
|
+
test_uri = uri + "/struct/structual_text.html"
|
68
|
+
compare_generated_vs_original(generated, original, test_uri)
|
78
69
|
end
|
79
70
|
|
80
71
|
end # end of describe '.yaml2tree'
|
@@ -96,7 +87,7 @@ describe 'Yasuri' do
|
|
96
87
|
generated = Yasuri.json2tree(src)
|
97
88
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
98
89
|
|
99
|
-
compare_generated_vs_original(generated, original,
|
90
|
+
compare_generated_vs_original(generated, original, uri)
|
100
91
|
end
|
101
92
|
|
102
93
|
it "return TextNode with truncate_regexp" do
|
@@ -109,7 +100,7 @@ describe 'Yasuri' do
|
|
109
100
|
}|
|
110
101
|
generated = Yasuri.json2tree(src)
|
111
102
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
112
|
-
compare_generated_vs_original(generated, original,
|
103
|
+
compare_generated_vs_original(generated, original, uri)
|
113
104
|
end
|
114
105
|
|
115
106
|
it "return MapNode with TextNodes" do
|
@@ -123,7 +114,7 @@ describe 'Yasuri' do
|
|
123
114
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
124
115
|
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
125
116
|
])
|
126
|
-
compare_generated_vs_original(generated, original,
|
117
|
+
compare_generated_vs_original(generated, original, uri)
|
127
118
|
end
|
128
119
|
|
129
120
|
it "return LinksNode/TextNode" do
|
@@ -140,7 +131,7 @@ describe 'Yasuri' do
|
|
140
131
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
141
132
|
])
|
142
133
|
|
143
|
-
compare_generated_vs_original(generated, original,
|
134
|
+
compare_generated_vs_original(generated, original, uri)
|
144
135
|
end
|
145
136
|
|
146
137
|
it "return PaginateNode/TextNode" do
|
@@ -156,8 +147,8 @@ describe 'Yasuri' do
|
|
156
147
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
157
148
|
])
|
158
149
|
|
159
|
-
|
160
|
-
compare_generated_vs_original(generated, original,
|
150
|
+
test_uri = uri + "/pagination/page01.html"
|
151
|
+
compare_generated_vs_original(generated, original, test_uri)
|
161
152
|
end
|
162
153
|
|
163
154
|
it "return PaginateNode/TextNode with limit" do
|
@@ -174,8 +165,8 @@ describe 'Yasuri' do
|
|
174
165
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
175
166
|
], limit:2)
|
176
167
|
|
177
|
-
|
178
|
-
compare_generated_vs_original(generated, original,
|
168
|
+
test_uri = uri + "/pagination/page01.html"
|
169
|
+
compare_generated_vs_original(generated, original, test_uri)
|
179
170
|
end
|
180
171
|
|
181
172
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -197,8 +188,8 @@ describe 'Yasuri' do
|
|
197
188
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
198
189
|
])
|
199
190
|
])
|
200
|
-
|
201
|
-
compare_generated_vs_original(generated, original,
|
191
|
+
test_uri = uri + "/struct/structual_text.html"
|
192
|
+
compare_generated_vs_original(generated, original, test_uri)
|
202
193
|
end
|
203
194
|
end
|
204
195
|
|
@@ -1,106 +1,121 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
##########
|
7
|
-
# Struct #
|
8
|
-
##########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::StructNode' do
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
{ "title" => "The Perfect Insider",
|
8
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
9
|
+
let(:table1996) do
|
10
|
+
[
|
11
|
+
{ "title" => "The Perfect Insider",
|
18
12
|
"pub_date" => "1996/4/5" },
|
19
|
-
{ "title"
|
13
|
+
{ "title" => "Doctors in Isolated Room",
|
20
14
|
"pub_date" => "1996/7/5" },
|
21
|
-
{ "title"
|
22
|
-
"pub_date" => "1996/9/5" }
|
15
|
+
{ "title" => "Mathematical Goodbye",
|
16
|
+
"pub_date" => "1996/9/5" }
|
23
17
|
]
|
24
|
-
|
25
|
-
|
18
|
+
end
|
19
|
+
let(:table1997) do
|
20
|
+
[
|
21
|
+
{ "title" => "Jack the Poetical Private",
|
26
22
|
"pub_date" => "1997/1/5" },
|
27
|
-
{ "title"
|
23
|
+
{ "title" => "Who Inside",
|
28
24
|
"pub_date" => "1997/4/5" },
|
29
|
-
{ "title"
|
30
|
-
"pub_date" => "1997/10/5" }
|
25
|
+
{ "title" => "Illusion Acts Like Magic",
|
26
|
+
"pub_date" => "1997/10/5" }
|
31
27
|
]
|
32
|
-
|
33
|
-
|
28
|
+
end
|
29
|
+
let(:table1998) do
|
30
|
+
[
|
31
|
+
{ "title" => "Replaceable Summer",
|
34
32
|
"pub_date" => "1998/1/7" },
|
35
|
-
{ "title"
|
33
|
+
{ "title" => "Switch Back",
|
36
34
|
"pub_date" => "1998/4/5" },
|
37
|
-
{ "title"
|
35
|
+
{ "title" => "Numerical Models",
|
38
36
|
"pub_date" => "1998/7/5" },
|
39
|
-
{ "title"
|
40
|
-
"pub_date" => "1998/10/5" }
|
37
|
+
{ "title" => "The Perfect Outsider",
|
38
|
+
"pub_date" => "1998/10/5" }
|
41
39
|
]
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
40
|
+
end
|
41
|
+
|
42
|
+
let(:all_tables) do
|
43
|
+
[
|
44
|
+
{ "table" => table1996 },
|
45
|
+
{ "table" => table1997 },
|
46
|
+
{ "table" => table1998 }
|
46
47
|
]
|
47
48
|
end
|
48
49
|
|
49
50
|
it 'scrape single table contents' do
|
50
|
-
node = Yasuri::StructNode.new(
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
51
|
+
node = Yasuri::StructNode.new(
|
52
|
+
'/html/body/table[1]/tr', "table", [
|
53
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
54
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
55
|
+
]
|
56
|
+
)
|
57
|
+
expected = table1996
|
58
|
+
actual = node.scrape(uri_struct)
|
56
59
|
expect(actual).to match expected
|
57
60
|
end
|
58
61
|
|
59
62
|
it 'return single result without array' do
|
60
|
-
node = Yasuri::StructNode.new(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
63
|
+
node = Yasuri::StructNode.new(
|
64
|
+
'/html/body/table[1]/tr[1]', "table_first_tr", [
|
65
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
66
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
67
|
+
]
|
68
|
+
)
|
69
|
+
expected = table1996.first
|
70
|
+
actual = node.scrape(uri_struct)
|
66
71
|
expect(actual).to match expected
|
67
72
|
end
|
68
73
|
|
69
74
|
it 'return empty text if no match node' do
|
70
75
|
no_match_xpath = '/html/body/table[1]/t'
|
71
|
-
node = Yasuri::StructNode.new(
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
node = Yasuri::StructNode.new(
|
77
|
+
no_match_xpath, "table", [
|
78
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = node.scrape(uri_struct)
|
75
82
|
expect(actual).to be_empty
|
76
83
|
end
|
77
84
|
|
78
85
|
it 'fail with invalid xpath' do
|
79
86
|
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
80
|
-
node = Yasuri::StructNode.new(
|
81
|
-
|
82
|
-
|
83
|
-
|
87
|
+
node = Yasuri::StructNode.new(
|
88
|
+
invalid_xpath, "table", [
|
89
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
84
93
|
end
|
85
94
|
|
86
95
|
it 'fail with invalid xpath in children' do
|
87
96
|
invalid_xpath = './td[1]['
|
88
|
-
node = Yasuri::StructNode.new(
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
97
|
+
node = Yasuri::StructNode.new(
|
98
|
+
'/html/body/table[1]/tr', "table", [
|
99
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
100
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
101
|
+
]
|
102
|
+
)
|
103
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
93
104
|
end
|
94
105
|
|
95
106
|
it 'scrape all tables' do
|
96
|
-
node = Yasuri::StructNode.new(
|
97
|
-
|
98
|
-
Yasuri::
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
107
|
+
node = Yasuri::StructNode.new(
|
108
|
+
'/html/body/table', "tables", [
|
109
|
+
Yasuri::StructNode.new(
|
110
|
+
'./tr', "table", [
|
111
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
112
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
113
|
+
]
|
114
|
+
)
|
115
|
+
]
|
116
|
+
)
|
117
|
+
expected = all_tables
|
118
|
+
actual = node.scrape(uri_struct)
|
104
119
|
expect(actual).to match expected
|
105
120
|
end
|
106
121
|
|
@@ -111,62 +126,71 @@ describe 'Yasuri' do
|
|
111
126
|
text_pub_date './td[2]'
|
112
127
|
end
|
113
128
|
end
|
114
|
-
original = Yasuri::StructNode.new(
|
115
|
-
|
116
|
-
Yasuri::
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
129
|
+
original = Yasuri::StructNode.new(
|
130
|
+
'/html/body/table', "tables", [
|
131
|
+
Yasuri::StructNode.new(
|
132
|
+
'./tr', "table", [
|
133
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
134
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
135
|
+
]
|
136
|
+
)
|
137
|
+
]
|
138
|
+
)
|
139
|
+
compare_generated_vs_original(generated, original, uri_struct)
|
121
140
|
end
|
122
141
|
|
123
142
|
it 'return child node as symbol' do
|
124
|
-
node = Yasuri::StructNode.new(
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
143
|
+
node = Yasuri::StructNode.new(
|
144
|
+
'/html/body/table[1]/tr', "table", [
|
145
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
146
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
147
|
+
]
|
148
|
+
)
|
149
|
+
expected = table1996.map { |h| h.transform_keys(&:to_sym) }
|
150
|
+
actual = node.scrape(uri_struct, symbolize_names: true)
|
130
151
|
expect(actual).to match expected
|
131
152
|
end
|
132
|
-
|
133
153
|
end
|
134
154
|
|
135
155
|
describe '::StructNode::Links' do
|
136
|
-
|
137
|
-
|
138
|
-
|
156
|
+
let(:uri_struct) { "#{uri}/struct/structual_links.html" }
|
157
|
+
let(:table) do
|
158
|
+
[
|
139
159
|
{ "title" => "Child01,02",
|
140
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
160
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }] },
|
141
161
|
|
142
162
|
{ "title" => "Child01,02,03",
|
143
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
163
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }, { "p" => "Child 03 page." }] }
|
144
164
|
]
|
145
165
|
end
|
146
166
|
|
147
167
|
it 'return child node in links inside struct' do
|
148
|
-
node = Yasuri::StructNode.new(
|
149
|
-
|
150
|
-
|
151
|
-
Yasuri::
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
168
|
+
node = Yasuri::StructNode.new(
|
169
|
+
'/html/body/table/tr', "table", [
|
170
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
171
|
+
Yasuri::LinksNode.new(
|
172
|
+
'./td/a', "child", [
|
173
|
+
Yasuri::TextNode.new('/html/body/p', "p")
|
174
|
+
]
|
175
|
+
)
|
176
|
+
]
|
177
|
+
)
|
178
|
+
expected = table
|
179
|
+
actual = node.scrape(uri_struct)
|
156
180
|
expect(actual).to match expected
|
157
181
|
end
|
158
|
-
end
|
182
|
+
end
|
159
183
|
|
160
184
|
describe '::StructNode::Pages' do
|
161
|
-
|
162
|
-
@uri = uri + "/struct/structual_text.html"
|
163
|
-
end
|
185
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
164
186
|
|
165
187
|
it 'not supported' do
|
166
|
-
node = Yasuri::StructNode.new(
|
167
|
-
|
168
|
-
|
169
|
-
|
188
|
+
node = Yasuri::StructNode.new(
|
189
|
+
'/html/body/table[1]/tr', "table", [
|
190
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
191
|
+
]
|
192
|
+
)
|
193
|
+
expect { node.scrape(uri_struct) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
170
194
|
end
|
171
195
|
end
|
172
196
|
end
|