yasuri 3.3.0 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
data/spec/yasuri_map_spec.rb
CHANGED
@@ -3,20 +3,16 @@ require_relative 'spec_helper'
|
|
3
3
|
describe 'Yasuri' do
|
4
4
|
include_context 'httpserver'
|
5
5
|
|
6
|
-
before do
|
7
|
-
@uri = uri
|
8
|
-
end
|
9
|
-
|
10
6
|
describe '::MapNode' do
|
11
7
|
it "multi scrape in singe page" do
|
12
8
|
map = Yasuri.map_sample do
|
13
9
|
text_title '/html/head/title'
|
14
10
|
text_body_p '/html/body/p[1]'
|
15
11
|
end
|
16
|
-
actual = map.scrape(
|
12
|
+
actual = map.scrape(uri)
|
17
13
|
|
18
14
|
expected = {
|
19
|
-
"title"
|
15
|
+
"title" => "Yasuri Test",
|
20
16
|
"body_p" => "Hello,Yasuri"
|
21
17
|
}
|
22
18
|
expect(actual).to include expected
|
@@ -24,13 +20,13 @@ describe 'Yasuri' do
|
|
24
20
|
|
25
21
|
it "nested multi scrape in singe page" do
|
26
22
|
map = Yasuri.map_sample do
|
27
|
-
map_group1 { text_child01
|
23
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
28
24
|
map_group2 do
|
29
25
|
text_child01 '/html/body/a[1]'
|
30
26
|
text_child03 '/html/body/a[3]'
|
31
27
|
end
|
32
28
|
end
|
33
|
-
actual = map.scrape(
|
29
|
+
actual = map.scrape(uri)
|
34
30
|
|
35
31
|
expected = {
|
36
32
|
"group1" => {
|
@@ -50,20 +46,20 @@ describe 'Yasuri' do
|
|
50
46
|
links_a '/html/body/a' do
|
51
47
|
text_content '/html/body/p'
|
52
48
|
end
|
53
|
-
text_child01
|
49
|
+
text_child01 '/html/body/a[1]'
|
54
50
|
end
|
55
51
|
map_group2 do
|
56
52
|
text_child03 '/html/body/a[3]'
|
57
53
|
end
|
58
54
|
end
|
59
|
-
actual = map.scrape(
|
55
|
+
actual = map.scrape(uri)
|
60
56
|
|
61
57
|
expected = {
|
62
58
|
"group1" => {
|
63
59
|
"a" => [
|
64
|
-
{"content" => "Child 01 page."},
|
65
|
-
{"content" => "Child 02 page."},
|
66
|
-
{"content" => "Child 03 page."}
|
60
|
+
{ "content" => "Child 01 page." },
|
61
|
+
{ "content" => "Child 02 page." },
|
62
|
+
{ "content" => "Child 03 page." }
|
67
63
|
],
|
68
64
|
"child01" => "child01"
|
69
65
|
},
|
@@ -72,4 +68,4 @@ describe 'Yasuri' do
|
|
72
68
|
expect(actual).to include expected
|
73
69
|
end
|
74
70
|
end
|
75
|
-
end
|
71
|
+
end
|
@@ -1,105 +1,96 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
############
|
7
|
-
# Paginate #
|
8
|
-
############
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::PaginateNode' do
|
13
|
-
|
14
|
-
@uri = uri + "/pagination/page01.html"
|
15
|
-
end
|
8
|
+
let(:uri_paginate) { "#{uri}/pagination/page01.html" }
|
16
9
|
|
17
10
|
it "scrape each paginated pages" do
|
18
|
-
root_node = Yasuri::PaginateNode.new(
|
19
|
-
|
20
|
-
|
21
|
-
|
11
|
+
root_node = Yasuri::PaginateNode.new(
|
12
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
13
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
14
|
+
]
|
15
|
+
)
|
16
|
+
actual = root_node.scrape(uri_paginate)
|
22
17
|
expected = [
|
23
|
-
{"content" => "PaginationTest01"},
|
24
|
-
{"content" => "PaginationTest02"},
|
25
|
-
{"content" => "PaginationTest03"},
|
26
|
-
{"content" => "PaginationTest04"}
|
18
|
+
{ "content" => "PaginationTest01" },
|
19
|
+
{ "content" => "PaginationTest02" },
|
20
|
+
{ "content" => "PaginationTest03" },
|
21
|
+
{ "content" => "PaginationTest04" }
|
27
22
|
]
|
28
23
|
expect(actual).to match expected
|
29
24
|
end
|
30
25
|
|
31
26
|
it "scrape each paginated pages with flatten" do
|
32
|
-
root_node = Yasuri::PaginateNode.new(
|
33
|
-
|
34
|
-
|
35
|
-
Yasuri::
|
36
|
-
|
37
|
-
|
38
|
-
|
27
|
+
root_node = Yasuri::PaginateNode.new(
|
28
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
29
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
30
|
+
Yasuri::StructNode.new(
|
31
|
+
'/html/body/nav/span', "span", [
|
32
|
+
Yasuri::TextNode.new('./a', "text")
|
33
|
+
]
|
34
|
+
)
|
35
|
+
], flatten: true
|
36
|
+
)
|
37
|
+
actual = root_node.scrape(uri_paginate)
|
39
38
|
expected = [
|
40
|
-
"PaginationTest01",
|
41
|
-
{"text"=>""},
|
42
|
-
{"text"=>""},
|
43
|
-
|
44
|
-
{"text" => "
|
45
|
-
{"text" => "4"},
|
46
|
-
{"text"=>"NextPage »"},
|
47
|
-
|
48
|
-
{"text"=>"« PreviousPage"},
|
49
|
-
{"text" => "1"},
|
50
|
-
{"text"=>""},
|
51
|
-
|
52
|
-
{"text" => "
|
53
|
-
{"text"=>"
|
54
|
-
"
|
55
|
-
{"text"=>"« PreviousPage"},
|
56
|
-
{"text" => "1"},
|
57
|
-
{"text" => "2"},
|
58
|
-
{"text"=>""},
|
59
|
-
{"text" => "4"},
|
60
|
-
{"text"=>"NextPage »"},
|
61
|
-
"PaginationTest04",
|
62
|
-
{"text"=>"« PreviousPage"},
|
63
|
-
{"text" => "1"},
|
64
|
-
{"text" => "2"},
|
65
|
-
{"text" => "3"},
|
66
|
-
{"text"=>""},
|
67
|
-
{"text"=>""},
|
39
|
+
"PaginationTest01", { "text" => "" },
|
40
|
+
{ "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
|
41
|
+
{ "text" => "NextPage »" },
|
42
|
+
|
43
|
+
"PaginationTest02", { "text" => "« PreviousPage" },
|
44
|
+
{ "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
|
45
|
+
{ "text" => "NextPage »" },
|
46
|
+
|
47
|
+
"PaginationTest03", { "text" => "« PreviousPage" },
|
48
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
|
49
|
+
{ "text" => "NextPage »" },
|
50
|
+
|
51
|
+
"PaginationTest04", { "text" => "« PreviousPage" },
|
52
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
|
53
|
+
{ "text" => "" }
|
68
54
|
]
|
69
55
|
|
70
56
|
expect(actual).to match expected
|
71
57
|
end
|
72
58
|
|
73
|
-
|
74
59
|
it "scrape each paginated pages limited" do
|
75
|
-
root_node = Yasuri::PaginateNode.new(
|
76
|
-
|
77
|
-
|
78
|
-
|
60
|
+
root_node = Yasuri::PaginateNode.new(
|
61
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
62
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
63
|
+
], limit: 3
|
64
|
+
)
|
65
|
+
actual = root_node.scrape(uri_paginate)
|
79
66
|
expected = [
|
80
|
-
{"content" => "PaginationTest01"},
|
81
|
-
{"content" => "PaginationTest02"},
|
82
|
-
{"content" => "PaginationTest03"}
|
67
|
+
{ "content" => "PaginationTest01" },
|
68
|
+
{ "content" => "PaginationTest02" },
|
69
|
+
{ "content" => "PaginationTest03" }
|
83
70
|
]
|
84
71
|
expect(actual).to match expected
|
85
72
|
end
|
86
73
|
|
87
74
|
it 'return first content if paginate link node is not found' do
|
88
75
|
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
89
|
-
root_node = Yasuri::PaginateNode.new(
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
76
|
+
root_node = Yasuri::PaginateNode.new(
|
77
|
+
missing_xpath, "root", [
|
78
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = root_node.scrape(uri_paginate)
|
82
|
+
expected = [{ "content" => "PaginationTest01" }]
|
94
83
|
expect(actual).to match_array expected
|
95
84
|
end
|
96
85
|
|
97
86
|
it 'return empty hashes if content node is not found' do
|
98
|
-
root_node = Yasuri::PaginateNode.new(
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
87
|
+
root_node = Yasuri::PaginateNode.new(
|
88
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
89
|
+
Yasuri::TextNode.new('/html/body/hoge', "content")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
actual = root_node.scrape(uri_paginate)
|
93
|
+
expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
|
103
94
|
expect(actual).to match_array expected
|
104
95
|
end
|
105
96
|
|
@@ -107,32 +98,38 @@ describe 'Yasuri' do
|
|
107
98
|
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
108
99
|
text_content '/html/body/p'
|
109
100
|
end
|
110
|
-
original = Yasuri::PaginateNode.new(
|
111
|
-
|
112
|
-
|
113
|
-
|
101
|
+
original = Yasuri::PaginateNode.new(
|
102
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
103
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
104
|
+
]
|
105
|
+
)
|
106
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
114
107
|
end
|
115
108
|
|
116
109
|
it 'can be defined by DSL, return single PaginateNode content limited' do
|
117
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
|
110
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
|
118
111
|
text_content '/html/body/p'
|
119
112
|
end
|
120
|
-
original = Yasuri::PaginateNode.new(
|
121
|
-
|
122
|
-
|
123
|
-
|
113
|
+
original = Yasuri::PaginateNode.new(
|
114
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
115
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
116
|
+
], limit: 2
|
117
|
+
)
|
118
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
124
119
|
end
|
125
120
|
|
126
121
|
it "return child node as symbol" do
|
127
|
-
root_node = Yasuri::PaginateNode.new(
|
128
|
-
|
129
|
-
|
130
|
-
|
122
|
+
root_node = Yasuri::PaginateNode.new(
|
123
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
124
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
125
|
+
]
|
126
|
+
)
|
127
|
+
actual = root_node.scrape(uri_paginate, symbolize_names: true)
|
131
128
|
expected = [
|
132
|
-
{:
|
133
|
-
{:
|
134
|
-
{:
|
135
|
-
{:
|
129
|
+
{ content: "PaginationTest01" },
|
130
|
+
{ content: "PaginationTest02" },
|
131
|
+
{ content: "PaginationTest03" },
|
132
|
+
{ content: "PaginationTest04" }
|
136
133
|
]
|
137
134
|
expect(actual).to match expected
|
138
135
|
end
|
@@ -140,10 +137,12 @@ describe 'Yasuri' do
|
|
140
137
|
it "scrape with interval for each request" do
|
141
138
|
allow(Kernel).to receive(:sleep)
|
142
139
|
|
143
|
-
root_node = Yasuri::PaginateNode.new(
|
144
|
-
|
145
|
-
|
146
|
-
|
140
|
+
root_node = Yasuri::PaginateNode.new(
|
141
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
142
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
143
|
+
]
|
144
|
+
)
|
145
|
+
actual = root_node.scrape(uri_paginate, interval_ms: 1000)
|
147
146
|
expect(actual.size).to match 4
|
148
147
|
|
149
148
|
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
data/spec/yasuri_spec.rb
CHANGED
@@ -1,17 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
|
-
|
5
1
|
require_relative 'spec_helper'
|
6
2
|
|
7
3
|
describe 'Yasuri' do
|
8
4
|
include_context 'httpserver'
|
9
5
|
|
10
|
-
before do
|
11
|
-
@uri = uri
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
6
|
############
|
16
7
|
# yam2tree #
|
17
8
|
############
|
@@ -27,7 +18,7 @@ describe 'Yasuri' do
|
|
27
18
|
generated = Yasuri.yaml2tree(src)
|
28
19
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
29
20
|
|
30
|
-
compare_generated_vs_original(generated, original,
|
21
|
+
compare_generated_vs_original(generated, original, uri)
|
31
22
|
end
|
32
23
|
|
33
24
|
it "return text node as symbol" do
|
@@ -38,7 +29,7 @@ describe 'Yasuri' do
|
|
38
29
|
generated = Yasuri.yaml2tree(src)
|
39
30
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
40
31
|
|
41
|
-
compare_generated_vs_original(generated, original,
|
32
|
+
compare_generated_vs_original(generated, original, uri)
|
42
33
|
end
|
43
34
|
|
44
35
|
it "return LinksNode/TextNode" do
|
@@ -53,7 +44,7 @@ describe 'Yasuri' do
|
|
53
44
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
54
45
|
])
|
55
46
|
|
56
|
-
compare_generated_vs_original(generated, original,
|
47
|
+
compare_generated_vs_original(generated, original, uri)
|
57
48
|
end
|
58
49
|
|
59
50
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -73,8 +64,8 @@ describe 'Yasuri' do
|
|
73
64
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
74
65
|
])
|
75
66
|
])
|
76
|
-
|
77
|
-
compare_generated_vs_original(generated, original,
|
67
|
+
test_uri = uri + "/struct/structual_text.html"
|
68
|
+
compare_generated_vs_original(generated, original, test_uri)
|
78
69
|
end
|
79
70
|
|
80
71
|
end # end of describe '.yaml2tree'
|
@@ -96,7 +87,7 @@ describe 'Yasuri' do
|
|
96
87
|
generated = Yasuri.json2tree(src)
|
97
88
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
98
89
|
|
99
|
-
compare_generated_vs_original(generated, original,
|
90
|
+
compare_generated_vs_original(generated, original, uri)
|
100
91
|
end
|
101
92
|
|
102
93
|
it "return TextNode with truncate_regexp" do
|
@@ -109,7 +100,7 @@ describe 'Yasuri' do
|
|
109
100
|
}|
|
110
101
|
generated = Yasuri.json2tree(src)
|
111
102
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
112
|
-
compare_generated_vs_original(generated, original,
|
103
|
+
compare_generated_vs_original(generated, original, uri)
|
113
104
|
end
|
114
105
|
|
115
106
|
it "return MapNode with TextNodes" do
|
@@ -123,7 +114,7 @@ describe 'Yasuri' do
|
|
123
114
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
124
115
|
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
125
116
|
])
|
126
|
-
compare_generated_vs_original(generated, original,
|
117
|
+
compare_generated_vs_original(generated, original, uri)
|
127
118
|
end
|
128
119
|
|
129
120
|
it "return LinksNode/TextNode" do
|
@@ -140,7 +131,7 @@ describe 'Yasuri' do
|
|
140
131
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
141
132
|
])
|
142
133
|
|
143
|
-
compare_generated_vs_original(generated, original,
|
134
|
+
compare_generated_vs_original(generated, original, uri)
|
144
135
|
end
|
145
136
|
|
146
137
|
it "return PaginateNode/TextNode" do
|
@@ -156,8 +147,8 @@ describe 'Yasuri' do
|
|
156
147
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
157
148
|
])
|
158
149
|
|
159
|
-
|
160
|
-
compare_generated_vs_original(generated, original,
|
150
|
+
test_uri = uri + "/pagination/page01.html"
|
151
|
+
compare_generated_vs_original(generated, original, test_uri)
|
161
152
|
end
|
162
153
|
|
163
154
|
it "return PaginateNode/TextNode with limit" do
|
@@ -174,8 +165,8 @@ describe 'Yasuri' do
|
|
174
165
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
175
166
|
], limit:2)
|
176
167
|
|
177
|
-
|
178
|
-
compare_generated_vs_original(generated, original,
|
168
|
+
test_uri = uri + "/pagination/page01.html"
|
169
|
+
compare_generated_vs_original(generated, original, test_uri)
|
179
170
|
end
|
180
171
|
|
181
172
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -197,8 +188,8 @@ describe 'Yasuri' do
|
|
197
188
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
198
189
|
])
|
199
190
|
])
|
200
|
-
|
201
|
-
compare_generated_vs_original(generated, original,
|
191
|
+
test_uri = uri + "/struct/structual_text.html"
|
192
|
+
compare_generated_vs_original(generated, original, test_uri)
|
202
193
|
end
|
203
194
|
end
|
204
195
|
|
@@ -1,106 +1,121 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
##########
|
7
|
-
# Struct #
|
8
|
-
##########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::StructNode' do
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
{ "title" => "The Perfect Insider",
|
8
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
9
|
+
let(:table1996) do
|
10
|
+
[
|
11
|
+
{ "title" => "The Perfect Insider",
|
18
12
|
"pub_date" => "1996/4/5" },
|
19
|
-
{ "title"
|
13
|
+
{ "title" => "Doctors in Isolated Room",
|
20
14
|
"pub_date" => "1996/7/5" },
|
21
|
-
{ "title"
|
22
|
-
"pub_date" => "1996/9/5" }
|
15
|
+
{ "title" => "Mathematical Goodbye",
|
16
|
+
"pub_date" => "1996/9/5" }
|
23
17
|
]
|
24
|
-
|
25
|
-
|
18
|
+
end
|
19
|
+
let(:table1997) do
|
20
|
+
[
|
21
|
+
{ "title" => "Jack the Poetical Private",
|
26
22
|
"pub_date" => "1997/1/5" },
|
27
|
-
{ "title"
|
23
|
+
{ "title" => "Who Inside",
|
28
24
|
"pub_date" => "1997/4/5" },
|
29
|
-
{ "title"
|
30
|
-
"pub_date" => "1997/10/5" }
|
25
|
+
{ "title" => "Illusion Acts Like Magic",
|
26
|
+
"pub_date" => "1997/10/5" }
|
31
27
|
]
|
32
|
-
|
33
|
-
|
28
|
+
end
|
29
|
+
let(:table1998) do
|
30
|
+
[
|
31
|
+
{ "title" => "Replaceable Summer",
|
34
32
|
"pub_date" => "1998/1/7" },
|
35
|
-
{ "title"
|
33
|
+
{ "title" => "Switch Back",
|
36
34
|
"pub_date" => "1998/4/5" },
|
37
|
-
{ "title"
|
35
|
+
{ "title" => "Numerical Models",
|
38
36
|
"pub_date" => "1998/7/5" },
|
39
|
-
{ "title"
|
40
|
-
"pub_date" => "1998/10/5" }
|
37
|
+
{ "title" => "The Perfect Outsider",
|
38
|
+
"pub_date" => "1998/10/5" }
|
41
39
|
]
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
40
|
+
end
|
41
|
+
|
42
|
+
let(:all_tables) do
|
43
|
+
[
|
44
|
+
{ "table" => table1996 },
|
45
|
+
{ "table" => table1997 },
|
46
|
+
{ "table" => table1998 }
|
46
47
|
]
|
47
48
|
end
|
48
49
|
|
49
50
|
it 'scrape single table contents' do
|
50
|
-
node = Yasuri::StructNode.new(
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
51
|
+
node = Yasuri::StructNode.new(
|
52
|
+
'/html/body/table[1]/tr', "table", [
|
53
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
54
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
55
|
+
]
|
56
|
+
)
|
57
|
+
expected = table1996
|
58
|
+
actual = node.scrape(uri_struct)
|
56
59
|
expect(actual).to match expected
|
57
60
|
end
|
58
61
|
|
59
62
|
it 'return single result without array' do
|
60
|
-
node = Yasuri::StructNode.new(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
63
|
+
node = Yasuri::StructNode.new(
|
64
|
+
'/html/body/table[1]/tr[1]', "table_first_tr", [
|
65
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
66
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
67
|
+
]
|
68
|
+
)
|
69
|
+
expected = table1996.first
|
70
|
+
actual = node.scrape(uri_struct)
|
66
71
|
expect(actual).to match expected
|
67
72
|
end
|
68
73
|
|
69
74
|
it 'return empty text if no match node' do
|
70
75
|
no_match_xpath = '/html/body/table[1]/t'
|
71
|
-
node = Yasuri::StructNode.new(
|
72
|
-
|
73
|
-
|
74
|
-
|
76
|
+
node = Yasuri::StructNode.new(
|
77
|
+
no_match_xpath, "table", [
|
78
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = node.scrape(uri_struct)
|
75
82
|
expect(actual).to be_empty
|
76
83
|
end
|
77
84
|
|
78
85
|
it 'fail with invalid xpath' do
|
79
86
|
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
80
|
-
node = Yasuri::StructNode.new(
|
81
|
-
|
82
|
-
|
83
|
-
|
87
|
+
node = Yasuri::StructNode.new(
|
88
|
+
invalid_xpath, "table", [
|
89
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
84
93
|
end
|
85
94
|
|
86
95
|
it 'fail with invalid xpath in children' do
|
87
96
|
invalid_xpath = './td[1]['
|
88
|
-
node = Yasuri::StructNode.new(
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
97
|
+
node = Yasuri::StructNode.new(
|
98
|
+
'/html/body/table[1]/tr', "table", [
|
99
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
100
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
101
|
+
]
|
102
|
+
)
|
103
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
93
104
|
end
|
94
105
|
|
95
106
|
it 'scrape all tables' do
|
96
|
-
node = Yasuri::StructNode.new(
|
97
|
-
|
98
|
-
Yasuri::
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
107
|
+
node = Yasuri::StructNode.new(
|
108
|
+
'/html/body/table', "tables", [
|
109
|
+
Yasuri::StructNode.new(
|
110
|
+
'./tr', "table", [
|
111
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
112
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
113
|
+
]
|
114
|
+
)
|
115
|
+
]
|
116
|
+
)
|
117
|
+
expected = all_tables
|
118
|
+
actual = node.scrape(uri_struct)
|
104
119
|
expect(actual).to match expected
|
105
120
|
end
|
106
121
|
|
@@ -111,62 +126,71 @@ describe 'Yasuri' do
|
|
111
126
|
text_pub_date './td[2]'
|
112
127
|
end
|
113
128
|
end
|
114
|
-
original = Yasuri::StructNode.new(
|
115
|
-
|
116
|
-
Yasuri::
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
129
|
+
original = Yasuri::StructNode.new(
|
130
|
+
'/html/body/table', "tables", [
|
131
|
+
Yasuri::StructNode.new(
|
132
|
+
'./tr', "table", [
|
133
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
134
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
135
|
+
]
|
136
|
+
)
|
137
|
+
]
|
138
|
+
)
|
139
|
+
compare_generated_vs_original(generated, original, uri_struct)
|
121
140
|
end
|
122
141
|
|
123
142
|
it 'return child node as symbol' do
|
124
|
-
node = Yasuri::StructNode.new(
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
143
|
+
node = Yasuri::StructNode.new(
|
144
|
+
'/html/body/table[1]/tr', "table", [
|
145
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
146
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
147
|
+
]
|
148
|
+
)
|
149
|
+
expected = table1996.map { |h| h.transform_keys(&:to_sym) }
|
150
|
+
actual = node.scrape(uri_struct, symbolize_names: true)
|
130
151
|
expect(actual).to match expected
|
131
152
|
end
|
132
|
-
|
133
153
|
end
|
134
154
|
|
135
155
|
describe '::StructNode::Links' do
|
136
|
-
|
137
|
-
|
138
|
-
|
156
|
+
let(:uri_struct) { "#{uri}/struct/structual_links.html" }
|
157
|
+
let(:table) do
|
158
|
+
[
|
139
159
|
{ "title" => "Child01,02",
|
140
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
160
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }] },
|
141
161
|
|
142
162
|
{ "title" => "Child01,02,03",
|
143
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
163
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }, { "p" => "Child 03 page." }] }
|
144
164
|
]
|
145
165
|
end
|
146
166
|
|
147
167
|
it 'return child node in links inside struct' do
|
148
|
-
node = Yasuri::StructNode.new(
|
149
|
-
|
150
|
-
|
151
|
-
Yasuri::
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
168
|
+
node = Yasuri::StructNode.new(
|
169
|
+
'/html/body/table/tr', "table", [
|
170
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
171
|
+
Yasuri::LinksNode.new(
|
172
|
+
'./td/a', "child", [
|
173
|
+
Yasuri::TextNode.new('/html/body/p', "p")
|
174
|
+
]
|
175
|
+
)
|
176
|
+
]
|
177
|
+
)
|
178
|
+
expected = table
|
179
|
+
actual = node.scrape(uri_struct)
|
156
180
|
expect(actual).to match expected
|
157
181
|
end
|
158
|
-
end
|
182
|
+
end
|
159
183
|
|
160
184
|
describe '::StructNode::Pages' do
|
161
|
-
|
162
|
-
@uri = uri + "/struct/structual_text.html"
|
163
|
-
end
|
185
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
164
186
|
|
165
187
|
it 'not supported' do
|
166
|
-
node = Yasuri::StructNode.new(
|
167
|
-
|
168
|
-
|
169
|
-
|
188
|
+
node = Yasuri::StructNode.new(
|
189
|
+
'/html/body/table[1]/tr', "table", [
|
190
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
191
|
+
]
|
192
|
+
)
|
193
|
+
expect { node.scrape(uri_struct) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
170
194
|
end
|
171
195
|
end
|
172
196
|
end
|