yasuri 3.0.0 → 3.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +70 -27
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -76
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +14 -9
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +9 -7
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -6
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +82 -58
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +105 -15
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
@@ -1,85 +1,86 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
#########
|
7
|
-
# Links #
|
8
|
-
#########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::LinksNode' do
|
13
|
-
before do
|
14
|
-
@agent = Mechanize.new
|
15
|
-
@uri = uri
|
16
|
-
@index_page = @agent.get(@uri)
|
17
|
-
end
|
18
|
-
|
19
8
|
it 'scrape links' do
|
20
|
-
root_node = Yasuri::LinksNode.new(
|
21
|
-
|
22
|
-
|
9
|
+
root_node = Yasuri::LinksNode.new(
|
10
|
+
'/html/body/a', "root", [
|
11
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
12
|
+
]
|
13
|
+
)
|
23
14
|
|
24
|
-
actual = root_node.
|
15
|
+
actual = root_node.scrape(uri)
|
25
16
|
expected = [
|
26
|
-
{"content" => "Child 01 page."},
|
27
|
-
{"content" => "Child 02 page."},
|
28
|
-
{"content" => "Child 03 page."}
|
17
|
+
{ "content" => "Child 01 page." },
|
18
|
+
{ "content" => "Child 02 page." },
|
19
|
+
{ "content" => "Child 03 page." }
|
29
20
|
]
|
30
21
|
expect(actual).to match expected
|
31
22
|
end
|
32
23
|
|
33
24
|
it 'return empty set if no match node' do
|
34
25
|
missing_xpath = '/html/body/b'
|
35
|
-
root_node = Yasuri::LinksNode.new(
|
36
|
-
|
37
|
-
|
26
|
+
root_node = Yasuri::LinksNode.new(
|
27
|
+
missing_xpath, "root", [
|
28
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
29
|
+
]
|
30
|
+
)
|
38
31
|
|
39
|
-
actual = root_node.
|
32
|
+
actual = root_node.scrape(uri)
|
40
33
|
expect(actual).to be_empty
|
41
34
|
end
|
42
35
|
|
43
36
|
it 'scrape links, recursive' do
|
44
|
-
root_node = Yasuri::LinksNode.new(
|
45
|
-
|
46
|
-
|
47
|
-
Yasuri::
|
48
|
-
|
49
|
-
|
50
|
-
|
37
|
+
root_node = Yasuri::LinksNode.new(
|
38
|
+
'/html/body/a', "root", [
|
39
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
40
|
+
Yasuri::LinksNode.new(
|
41
|
+
'/html/body/ul/li/a', "sub_link", [
|
42
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
43
|
+
]
|
44
|
+
)
|
45
|
+
]
|
46
|
+
)
|
47
|
+
actual = root_node.scrape(uri)
|
51
48
|
expected = [
|
52
|
-
{"content"
|
53
|
-
|
54
|
-
|
55
|
-
{"content" => "Child 02 page.",
|
56
|
-
|
57
|
-
{"content" => "Child 03 page.",
|
58
|
-
|
49
|
+
{ "content" => "Child 01 page.",
|
50
|
+
"sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
|
51
|
+
{ "sub_page_title" => "Child 02 SubPage Test" }] },
|
52
|
+
{ "content" => "Child 02 page.",
|
53
|
+
"sub_link" => [] },
|
54
|
+
{ "content" => "Child 03 page.",
|
55
|
+
"sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
|
59
56
|
]
|
60
57
|
expect(actual).to match expected
|
61
58
|
end
|
59
|
+
|
62
60
|
it 'can be defined by DSL, return no contains if no child node' do
|
63
61
|
root_node = Yasuri.links_title '/html/body/a'
|
64
|
-
actual = root_node.
|
62
|
+
actual = root_node.scrape(uri)
|
65
63
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
66
64
|
expect(actual).to match expected
|
67
65
|
end
|
68
66
|
|
69
67
|
it 'can be defined return no contains if no child node' do
|
70
68
|
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
71
|
-
actual = root_node.
|
69
|
+
actual = root_node.scrape(uri)
|
72
70
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
73
71
|
expect(actual).to match expected
|
74
72
|
end
|
73
|
+
|
75
74
|
it 'can be defined by DSL, return nested contents under link' do
|
76
75
|
generated = Yasuri.links_title '/html/body/a' do
|
77
|
-
|
78
|
-
|
79
|
-
original = Yasuri::LinksNode.new(
|
80
|
-
|
81
|
-
|
82
|
-
|
76
|
+
text_name '/html/body/p'
|
77
|
+
end
|
78
|
+
original = Yasuri::LinksNode.new(
|
79
|
+
'/html/body/a', "root", [
|
80
|
+
Yasuri::TextNode.new('/html/body/p', "name")
|
81
|
+
]
|
82
|
+
)
|
83
|
+
compare_generated_vs_original(generated, original, uri)
|
83
84
|
end
|
84
85
|
|
85
86
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -90,27 +91,50 @@ describe 'Yasuri' do
|
|
90
91
|
end
|
91
92
|
end
|
92
93
|
|
93
|
-
original = Yasuri::LinksNode.new(
|
94
|
-
|
95
|
-
|
96
|
-
Yasuri::
|
97
|
-
|
98
|
-
|
99
|
-
|
94
|
+
original = Yasuri::LinksNode.new(
|
95
|
+
'/html/body/a', "root", [
|
96
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
97
|
+
Yasuri::LinksNode.new(
|
98
|
+
'/html/body/ul/li/a', "sub_link", [
|
99
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
100
|
+
]
|
101
|
+
)
|
102
|
+
]
|
103
|
+
)
|
104
|
+
compare_generated_vs_original(generated, original, uri)
|
100
105
|
end
|
101
106
|
|
102
107
|
it 'return child node as symbol' do
|
103
|
-
root_node = Yasuri::LinksNode.new(
|
104
|
-
|
105
|
-
|
108
|
+
root_node = Yasuri::LinksNode.new(
|
109
|
+
'/html/body/a', "root", [
|
110
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
111
|
+
]
|
112
|
+
)
|
106
113
|
|
107
|
-
actual = root_node.
|
114
|
+
actual = root_node.scrape(uri, symbolize_names: true)
|
108
115
|
expected = [
|
109
|
-
{:
|
110
|
-
{:
|
111
|
-
{:
|
116
|
+
{ content: "Child 01 page." },
|
117
|
+
{ content: "Child 02 page." },
|
118
|
+
{ content: "Child 03 page." }
|
112
119
|
]
|
113
120
|
expect(actual).to match expected
|
114
121
|
end
|
122
|
+
|
123
|
+
it 'scrape with interval for each request' do
|
124
|
+
allow(Kernel).to receive(:sleep)
|
125
|
+
|
126
|
+
root_node = Yasuri::LinksNode.new(
|
127
|
+
'/html/body/a', "root", [
|
128
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
129
|
+
]
|
130
|
+
)
|
131
|
+
actual = root_node.scrape(uri, interval_ms: 100)
|
132
|
+
expect(actual.size).to match 3
|
133
|
+
|
134
|
+
# request will be run 4(1+3) times because root page will be requested
|
135
|
+
expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
|
136
|
+
expect(interval_sec).to match 0.1
|
137
|
+
end
|
138
|
+
end
|
115
139
|
end
|
116
140
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
describe '::MapNode' do
|
7
|
+
it "multi scrape in singe page" do
|
8
|
+
map = Yasuri.map_sample do
|
9
|
+
text_title '/html/head/title'
|
10
|
+
text_body_p '/html/body/p[1]'
|
11
|
+
end
|
12
|
+
actual = map.scrape(uri)
|
13
|
+
|
14
|
+
expected = {
|
15
|
+
"title" => "Yasuri Test",
|
16
|
+
"body_p" => "Hello,Yasuri"
|
17
|
+
}
|
18
|
+
expect(actual).to include expected
|
19
|
+
end
|
20
|
+
|
21
|
+
it "nested multi scrape in singe page" do
|
22
|
+
map = Yasuri.map_sample do
|
23
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
24
|
+
map_group2 do
|
25
|
+
text_child01 '/html/body/a[1]'
|
26
|
+
text_child03 '/html/body/a[3]'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
actual = map.scrape(uri)
|
30
|
+
|
31
|
+
expected = {
|
32
|
+
"group1" => {
|
33
|
+
"child01" => "child01"
|
34
|
+
},
|
35
|
+
"group2" => {
|
36
|
+
"child01" => "child01",
|
37
|
+
"child03" => "child03"
|
38
|
+
}
|
39
|
+
}
|
40
|
+
expect(actual).to include expected
|
41
|
+
end
|
42
|
+
|
43
|
+
it "scrape with links node" do
|
44
|
+
map = Yasuri.map_sample do
|
45
|
+
map_group1 do
|
46
|
+
links_a '/html/body/a' do
|
47
|
+
text_content '/html/body/p'
|
48
|
+
end
|
49
|
+
text_child01 '/html/body/a[1]'
|
50
|
+
end
|
51
|
+
map_group2 do
|
52
|
+
text_child03 '/html/body/a[3]'
|
53
|
+
end
|
54
|
+
end
|
55
|
+
actual = map.scrape(uri)
|
56
|
+
|
57
|
+
expected = {
|
58
|
+
"group1" => {
|
59
|
+
"a" => [
|
60
|
+
{ "content" => "Child 01 page." },
|
61
|
+
{ "content" => "Child 02 page." },
|
62
|
+
{ "content" => "Child 03 page." }
|
63
|
+
],
|
64
|
+
"child01" => "child01"
|
65
|
+
},
|
66
|
+
"group2" => { "child03" => "child03" }
|
67
|
+
}
|
68
|
+
expect(actual).to include expected
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -1,107 +1,96 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
############
|
7
|
-
# Paginate #
|
8
|
-
############
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::PaginateNode' do
|
13
|
-
|
14
|
-
@agent = Mechanize.new
|
15
|
-
@uri = uri + "/pagination/page01.html"
|
16
|
-
@page = @agent.get(@uri)
|
17
|
-
end
|
8
|
+
let(:uri_paginate) { "#{uri}/pagination/page01.html" }
|
18
9
|
|
19
10
|
it "scrape each paginated pages" do
|
20
|
-
root_node = Yasuri::PaginateNode.new(
|
21
|
-
|
22
|
-
|
23
|
-
|
11
|
+
root_node = Yasuri::PaginateNode.new(
|
12
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
13
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
14
|
+
]
|
15
|
+
)
|
16
|
+
actual = root_node.scrape(uri_paginate)
|
24
17
|
expected = [
|
25
|
-
{"content" => "PaginationTest01"},
|
26
|
-
{"content" => "PaginationTest02"},
|
27
|
-
{"content" => "PaginationTest03"},
|
28
|
-
{"content" => "PaginationTest04"}
|
18
|
+
{ "content" => "PaginationTest01" },
|
19
|
+
{ "content" => "PaginationTest02" },
|
20
|
+
{ "content" => "PaginationTest03" },
|
21
|
+
{ "content" => "PaginationTest04" }
|
29
22
|
]
|
30
23
|
expect(actual).to match expected
|
31
24
|
end
|
32
25
|
|
33
26
|
it "scrape each paginated pages with flatten" do
|
34
|
-
root_node = Yasuri::PaginateNode.new(
|
35
|
-
|
36
|
-
|
37
|
-
Yasuri::
|
38
|
-
|
39
|
-
|
40
|
-
|
27
|
+
root_node = Yasuri::PaginateNode.new(
|
28
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
29
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
30
|
+
Yasuri::StructNode.new(
|
31
|
+
'/html/body/nav/span', "span", [
|
32
|
+
Yasuri::TextNode.new('./a', "text")
|
33
|
+
]
|
34
|
+
)
|
35
|
+
], flatten: true
|
36
|
+
)
|
37
|
+
actual = root_node.scrape(uri_paginate)
|
41
38
|
expected = [
|
42
|
-
"PaginationTest01",
|
43
|
-
{"text"=>""},
|
44
|
-
{"text"=>""},
|
45
|
-
|
46
|
-
{"text" => "
|
47
|
-
{"text" => "4"},
|
48
|
-
{"text"=>"NextPage »"},
|
49
|
-
|
50
|
-
{"text"=>"« PreviousPage"},
|
51
|
-
{"text" => "1"},
|
52
|
-
{"text"=>""},
|
53
|
-
|
54
|
-
{"text" => "
|
55
|
-
{"text"=>"
|
56
|
-
"
|
57
|
-
{"text"=>"« PreviousPage"},
|
58
|
-
{"text" => "1"},
|
59
|
-
{"text" => "2"},
|
60
|
-
{"text"=>""},
|
61
|
-
{"text" => "4"},
|
62
|
-
{"text"=>"NextPage »"},
|
63
|
-
"PaginationTest04",
|
64
|
-
{"text"=>"« PreviousPage"},
|
65
|
-
{"text" => "1"},
|
66
|
-
{"text" => "2"},
|
67
|
-
{"text" => "3"},
|
68
|
-
{"text"=>""},
|
69
|
-
{"text"=>""},
|
39
|
+
"PaginationTest01", { "text" => "" },
|
40
|
+
{ "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
|
41
|
+
{ "text" => "NextPage »" },
|
42
|
+
|
43
|
+
"PaginationTest02", { "text" => "« PreviousPage" },
|
44
|
+
{ "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
|
45
|
+
{ "text" => "NextPage »" },
|
46
|
+
|
47
|
+
"PaginationTest03", { "text" => "« PreviousPage" },
|
48
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
|
49
|
+
{ "text" => "NextPage »" },
|
50
|
+
|
51
|
+
"PaginationTest04", { "text" => "« PreviousPage" },
|
52
|
+
{ "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
|
53
|
+
{ "text" => "" }
|
70
54
|
]
|
71
55
|
|
72
56
|
expect(actual).to match expected
|
73
57
|
end
|
74
58
|
|
75
|
-
|
76
59
|
it "scrape each paginated pages limited" do
|
77
|
-
root_node = Yasuri::PaginateNode.new(
|
78
|
-
|
79
|
-
|
80
|
-
|
60
|
+
root_node = Yasuri::PaginateNode.new(
|
61
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
62
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
63
|
+
], limit: 3
|
64
|
+
)
|
65
|
+
actual = root_node.scrape(uri_paginate)
|
81
66
|
expected = [
|
82
|
-
{"content" => "PaginationTest01"},
|
83
|
-
{"content" => "PaginationTest02"},
|
84
|
-
{"content" => "PaginationTest03"}
|
67
|
+
{ "content" => "PaginationTest01" },
|
68
|
+
{ "content" => "PaginationTest02" },
|
69
|
+
{ "content" => "PaginationTest03" }
|
85
70
|
]
|
86
71
|
expect(actual).to match expected
|
87
72
|
end
|
88
73
|
|
89
74
|
it 'return first content if paginate link node is not found' do
|
90
75
|
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
91
|
-
root_node = Yasuri::PaginateNode.new(
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
76
|
+
root_node = Yasuri::PaginateNode.new(
|
77
|
+
missing_xpath, "root", [
|
78
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = root_node.scrape(uri_paginate)
|
82
|
+
expected = [{ "content" => "PaginationTest01" }]
|
96
83
|
expect(actual).to match_array expected
|
97
84
|
end
|
98
85
|
|
99
86
|
it 'return empty hashes if content node is not found' do
|
100
|
-
root_node = Yasuri::PaginateNode.new(
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
87
|
+
root_node = Yasuri::PaginateNode.new(
|
88
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
89
|
+
Yasuri::TextNode.new('/html/body/hoge', "content")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
actual = root_node.scrape(uri_paginate)
|
93
|
+
expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
|
105
94
|
expect(actual).to match_array expected
|
106
95
|
end
|
107
96
|
|
@@ -109,34 +98,56 @@ describe 'Yasuri' do
|
|
109
98
|
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
110
99
|
text_content '/html/body/p'
|
111
100
|
end
|
112
|
-
original = Yasuri::PaginateNode.new(
|
113
|
-
|
114
|
-
|
115
|
-
|
101
|
+
original = Yasuri::PaginateNode.new(
|
102
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
103
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
104
|
+
]
|
105
|
+
)
|
106
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
116
107
|
end
|
117
108
|
|
118
109
|
it 'can be defined by DSL, return single PaginateNode content limited' do
|
119
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
|
110
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
|
120
111
|
text_content '/html/body/p'
|
121
112
|
end
|
122
|
-
original = Yasuri::PaginateNode.new(
|
123
|
-
|
124
|
-
|
125
|
-
|
113
|
+
original = Yasuri::PaginateNode.new(
|
114
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
115
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
116
|
+
], limit: 2
|
117
|
+
)
|
118
|
+
compare_generated_vs_original(generated, original, uri_paginate)
|
126
119
|
end
|
127
120
|
|
128
121
|
it "return child node as symbol" do
|
129
|
-
root_node = Yasuri::PaginateNode.new(
|
130
|
-
|
131
|
-
|
132
|
-
|
122
|
+
root_node = Yasuri::PaginateNode.new(
|
123
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
124
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
125
|
+
]
|
126
|
+
)
|
127
|
+
actual = root_node.scrape(uri_paginate, symbolize_names: true)
|
133
128
|
expected = [
|
134
|
-
{:
|
135
|
-
{:
|
136
|
-
{:
|
137
|
-
{:
|
129
|
+
{ content: "PaginationTest01" },
|
130
|
+
{ content: "PaginationTest02" },
|
131
|
+
{ content: "PaginationTest03" },
|
132
|
+
{ content: "PaginationTest04" }
|
138
133
|
]
|
139
134
|
expect(actual).to match expected
|
140
135
|
end
|
136
|
+
|
137
|
+
it "scrape with interval for each request" do
|
138
|
+
allow(Kernel).to receive(:sleep)
|
139
|
+
|
140
|
+
root_node = Yasuri::PaginateNode.new(
|
141
|
+
"/html/body/nav/span/a[@class='next']", "root", [
|
142
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
143
|
+
]
|
144
|
+
)
|
145
|
+
actual = root_node.scrape(uri_paginate, interval_ms: 1000)
|
146
|
+
expect(actual.size).to match 4
|
147
|
+
|
148
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
149
|
+
expect(interval_sec).to match 1.0
|
150
|
+
end
|
151
|
+
end
|
141
152
|
end
|
142
153
|
end
|