yasuri 2.0.13 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,77 +1,86 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- #########
7
- # Links #
8
- #########
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::LinksNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @uri = uri
16
- @index_page = @agent.get(@uri)
17
- end
18
-
19
8
  it 'scrape links' do
20
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
21
- Yasuri::TextNode.new('/html/body/p', "content"),
22
- ])
9
+ root_node = Yasuri::LinksNode.new(
10
+ '/html/body/a', "root", [
11
+ Yasuri::TextNode.new('/html/body/p', "content")
12
+ ]
13
+ )
23
14
 
24
- actual = root_node.inject(@agent, @index_page)
15
+ actual = root_node.scrape(uri)
25
16
  expected = [
26
- {"content" => "Child 01 page."},
27
- {"content" => "Child 02 page."},
28
- {"content" => "Child 03 page."},
17
+ { "content" => "Child 01 page." },
18
+ { "content" => "Child 02 page." },
19
+ { "content" => "Child 03 page." }
29
20
  ]
30
21
  expect(actual).to match expected
31
22
  end
32
23
 
33
24
  it 'return empty set if no match node' do
34
25
  missing_xpath = '/html/body/b'
35
- root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
36
- Yasuri::TextNode.new('/html/body/p', "content"),
37
- ])
26
+ root_node = Yasuri::LinksNode.new(
27
+ missing_xpath, "root", [
28
+ Yasuri::TextNode.new('/html/body/p', "content")
29
+ ]
30
+ )
38
31
 
39
- actual = root_node.inject(@agent, @index_page)
32
+ actual = root_node.scrape(uri)
40
33
  expect(actual).to be_empty
41
34
  end
42
35
 
43
36
  it 'scrape links, recursive' do
44
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
45
- Yasuri::TextNode.new('/html/body/p', "content"),
46
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
47
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
48
- ]),
49
- ])
50
- actual = root_node.inject(@agent, @index_page)
37
+ root_node = Yasuri::LinksNode.new(
38
+ '/html/body/a', "root", [
39
+ Yasuri::TextNode.new('/html/body/p', "content"),
40
+ Yasuri::LinksNode.new(
41
+ '/html/body/ul/li/a', "sub_link", [
42
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
43
+ ]
44
+ )
45
+ ]
46
+ )
47
+ actual = root_node.scrape(uri)
51
48
  expected = [
52
- {"content" => "Child 01 page.",
53
- "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
54
- {"sub_page_title" => "Child 02 SubPage Test"}],},
55
- {"content" => "Child 02 page.",
56
- "sub_link" => [],},
57
- {"content" => "Child 03 page.",
58
- "sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
49
+ { "content" => "Child 01 page.",
50
+ "sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
51
+ { "sub_page_title" => "Child 02 SubPage Test" }] },
52
+ { "content" => "Child 02 page.",
53
+ "sub_link" => [] },
54
+ { "content" => "Child 03 page.",
55
+ "sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
59
56
  ]
60
57
  expect(actual).to match expected
61
58
  end
62
- it 'can be defined by DSL, return single LinkNode title' do
63
- generated = Yasuri.links_title '/html/body/a'
64
- original = Yasuri::LinksNode.new('/html/body/a', "title")
65
- compare_generated_vs_original(generated, original, @index_page)
59
+
60
+ it 'can be defined by DSL, return no contains if no child node' do
61
+ root_node = Yasuri.links_title '/html/body/a'
62
+ actual = root_node.scrape(uri)
63
+ expected = [{}, {}, {}] # Each entry is an empty hash because the links node has no child nodes.
64
+ expect(actual).to match expected
65
+ end
66
+
67
+ it 'can be defined return no contains if no child node' do
68
+ root_node = Yasuri::LinksNode.new('/html/body/a', "title")
69
+ actual = root_node.scrape(uri)
70
+ expected = [{}, {}, {}] # Each entry is an empty hash because the links node has no child nodes.
71
+ expect(actual).to match expected
66
72
  end
73
+
67
74
  it 'can be defined by DSL, return nested contents under link' do
68
75
  generated = Yasuri.links_title '/html/body/a' do
69
- text_name '/html/body/p'
70
- end
71
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
72
- Yasuri::TextNode.new('/html/body/p', "name"),
73
- ])
74
- compare_generated_vs_original(generated, original, @index_page)
76
+ text_name '/html/body/p'
77
+ end
78
+ original = Yasuri::LinksNode.new(
79
+ '/html/body/a', "root", [
80
+ Yasuri::TextNode.new('/html/body/p', "name")
81
+ ]
82
+ )
83
+ compare_generated_vs_original(generated, original, uri)
75
84
  end
76
85
 
77
86
  it 'can be defined by DSL, return recursive links node' do
@@ -82,27 +91,50 @@ describe 'Yasuri' do
82
91
  end
83
92
  end
84
93
 
85
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
86
- Yasuri::TextNode.new('/html/body/p', "content"),
87
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
88
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
89
- ]),
90
- ])
91
- compare_generated_vs_original(generated, original, @index_page)
94
+ original = Yasuri::LinksNode.new(
95
+ '/html/body/a', "root", [
96
+ Yasuri::TextNode.new('/html/body/p', "content"),
97
+ Yasuri::LinksNode.new(
98
+ '/html/body/ul/li/a', "sub_link", [
99
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
100
+ ]
101
+ )
102
+ ]
103
+ )
104
+ compare_generated_vs_original(generated, original, uri)
92
105
  end
93
106
 
94
107
  it 'return child node as symbol' do
95
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
96
- Yasuri::TextNode.new('/html/body/p', "content"),
97
- ])
108
+ root_node = Yasuri::LinksNode.new(
109
+ '/html/body/a', "root", [
110
+ Yasuri::TextNode.new('/html/body/p', "content")
111
+ ]
112
+ )
98
113
 
99
- actual = root_node.inject(@agent, @index_page, symbolize_names: true )
114
+ actual = root_node.scrape(uri, symbolize_names: true)
100
115
  expected = [
101
- {:content => "Child 01 page."},
102
- {:content => "Child 02 page."},
103
- {:content => "Child 03 page."},
116
+ { content: "Child 01 page." },
117
+ { content: "Child 02 page." },
118
+ { content: "Child 03 page." }
104
119
  ]
105
120
  expect(actual).to match expected
106
121
  end
122
+
123
+ it 'scrape with interval for each request' do
124
+ allow(Kernel).to receive(:sleep)
125
+
126
+ root_node = Yasuri::LinksNode.new(
127
+ '/html/body/a', "root", [
128
+ Yasuri::TextNode.new('/html/body/p', "content")
129
+ ]
130
+ )
131
+ actual = root_node.scrape(uri, interval_ms: 100)
132
+ expect(actual.size).to match 3
133
+
134
+ # Requests are made 4 (1 + 3) times because the root page is requested in addition to the 3 linked pages.
135
+ expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
136
+ expect(interval_sec).to match 0.1
137
+ end
138
+ end
107
139
  end
108
140
  end
@@ -0,0 +1,71 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ describe '::MapNode' do
7
+ it "multi scrape in singe page" do
8
+ map = Yasuri.map_sample do
9
+ text_title '/html/head/title'
10
+ text_body_p '/html/body/p[1]'
11
+ end
12
+ actual = map.scrape(uri)
13
+
14
+ expected = {
15
+ "title" => "Yasuri Test",
16
+ "body_p" => "Hello,Yasuri"
17
+ }
18
+ expect(actual).to include expected
19
+ end
20
+
21
+ it "nested multi scrape in singe page" do
22
+ map = Yasuri.map_sample do
23
+ map_group1 { text_child01 '/html/body/a[1]' }
24
+ map_group2 do
25
+ text_child01 '/html/body/a[1]'
26
+ text_child03 '/html/body/a[3]'
27
+ end
28
+ end
29
+ actual = map.scrape(uri)
30
+
31
+ expected = {
32
+ "group1" => {
33
+ "child01" => "child01"
34
+ },
35
+ "group2" => {
36
+ "child01" => "child01",
37
+ "child03" => "child03"
38
+ }
39
+ }
40
+ expect(actual).to include expected
41
+ end
42
+
43
+ it "scrape with links node" do
44
+ map = Yasuri.map_sample do
45
+ map_group1 do
46
+ links_a '/html/body/a' do
47
+ text_content '/html/body/p'
48
+ end
49
+ text_child01 '/html/body/a[1]'
50
+ end
51
+ map_group2 do
52
+ text_child03 '/html/body/a[3]'
53
+ end
54
+ end
55
+ actual = map.scrape(uri)
56
+
57
+ expected = {
58
+ "group1" => {
59
+ "a" => [
60
+ { "content" => "Child 01 page." },
61
+ { "content" => "Child 02 page." },
62
+ { "content" => "Child 03 page." }
63
+ ],
64
+ "child01" => "child01"
65
+ },
66
+ "group2" => { "child03" => "child03" }
67
+ }
68
+ expect(actual).to include expected
69
+ end
70
+ end
71
+ end
@@ -1,107 +1,96 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- ############
7
- # Paginate #
8
- ############
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::PaginateNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @uri = uri + "/pagination/page01.html"
16
- @page = @agent.get(@uri)
17
- end
8
+ let(:uri_paginate) { "#{uri}/pagination/page01.html" }
18
9
 
19
10
  it "scrape each paginated pages" do
20
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
21
- Yasuri::TextNode.new('/html/body/p', "content"),
22
- ])
23
- actual = root_node.inject(@agent, @page)
11
+ root_node = Yasuri::PaginateNode.new(
12
+ "/html/body/nav/span/a[@class='next']", "root", [
13
+ Yasuri::TextNode.new('/html/body/p', "content")
14
+ ]
15
+ )
16
+ actual = root_node.scrape(uri_paginate)
24
17
  expected = [
25
- {"content" => "PaginationTest01"},
26
- {"content" => "PaginationTest02"},
27
- {"content" => "PaginationTest03"},
28
- {"content" => "PaginationTest04"},
18
+ { "content" => "PaginationTest01" },
19
+ { "content" => "PaginationTest02" },
20
+ { "content" => "PaginationTest03" },
21
+ { "content" => "PaginationTest04" }
29
22
  ]
30
23
  expect(actual).to match expected
31
24
  end
32
25
 
33
26
  it "scrape each paginated pages with flatten" do
34
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
- Yasuri::TextNode.new('/html/body/p', "content"),
36
- Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
- Yasuri::TextNode.new('./a', "text"),
38
- ]),
39
- ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
27
+ root_node = Yasuri::PaginateNode.new(
28
+ "/html/body/nav/span/a[@class='next']", "root", [
29
+ Yasuri::TextNode.new('/html/body/p', "content"),
30
+ Yasuri::StructNode.new(
31
+ '/html/body/nav/span', "span", [
32
+ Yasuri::TextNode.new('./a', "text")
33
+ ]
34
+ )
35
+ ], flatten: true
36
+ )
37
+ actual = root_node.scrape(uri_paginate)
41
38
  expected = [
42
- "PaginationTest01",
43
- {"text"=>""},
44
- {"text"=>""},
45
- {"text" => "2"},
46
- {"text" => "3"},
47
- {"text" => "4"},
48
- {"text"=>"NextPage »"},
49
- "PaginationTest02",
50
- {"text"=>"« PreviousPage"},
51
- {"text" => "1"},
52
- {"text"=>""},
53
- {"text" => "3"},
54
- {"text" => "4"},
55
- {"text"=>"NextPage »"},
56
- "PaginationTest03",
57
- {"text"=>"« PreviousPage"},
58
- {"text" => "1"},
59
- {"text" => "2"},
60
- {"text"=>""},
61
- {"text" => "4"},
62
- {"text"=>"NextPage »"},
63
- "PaginationTest04",
64
- {"text"=>"« PreviousPage"},
65
- {"text" => "1"},
66
- {"text" => "2"},
67
- {"text" => "3"},
68
- {"text"=>""},
69
- {"text"=>""},
39
+ "PaginationTest01", { "text" => "" },
40
+ { "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
41
+ { "text" => "NextPage »" },
42
+
43
+ "PaginationTest02", { "text" => "« PreviousPage" },
44
+ { "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
45
+ { "text" => "NextPage »" },
46
+
47
+ "PaginationTest03", { "text" => "« PreviousPage" },
48
+ { "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
49
+ { "text" => "NextPage »" },
50
+
51
+ "PaginationTest04", { "text" => "« PreviousPage" },
52
+ { "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
53
+ { "text" => "" }
70
54
  ]
71
55
 
72
56
  expect(actual).to match expected
73
57
  end
74
58
 
75
-
76
59
  it "scrape each paginated pages limited" do
77
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
- Yasuri::TextNode.new('/html/body/p', "content"),
79
- ], limit:3)
80
- actual = root_node.inject(@agent, @page)
60
+ root_node = Yasuri::PaginateNode.new(
61
+ "/html/body/nav/span/a[@class='next']", "root", [
62
+ Yasuri::TextNode.new('/html/body/p', "content")
63
+ ], limit: 3
64
+ )
65
+ actual = root_node.scrape(uri_paginate)
81
66
  expected = [
82
- {"content" => "PaginationTest01"},
83
- {"content" => "PaginationTest02"},
84
- {"content" => "PaginationTest03"},
67
+ { "content" => "PaginationTest01" },
68
+ { "content" => "PaginationTest02" },
69
+ { "content" => "PaginationTest03" }
85
70
  ]
86
71
  expect(actual).to match expected
87
72
  end
88
73
 
89
74
  it 'return first content if paginate link node is not found' do
90
75
  missing_xpath = "/html/body/nav/span/b[@class='next']"
91
- root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
92
- Yasuri::TextNode.new('/html/body/p', "content"),
93
- ])
94
- actual = root_node.inject(@agent, @page)
95
- expected = [ {"content" => "PaginationTest01"}, ]
76
+ root_node = Yasuri::PaginateNode.new(
77
+ missing_xpath, "root", [
78
+ Yasuri::TextNode.new('/html/body/p', "content")
79
+ ]
80
+ )
81
+ actual = root_node.scrape(uri_paginate)
82
+ expected = [{ "content" => "PaginationTest01" }]
96
83
  expect(actual).to match_array expected
97
84
  end
98
85
 
99
86
  it 'return empty hashes if content node is not found' do
100
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
101
- Yasuri::TextNode.new('/html/body/hoge', "content"),
102
- ])
103
- actual = root_node.inject(@agent, @page)
104
- expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
87
+ root_node = Yasuri::PaginateNode.new(
88
+ "/html/body/nav/span/a[@class='next']", "root", [
89
+ Yasuri::TextNode.new('/html/body/hoge', "content")
90
+ ]
91
+ )
92
+ actual = root_node.scrape(uri_paginate)
93
+ expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
105
94
  expect(actual).to match_array expected
106
95
  end
107
96
 
@@ -109,34 +98,56 @@ describe 'Yasuri' do
109
98
  generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
110
99
  text_content '/html/body/p'
111
100
  end
112
- original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
113
- Yasuri::TextNode.new('/html/body/p', "content"),
114
- ])
115
- compare_generated_vs_original(generated, original, @page)
101
+ original = Yasuri::PaginateNode.new(
102
+ "/html/body/nav/span/a[@class='next']", "root", [
103
+ Yasuri::TextNode.new('/html/body/p', "content")
104
+ ]
105
+ )
106
+ compare_generated_vs_original(generated, original, uri_paginate)
116
107
  end
117
108
 
118
109
  it 'can be defined by DSL, return single PaginateNode content limited' do
119
- generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
110
+ generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
120
111
  text_content '/html/body/p'
121
112
  end
122
- original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
123
- Yasuri::TextNode.new('/html/body/p', "content"),
124
- ], limit: 2)
125
- compare_generated_vs_original(generated, original, @page)
113
+ original = Yasuri::PaginateNode.new(
114
+ "/html/body/nav/span/a[@class='next']", "root", [
115
+ Yasuri::TextNode.new('/html/body/p', "content")
116
+ ], limit: 2
117
+ )
118
+ compare_generated_vs_original(generated, original, uri_paginate)
126
119
  end
127
120
 
128
121
  it "return child node as symbol" do
129
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
130
- Yasuri::TextNode.new('/html/body/p', "content"),
131
- ])
132
- actual = root_node.inject(@agent, @page, symbolize_names:true)
122
+ root_node = Yasuri::PaginateNode.new(
123
+ "/html/body/nav/span/a[@class='next']", "root", [
124
+ Yasuri::TextNode.new('/html/body/p', "content")
125
+ ]
126
+ )
127
+ actual = root_node.scrape(uri_paginate, symbolize_names: true)
133
128
  expected = [
134
- {:content => "PaginationTest01"},
135
- {:content => "PaginationTest02"},
136
- {:content => "PaginationTest03"},
137
- {:content => "PaginationTest04"},
129
+ { content: "PaginationTest01" },
130
+ { content: "PaginationTest02" },
131
+ { content: "PaginationTest03" },
132
+ { content: "PaginationTest04" }
138
133
  ]
139
134
  expect(actual).to match expected
140
135
  end
136
+
137
+ it "scrape with interval for each request" do
138
+ allow(Kernel).to receive(:sleep)
139
+
140
+ root_node = Yasuri::PaginateNode.new(
141
+ "/html/body/nav/span/a[@class='next']", "root", [
142
+ Yasuri::TextNode.new('/html/body/p', "content")
143
+ ]
144
+ )
145
+ actual = root_node.scrape(uri_paginate, interval_ms: 1000)
146
+ expect(actual.size).to match 4
147
+
148
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
149
+ expect(interval_sec).to match 1.0
150
+ end
151
+ end
141
152
  end
142
153
  end