yasuri 3.0.0 → 3.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,85 +1,86 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- #########
7
- # Links #
8
- #########
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::LinksNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @uri = uri
16
- @index_page = @agent.get(@uri)
17
- end
18
-
19
8
  it 'scrape links' do
20
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
21
- Yasuri::TextNode.new('/html/body/p', "content"),
22
- ])
9
+ root_node = Yasuri::LinksNode.new(
10
+ '/html/body/a', "root", [
11
+ Yasuri::TextNode.new('/html/body/p', "content")
12
+ ]
13
+ )
23
14
 
24
- actual = root_node.inject(@agent, @index_page)
15
+ actual = root_node.scrape(uri)
25
16
  expected = [
26
- {"content" => "Child 01 page."},
27
- {"content" => "Child 02 page."},
28
- {"content" => "Child 03 page."},
17
+ { "content" => "Child 01 page." },
18
+ { "content" => "Child 02 page." },
19
+ { "content" => "Child 03 page." }
29
20
  ]
30
21
  expect(actual).to match expected
31
22
  end
32
23
 
33
24
  it 'return empty set if no match node' do
34
25
  missing_xpath = '/html/body/b'
35
- root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
36
- Yasuri::TextNode.new('/html/body/p', "content"),
37
- ])
26
+ root_node = Yasuri::LinksNode.new(
27
+ missing_xpath, "root", [
28
+ Yasuri::TextNode.new('/html/body/p', "content")
29
+ ]
30
+ )
38
31
 
39
- actual = root_node.inject(@agent, @index_page)
32
+ actual = root_node.scrape(uri)
40
33
  expect(actual).to be_empty
41
34
  end
42
35
 
43
36
  it 'scrape links, recursive' do
44
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
45
- Yasuri::TextNode.new('/html/body/p', "content"),
46
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
47
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
48
- ]),
49
- ])
50
- actual = root_node.inject(@agent, @index_page)
37
+ root_node = Yasuri::LinksNode.new(
38
+ '/html/body/a', "root", [
39
+ Yasuri::TextNode.new('/html/body/p', "content"),
40
+ Yasuri::LinksNode.new(
41
+ '/html/body/ul/li/a', "sub_link", [
42
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
43
+ ]
44
+ )
45
+ ]
46
+ )
47
+ actual = root_node.scrape(uri)
51
48
  expected = [
52
- {"content" => "Child 01 page.",
53
- "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
54
- {"sub_page_title" => "Child 02 SubPage Test"}],},
55
- {"content" => "Child 02 page.",
56
- "sub_link" => [],},
57
- {"content" => "Child 03 page.",
58
- "sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
49
+ { "content" => "Child 01 page.",
50
+ "sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
51
+ { "sub_page_title" => "Child 02 SubPage Test" }] },
52
+ { "content" => "Child 02 page.",
53
+ "sub_link" => [] },
54
+ { "content" => "Child 03 page.",
55
+ "sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
59
56
  ]
60
57
  expect(actual).to match expected
61
58
  end
59
+
62
60
  it 'can be defined by DSL, return no contains if no child node' do
63
61
  root_node = Yasuri.links_title '/html/body/a'
64
- actual = root_node.inject(@agent, @index_page)
62
+ actual = root_node.scrape(uri)
65
63
  expected = [{}, {}, {}] # Empty if no child node under links node.
66
64
  expect(actual).to match expected
67
65
  end
68
66
 
69
67
  it 'can be defined return no contains if no child node' do
70
68
  root_node = Yasuri::LinksNode.new('/html/body/a', "title")
71
- actual = root_node.inject(@agent, @index_page)
69
+ actual = root_node.scrape(uri)
72
70
  expected = [{}, {}, {}] # Empty if no child node under links node.
73
71
  expect(actual).to match expected
74
72
  end
73
+
75
74
  it 'can be defined by DSL, return nested contents under link' do
76
75
  generated = Yasuri.links_title '/html/body/a' do
77
- text_name '/html/body/p'
78
- end
79
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
80
- Yasuri::TextNode.new('/html/body/p', "name"),
81
- ])
82
- compare_generated_vs_original(generated, original, @index_page)
76
+ text_name '/html/body/p'
77
+ end
78
+ original = Yasuri::LinksNode.new(
79
+ '/html/body/a', "root", [
80
+ Yasuri::TextNode.new('/html/body/p', "name")
81
+ ]
82
+ )
83
+ compare_generated_vs_original(generated, original, uri)
83
84
  end
84
85
 
85
86
  it 'can be defined by DSL, return recursive links node' do
@@ -90,27 +91,50 @@ describe 'Yasuri' do
90
91
  end
91
92
  end
92
93
 
93
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
94
- Yasuri::TextNode.new('/html/body/p', "content"),
95
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
96
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
97
- ]),
98
- ])
99
- compare_generated_vs_original(generated, original, @index_page)
94
+ original = Yasuri::LinksNode.new(
95
+ '/html/body/a', "root", [
96
+ Yasuri::TextNode.new('/html/body/p', "content"),
97
+ Yasuri::LinksNode.new(
98
+ '/html/body/ul/li/a', "sub_link", [
99
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
100
+ ]
101
+ )
102
+ ]
103
+ )
104
+ compare_generated_vs_original(generated, original, uri)
100
105
  end
101
106
 
102
107
  it 'return child node as symbol' do
103
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
104
- Yasuri::TextNode.new('/html/body/p', "content"),
105
- ])
108
+ root_node = Yasuri::LinksNode.new(
109
+ '/html/body/a', "root", [
110
+ Yasuri::TextNode.new('/html/body/p', "content")
111
+ ]
112
+ )
106
113
 
107
- actual = root_node.inject(@agent, @index_page, symbolize_names: true )
114
+ actual = root_node.scrape(uri, symbolize_names: true)
108
115
  expected = [
109
- {:content => "Child 01 page."},
110
- {:content => "Child 02 page."},
111
- {:content => "Child 03 page."},
116
+ { content: "Child 01 page." },
117
+ { content: "Child 02 page." },
118
+ { content: "Child 03 page." }
112
119
  ]
113
120
  expect(actual).to match expected
114
121
  end
122
+
123
+ it 'scrape with interval for each request' do
124
+ allow(Kernel).to receive(:sleep)
125
+
126
+ root_node = Yasuri::LinksNode.new(
127
+ '/html/body/a', "root", [
128
+ Yasuri::TextNode.new('/html/body/p', "content")
129
+ ]
130
+ )
131
+ actual = root_node.scrape(uri, interval_ms: 100)
132
+ expect(actual.size).to match 3
133
+
134
+ # Requests run 4 (1 + 3) times: once for the root page, plus once for each of the 3 linked pages.
135
+ expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
136
+ expect(interval_sec).to match 0.1
137
+ end
138
+ end
115
139
  end
116
140
  end
@@ -0,0 +1,71 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ describe '::MapNode' do
7
+ it "multi scrape in singe page" do
8
+ map = Yasuri.map_sample do
9
+ text_title '/html/head/title'
10
+ text_body_p '/html/body/p[1]'
11
+ end
12
+ actual = map.scrape(uri)
13
+
14
+ expected = {
15
+ "title" => "Yasuri Test",
16
+ "body_p" => "Hello,Yasuri"
17
+ }
18
+ expect(actual).to include expected
19
+ end
20
+
21
+ it "nested multi scrape in singe page" do
22
+ map = Yasuri.map_sample do
23
+ map_group1 { text_child01 '/html/body/a[1]' }
24
+ map_group2 do
25
+ text_child01 '/html/body/a[1]'
26
+ text_child03 '/html/body/a[3]'
27
+ end
28
+ end
29
+ actual = map.scrape(uri)
30
+
31
+ expected = {
32
+ "group1" => {
33
+ "child01" => "child01"
34
+ },
35
+ "group2" => {
36
+ "child01" => "child01",
37
+ "child03" => "child03"
38
+ }
39
+ }
40
+ expect(actual).to include expected
41
+ end
42
+
43
+ it "scrape with links node" do
44
+ map = Yasuri.map_sample do
45
+ map_group1 do
46
+ links_a '/html/body/a' do
47
+ text_content '/html/body/p'
48
+ end
49
+ text_child01 '/html/body/a[1]'
50
+ end
51
+ map_group2 do
52
+ text_child03 '/html/body/a[3]'
53
+ end
54
+ end
55
+ actual = map.scrape(uri)
56
+
57
+ expected = {
58
+ "group1" => {
59
+ "a" => [
60
+ { "content" => "Child 01 page." },
61
+ { "content" => "Child 02 page." },
62
+ { "content" => "Child 03 page." }
63
+ ],
64
+ "child01" => "child01"
65
+ },
66
+ "group2" => { "child03" => "child03" }
67
+ }
68
+ expect(actual).to include expected
69
+ end
70
+ end
71
+ end
@@ -1,107 +1,96 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- ############
7
- # Paginate #
8
- ############
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::PaginateNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @uri = uri + "/pagination/page01.html"
16
- @page = @agent.get(@uri)
17
- end
8
+ let(:uri_paginate) { "#{uri}/pagination/page01.html" }
18
9
 
19
10
  it "scrape each paginated pages" do
20
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
21
- Yasuri::TextNode.new('/html/body/p', "content"),
22
- ])
23
- actual = root_node.inject(@agent, @page)
11
+ root_node = Yasuri::PaginateNode.new(
12
+ "/html/body/nav/span/a[@class='next']", "root", [
13
+ Yasuri::TextNode.new('/html/body/p', "content")
14
+ ]
15
+ )
16
+ actual = root_node.scrape(uri_paginate)
24
17
  expected = [
25
- {"content" => "PaginationTest01"},
26
- {"content" => "PaginationTest02"},
27
- {"content" => "PaginationTest03"},
28
- {"content" => "PaginationTest04"},
18
+ { "content" => "PaginationTest01" },
19
+ { "content" => "PaginationTest02" },
20
+ { "content" => "PaginationTest03" },
21
+ { "content" => "PaginationTest04" }
29
22
  ]
30
23
  expect(actual).to match expected
31
24
  end
32
25
 
33
26
  it "scrape each paginated pages with flatten" do
34
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
- Yasuri::TextNode.new('/html/body/p', "content"),
36
- Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
- Yasuri::TextNode.new('./a', "text"),
38
- ]),
39
- ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
27
+ root_node = Yasuri::PaginateNode.new(
28
+ "/html/body/nav/span/a[@class='next']", "root", [
29
+ Yasuri::TextNode.new('/html/body/p', "content"),
30
+ Yasuri::StructNode.new(
31
+ '/html/body/nav/span', "span", [
32
+ Yasuri::TextNode.new('./a', "text")
33
+ ]
34
+ )
35
+ ], flatten: true
36
+ )
37
+ actual = root_node.scrape(uri_paginate)
41
38
  expected = [
42
- "PaginationTest01",
43
- {"text"=>""},
44
- {"text"=>""},
45
- {"text" => "2"},
46
- {"text" => "3"},
47
- {"text" => "4"},
48
- {"text"=>"NextPage »"},
49
- "PaginationTest02",
50
- {"text"=>"« PreviousPage"},
51
- {"text" => "1"},
52
- {"text"=>""},
53
- {"text" => "3"},
54
- {"text" => "4"},
55
- {"text"=>"NextPage »"},
56
- "PaginationTest03",
57
- {"text"=>"« PreviousPage"},
58
- {"text" => "1"},
59
- {"text" => "2"},
60
- {"text"=>""},
61
- {"text" => "4"},
62
- {"text"=>"NextPage »"},
63
- "PaginationTest04",
64
- {"text"=>"« PreviousPage"},
65
- {"text" => "1"},
66
- {"text" => "2"},
67
- {"text" => "3"},
68
- {"text"=>""},
69
- {"text"=>""},
39
+ "PaginationTest01", { "text" => "" },
40
+ { "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
41
+ { "text" => "NextPage »" },
42
+
43
+ "PaginationTest02", { "text" => "« PreviousPage" },
44
+ { "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
45
+ { "text" => "NextPage »" },
46
+
47
+ "PaginationTest03", { "text" => "« PreviousPage" },
48
+ { "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
49
+ { "text" => "NextPage »" },
50
+
51
+ "PaginationTest04", { "text" => "« PreviousPage" },
52
+ { "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
53
+ { "text" => "" }
70
54
  ]
71
55
 
72
56
  expect(actual).to match expected
73
57
  end
74
58
 
75
-
76
59
  it "scrape each paginated pages limited" do
77
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
- Yasuri::TextNode.new('/html/body/p', "content"),
79
- ], limit:3)
80
- actual = root_node.inject(@agent, @page)
60
+ root_node = Yasuri::PaginateNode.new(
61
+ "/html/body/nav/span/a[@class='next']", "root", [
62
+ Yasuri::TextNode.new('/html/body/p', "content")
63
+ ], limit: 3
64
+ )
65
+ actual = root_node.scrape(uri_paginate)
81
66
  expected = [
82
- {"content" => "PaginationTest01"},
83
- {"content" => "PaginationTest02"},
84
- {"content" => "PaginationTest03"},
67
+ { "content" => "PaginationTest01" },
68
+ { "content" => "PaginationTest02" },
69
+ { "content" => "PaginationTest03" }
85
70
  ]
86
71
  expect(actual).to match expected
87
72
  end
88
73
 
89
74
  it 'return first content if paginate link node is not found' do
90
75
  missing_xpath = "/html/body/nav/span/b[@class='next']"
91
- root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
92
- Yasuri::TextNode.new('/html/body/p', "content"),
93
- ])
94
- actual = root_node.inject(@agent, @page)
95
- expected = [ {"content" => "PaginationTest01"}, ]
76
+ root_node = Yasuri::PaginateNode.new(
77
+ missing_xpath, "root", [
78
+ Yasuri::TextNode.new('/html/body/p', "content")
79
+ ]
80
+ )
81
+ actual = root_node.scrape(uri_paginate)
82
+ expected = [{ "content" => "PaginationTest01" }]
96
83
  expect(actual).to match_array expected
97
84
  end
98
85
 
99
86
  it 'return empty hashes if content node is not found' do
100
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
101
- Yasuri::TextNode.new('/html/body/hoge', "content"),
102
- ])
103
- actual = root_node.inject(@agent, @page)
104
- expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
87
+ root_node = Yasuri::PaginateNode.new(
88
+ "/html/body/nav/span/a[@class='next']", "root", [
89
+ Yasuri::TextNode.new('/html/body/hoge', "content")
90
+ ]
91
+ )
92
+ actual = root_node.scrape(uri_paginate)
93
+ expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
105
94
  expect(actual).to match_array expected
106
95
  end
107
96
 
@@ -109,34 +98,56 @@ describe 'Yasuri' do
109
98
  generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
110
99
  text_content '/html/body/p'
111
100
  end
112
- original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
113
- Yasuri::TextNode.new('/html/body/p', "content"),
114
- ])
115
- compare_generated_vs_original(generated, original, @page)
101
+ original = Yasuri::PaginateNode.new(
102
+ "/html/body/nav/span/a[@class='next']", "root", [
103
+ Yasuri::TextNode.new('/html/body/p', "content")
104
+ ]
105
+ )
106
+ compare_generated_vs_original(generated, original, uri_paginate)
116
107
  end
117
108
 
118
109
  it 'can be defined by DSL, return single PaginateNode content limited' do
119
- generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
110
+ generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
120
111
  text_content '/html/body/p'
121
112
  end
122
- original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
123
- Yasuri::TextNode.new('/html/body/p', "content"),
124
- ], limit: 2)
125
- compare_generated_vs_original(generated, original, @page)
113
+ original = Yasuri::PaginateNode.new(
114
+ "/html/body/nav/span/a[@class='next']", "root", [
115
+ Yasuri::TextNode.new('/html/body/p', "content")
116
+ ], limit: 2
117
+ )
118
+ compare_generated_vs_original(generated, original, uri_paginate)
126
119
  end
127
120
 
128
121
  it "return child node as symbol" do
129
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
130
- Yasuri::TextNode.new('/html/body/p', "content"),
131
- ])
132
- actual = root_node.inject(@agent, @page, symbolize_names:true)
122
+ root_node = Yasuri::PaginateNode.new(
123
+ "/html/body/nav/span/a[@class='next']", "root", [
124
+ Yasuri::TextNode.new('/html/body/p', "content")
125
+ ]
126
+ )
127
+ actual = root_node.scrape(uri_paginate, symbolize_names: true)
133
128
  expected = [
134
- {:content => "PaginationTest01"},
135
- {:content => "PaginationTest02"},
136
- {:content => "PaginationTest03"},
137
- {:content => "PaginationTest04"},
129
+ { content: "PaginationTest01" },
130
+ { content: "PaginationTest02" },
131
+ { content: "PaginationTest03" },
132
+ { content: "PaginationTest04" }
138
133
  ]
139
134
  expect(actual).to match expected
140
135
  end
136
+
137
+ it "scrape with interval for each request" do
138
+ allow(Kernel).to receive(:sleep)
139
+
140
+ root_node = Yasuri::PaginateNode.new(
141
+ "/html/body/nav/span/a[@class='next']", "root", [
142
+ Yasuri::TextNode.new('/html/body/p', "content")
143
+ ]
144
+ )
145
+ actual = root_node.scrape(uri_paginate, interval_ms: 1000)
146
+ expect(actual.size).to match 4
147
+
148
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
149
+ expect(interval_sec).to match 1.0
150
+ end
151
+ end
141
152
  end
142
153
  end