yasuri 2.0.12 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,75 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ before do
7
+ @uri = uri
8
+ end
9
+
10
+ describe '::MapNode' do
11
+ it "multi scrape in singe page" do
12
+ map = Yasuri.map_sample do
13
+ text_title '/html/head/title'
14
+ text_body_p '/html/body/p[1]'
15
+ end
16
+ actual = map.scrape(@uri)
17
+
18
+ expected = {
19
+ "title" => "Yasuri Test",
20
+ "body_p" => "Hello,Yasuri"
21
+ }
22
+ expect(actual).to include expected
23
+ end
24
+
25
+ it "nested multi scrape in singe page" do
26
+ map = Yasuri.map_sample do
27
+ map_group1 { text_child01 '/html/body/a[1]' }
28
+ map_group2 do
29
+ text_child01 '/html/body/a[1]'
30
+ text_child03 '/html/body/a[3]'
31
+ end
32
+ end
33
+ actual = map.scrape(@uri)
34
+
35
+ expected = {
36
+ "group1" => {
37
+ "child01" => "child01"
38
+ },
39
+ "group2" => {
40
+ "child01" => "child01",
41
+ "child03" => "child03"
42
+ }
43
+ }
44
+ expect(actual).to include expected
45
+ end
46
+
47
+ it "scrape with links node" do
48
+ map = Yasuri.map_sample do
49
+ map_group1 do
50
+ links_a '/html/body/a' do
51
+ text_content '/html/body/p'
52
+ end
53
+ text_child01 '/html/body/a[1]'
54
+ end
55
+ map_group2 do
56
+ text_child03 '/html/body/a[3]'
57
+ end
58
+ end
59
+ actual = map.scrape(@uri)
60
+
61
+ expected = {
62
+ "group1" => {
63
+ "a" => [
64
+ {"content" => "Child 01 page."},
65
+ {"content" => "Child 02 page."},
66
+ {"content" => "Child 03 page."},
67
+ ],
68
+ "child01" => "child01"
69
+ },
70
+ "group2" => { "child03" => "child03" }
71
+ }
72
+ expect(actual).to include expected
73
+ end
74
+ end
75
+ end
@@ -11,16 +11,14 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::PaginateNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri + "/pagination/page01.html"
16
- @page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it "scrape each paginated pages" do
20
18
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
- actual = root_node.inject(@agent, @page)
21
+ actual = root_node.scrape(@uri)
24
22
  expected = [
25
23
  {"content" => "PaginationTest01"},
26
24
  {"content" => "PaginationTest02"},
@@ -37,7 +35,7 @@ describe 'Yasuri' do
37
35
  Yasuri::TextNode.new('./a', "text"),
38
36
  ]),
39
37
  ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
38
+ actual = root_node.scrape(@uri)
41
39
  expected = [
42
40
  "PaginationTest01",
43
41
  {"text"=>""},
@@ -77,7 +75,7 @@ describe 'Yasuri' do
77
75
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
76
  Yasuri::TextNode.new('/html/body/p', "content"),
79
77
  ], limit:3)
80
- actual = root_node.inject(@agent, @page)
78
+ actual = root_node.scrape(@uri)
81
79
  expected = [
82
80
  {"content" => "PaginationTest01"},
83
81
  {"content" => "PaginationTest02"},
@@ -91,7 +89,7 @@ describe 'Yasuri' do
91
89
  root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
92
90
  Yasuri::TextNode.new('/html/body/p', "content"),
93
91
  ])
94
- actual = root_node.inject(@agent, @page)
92
+ actual = root_node.scrape(@uri)
95
93
  expected = [ {"content" => "PaginationTest01"}, ]
96
94
  expect(actual).to match_array expected
97
95
  end
@@ -100,7 +98,7 @@ describe 'Yasuri' do
100
98
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
101
99
  Yasuri::TextNode.new('/html/body/hoge', "content"),
102
100
  ])
103
- actual = root_node.inject(@agent, @page)
101
+ actual = root_node.scrape(@uri)
104
102
  expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
105
103
  expect(actual).to match_array expected
106
104
  end
@@ -112,7 +110,7 @@ describe 'Yasuri' do
112
110
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
113
111
  Yasuri::TextNode.new('/html/body/p', "content"),
114
112
  ])
115
- compare_generated_vs_original(generated, original, @page)
113
+ compare_generated_vs_original(generated, original, @uri)
116
114
  end
117
115
 
118
116
  it 'can be defined by DSL, return single PaginateNode content limited' do
@@ -122,14 +120,14 @@ describe 'Yasuri' do
122
120
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
123
121
  Yasuri::TextNode.new('/html/body/p', "content"),
124
122
  ], limit: 2)
125
- compare_generated_vs_original(generated, original, @page)
123
+ compare_generated_vs_original(generated, original, @uri)
126
124
  end
127
125
 
128
126
  it "return child node as symbol" do
129
127
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
130
128
  Yasuri::TextNode.new('/html/body/p', "content"),
131
129
  ])
132
- actual = root_node.inject(@agent, @page, symbolize_names:true)
130
+ actual = root_node.scrape(@uri, symbolize_names:true)
133
131
  expected = [
134
132
  {:content => "PaginationTest01"},
135
133
  {:content => "PaginationTest02"},
@@ -138,5 +136,19 @@ describe 'Yasuri' do
138
136
  ]
139
137
  expect(actual).to match expected
140
138
  end
139
+
140
+ it "scrape with interval for each request" do
141
+ allow(Kernel).to receive(:sleep)
142
+
143
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
144
+ Yasuri::TextNode.new('/html/body/p', "content"),
145
+ ])
146
+ actual = root_node.scrape(@uri, interval_ms: 1000)
147
+ expect(actual.size).to match 4
148
+
149
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
150
+ expect(interval_sec).to match 1.0
151
+ end
152
+ end
141
153
  end
142
154
  end
data/spec/yasuri_spec.rb CHANGED
@@ -8,11 +8,78 @@ describe 'Yasuri' do
8
8
  include_context 'httpserver'
9
9
 
10
10
  before do
11
- @agent = Mechanize.new
12
11
  @uri = uri
13
- @index_page = @agent.get(@uri)
14
12
  end
15
13
 
14
+
15
+ #############
16
+ # yaml2tree #
17
+ #############
18
+ describe '.yaml2tree' do
19
+ it "fail if empty yaml" do
20
+ expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
21
+ end
22
+
23
+ it "return text node" do
24
+ src = <<-EOB
25
+ text_content: "/html/body/p[1]"
26
+ EOB
27
+ generated = Yasuri.yaml2tree(src)
28
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
29
+
30
+ compare_generated_vs_original(generated, original, @uri)
31
+ end
32
+
33
+ it "return text node as symbol" do
34
+ src = <<-EOB
35
+ :text_content:
36
+ :path: "/html/body/p[1]"
37
+ EOB
38
+ generated = Yasuri.yaml2tree(src)
39
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
40
+
41
+ compare_generated_vs_original(generated, original, @uri)
42
+ end
43
+
44
+ it "return LinksNode/TextNode" do
45
+
46
+ src = <<-EOB
47
+ links_root:
48
+ path: "/html/body/a"
49
+ text_content: "/html/body/p"
50
+ EOB
51
+ generated = Yasuri.yaml2tree(src)
52
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
53
+ Yasuri::TextNode.new('/html/body/p', "content"),
54
+ ])
55
+
56
+ compare_generated_vs_original(generated, original, @uri)
57
+ end
58
+
59
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
60
+ src = <<-EOB
61
+ struct_tables:
62
+ path: "/html/body/table"
63
+ struct_table:
64
+ path: "./tr"
65
+ text_title: "./td[1]"
66
+ text_pub_date: "./td[2]"
67
+ EOB
68
+
69
+ generated = Yasuri.yaml2tree(src)
70
+ original = Yasuri::StructNode.new('/html/body/table', "tables", [
71
+ Yasuri::StructNode.new('./tr', "table", [
72
+ Yasuri::TextNode.new('./td[1]', "title"),
73
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
74
+ ])
75
+ ])
76
+ uri = @uri + "/struct/structual_text.html"
77
+ compare_generated_vs_original(generated, original, uri)
78
+ end
79
+
80
+ end # end of describe '.yaml2tree'
81
+
82
+
16
83
  #############
17
84
  # json2tree #
18
85
  #############
@@ -22,103 +89,107 @@ describe 'Yasuri' do
22
89
  end
23
90
 
24
91
  it "return TextNode" do
25
- src = %q| { "node" : "text",
26
- "name" : "content",
27
- "path" : "/html/body/p[1]"
28
- }|
92
+ src = %q|
93
+ {
94
+ "text_content": "/html/body/p[1]"
95
+ }|
29
96
  generated = Yasuri.json2tree(src)
30
97
  original = Yasuri::TextNode.new('/html/body/p[1]', "content")
31
98
 
32
- compare_generated_vs_original(generated, original, @index_page)
99
+ compare_generated_vs_original(generated, original, @uri)
33
100
  end
34
101
 
35
102
  it "return TextNode with truncate_regexp" do
36
- src = %q| { "node" : "text",
37
- "name" : "content",
38
- "path" : "/html/body/p[1]",
39
- "truncate" : "^[^,]+"
40
- }|
103
+ src = %q|
104
+ {
105
+ "text_content": {
106
+ "path": "/html/body/p[1]",
107
+ "truncate" : "^[^,]+"
108
+ }
109
+ }|
41
110
  generated = Yasuri.json2tree(src)
42
111
  original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
43
- compare_generated_vs_original(generated, original, @index_page)
112
+ compare_generated_vs_original(generated, original, @uri)
44
113
  end
45
114
 
115
+ it "return MapNode with TextNodes" do
116
+ src = %q|
117
+ {
118
+ "text_content01": "/html/body/p[1]",
119
+ "text_content02": "/html/body/p[2]"
120
+ }|
121
+ generated = Yasuri.json2tree(src)
122
+ original = Yasuri::MapNode.new('parent', [
123
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
124
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
125
+ ])
126
+ compare_generated_vs_original(generated, original, @uri)
127
+ end
46
128
 
47
129
  it "return LinksNode/TextNode" do
48
- src = %q| { "node" : "links",
49
- "name" : "root",
50
- "path" : "/html/body/a",
51
- "children" : [ { "node" : "text",
52
- "name" : "content",
53
- "path" : "/html/body/p"
54
- } ]
55
- }|
130
+ src = %q|
131
+ {
132
+ "links_root": {
133
+ "path": "/html/body/a",
134
+ "text_content": "/html/body/p"
135
+ }
136
+ }|
137
+
56
138
  generated = Yasuri.json2tree(src)
57
139
  original = Yasuri::LinksNode.new('/html/body/a', "root", [
58
140
  Yasuri::TextNode.new('/html/body/p', "content"),
59
141
  ])
60
142
 
61
- compare_generated_vs_original(generated, original, @index_page)
143
+ compare_generated_vs_original(generated, original, @uri)
62
144
  end
63
145
 
64
146
  it "return PaginateNode/TextNode" do
65
- src = %q|{ "node" : "pages",
66
- "name" : "root",
67
- "path" : "/html/body/nav/span/a[@class=\'next\']",
68
- "children" : [ { "node" : "text",
69
- "name" : "content",
70
- "path" : "/html/body/p"
71
- } ]
72
- }|
147
+ src = %q|
148
+ {
149
+ "pages_root": {
150
+ "path": "/html/body/nav/span/a[@class=\'next\']",
151
+ "text_content": "/html/body/p"
152
+ }
153
+ }|
73
154
  generated = Yasuri.json2tree(src)
74
155
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
75
156
  Yasuri::TextNode.new('/html/body/p', "content"),
76
157
  ])
77
158
 
78
- paginate_test_uri = @uri + "/pagination/page01.html"
79
- paginate_test_page = @agent.get(paginate_test_uri)
80
- compare_generated_vs_original(generated, original, paginate_test_page)
159
+ uri = @uri + "/pagination/page01.html"
160
+ compare_generated_vs_original(generated, original, uri)
81
161
  end
82
162
 
83
163
  it "return PaginateNode/TextNode with limit" do
84
- src = %q|{ "node" : "pages",
85
- "name" : "root",
86
- "path" : "/html/body/nav/span/a[@class=\'next\']",
87
- "limit" : 2,
88
- "children" : [ { "node" : "text",
89
- "name" : "content",
90
- "path" : "/html/body/p"
91
- } ]
92
- }|
164
+ src = %q|
165
+ {
166
+ "pages_root": {
167
+ "path": "/html/body/nav/span/a[@class=\'next\']",
168
+ "limit": 2,
169
+ "text_content": "/html/body/p"
170
+ }
171
+ }|
93
172
  generated = Yasuri.json2tree(src)
94
173
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
95
174
  Yasuri::TextNode.new('/html/body/p', "content"),
96
175
  ], limit:2)
97
176
 
98
- paginate_test_uri = @uri + "/pagination/page01.html"
99
- paginate_test_page = @agent.get(paginate_test_uri)
100
- compare_generated_vs_original(generated, original, paginate_test_page)
177
+ uri = @uri + "/pagination/page01.html"
178
+ compare_generated_vs_original(generated, original, uri)
101
179
  end
102
180
 
103
181
  it "return StructNode/StructNode/[TextNode,TextNode]" do
104
- src = %q| { "node" : "struct",
105
- "name" : "tables",
106
- "path" : "/html/body/table",
107
- "children" : [
108
- { "node" : "struct",
109
- "name" : "table",
110
- "path" : "./tr",
111
- "children" : [
112
- { "node" : "text",
113
- "name" : "title",
114
- "path" : "./td[1]"
115
- },
116
- { "node" : "text",
117
- "name" : "pub_date",
118
- "path" : "./td[2]"
119
- }]
120
- }]
121
- }|
182
+ src = %q|
183
+ {
184
+ "struct_tables": {
185
+ "path": "/html/body/table",
186
+ "struct_table": {
187
+ "path": "./tr",
188
+ "text_title": "./td[1]",
189
+ "text_pub_date": "./td[2]"
190
+ }
191
+ }
192
+ }|
122
193
  generated = Yasuri.json2tree(src)
123
194
  original = Yasuri::StructNode.new('/html/body/table', "tables", [
124
195
  Yasuri::StructNode.new('./tr', "table", [
@@ -126,27 +197,27 @@ describe 'Yasuri' do
126
197
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
198
  ])
128
199
  ])
129
- page = @agent.get(@uri + "/struct/structual_text.html")
130
- compare_generated_vs_original(generated, original, page)
200
+ uri = @uri + "/struct/structual_text.html"
201
+ compare_generated_vs_original(generated, original, uri)
131
202
  end
132
203
  end
133
204
 
205
+
134
206
  #############
135
207
  # tree2json #
136
208
  #############
137
209
  describe '.tree2json' do
138
210
  it "return empty json" do
139
- json = Yasuri.tree2json(nil)
140
- expect(json).to match "{}"
211
+ expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
141
212
  end
142
213
 
143
214
  it "return text node" do
144
215
  node = Yasuri::TextNode.new("/html/head/title", "title")
145
216
  json = Yasuri.tree2json(node)
146
- expected_str = %q| { "node": "text",
147
- "name": "title",
148
- "path": "/html/head/title"
149
- } |
217
+ expected_str = %q|
218
+ {
219
+ "text_title": "/html/head/title"
220
+ }|
150
221
  expected = JSON.parse(expected_str)
151
222
  actual = JSON.parse(json)
152
223
  expect(actual).to match expected
@@ -155,29 +226,49 @@ describe 'Yasuri' do
155
226
  it "return text node with truncate_regexp" do
156
227
  node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
157
228
  json = Yasuri.tree2json(node)
158
- expected_str = %q| { "node": "text",
159
- "name": "title",
160
- "path": "/html/head/title",
161
- "truncate": "^[^,]+"
162
- } |
229
+ expected_str = %q|
230
+ {
231
+ "text_title": {
232
+ "path": "/html/head/title",
233
+ "truncate": "^[^,]+"
234
+ }
235
+ }|
163
236
  expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
164
237
  actual = Yasuri.tree2json(Yasuri.json2tree(json))
165
238
  expect(actual).to match expected
166
239
  end
167
240
 
241
+ it "return map node with text nodes" do
242
+ tree = Yasuri::MapNode.new('parent', [
243
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
244
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
245
+ ])
246
+ actual_json = Yasuri.tree2json(tree)
247
+
248
+ expected_json = %q|
249
+ {
250
+ "text_content01": "/html/body/p[1]",
251
+ "text_content02": "/html/body/p[2]"
252
+ }|
253
+
254
+ expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
255
+ actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
256
+ expect(actual).to match expected
257
+ end
258
+
168
259
  it "return LinksNode/TextNode" do
169
260
  tree = Yasuri::LinksNode.new('/html/body/a', "root", [
170
261
  Yasuri::TextNode.new('/html/body/p', "content"),
171
262
  ])
172
263
  json = Yasuri.tree2json(tree)
173
- expected_src = %q| { "node" : "links",
174
- "name" : "root",
175
- "path" : "/html/body/a",
176
- "children" : [ { "node" : "text",
177
- "name" : "content",
178
- "path" : "/html/body/p"
179
- } ]
180
- }|
264
+
265
+ expected_src = %q|
266
+ {
267
+ "links_root": {
268
+ "path": "/html/body/a",
269
+ "text_content":"/html/body/p"
270
+ }
271
+ }|
181
272
  expected = JSON.parse(expected_src)
182
273
  actual = JSON.parse(json)
183
274
  expect(actual).to match expected
@@ -189,25 +280,84 @@ describe 'Yasuri' do
189
280
  ], limit:10)
190
281
 
191
282
  json = Yasuri.tree2json(tree)
192
- expected_src = %q| { "node" : "pages",
193
- "name" : "root",
194
- "path" : "/html/body/nav/span/a[@class='next']",
195
- "limit" : 10,
196
- "flatten" : false,
197
- "children" : [ { "node" : "text",
198
- "name" : "content",
199
- "path" : "/html/body/p"
200
- } ]
201
- }|
283
+ expected_src = %q|
284
+ {
285
+ "pages_root": {
286
+ "path": "/html/body/nav/span/a[@class='next']",
287
+ "limit": 10,
288
+ "flatten": false,
289
+ "text_content": "/html/body/p"
290
+ }
291
+ }|
202
292
  expected = JSON.parse(expected_src)
203
293
  actual = JSON.parse(json)
204
294
  expect(actual).to match expected
205
295
  end
296
+ end
297
+
298
+ describe '.with_retry' do
299
+ it "call once if success" do
300
+ actual = Yasuri.with_retry(0){ 42 }
301
+ expect(actual).to match 42
302
+ end
303
+
304
+ it "call untile success" do
305
+ i = [1,1,0,0]
306
+ actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
307
+ expect(actual).to match 42/1
308
+ end
309
+
310
+ it "raise error when exceed retry count" do
311
+ i = [1,0,0,0]
312
+ expect {
313
+ Yasuri.with_retry(2){42 / i.pop } # do this 3 times
314
+ }.to raise_error(Exception)
315
+ end
316
+
317
+ it "wait interval before run" do
318
+ allow(Kernel).to receive(:sleep)
319
+ Yasuri.with_retry(0){ 42 }
320
+ expect(Kernel).to have_received(:sleep).once
321
+ end
206
322
 
323
+ it "wait interval before run" do
324
+ allow(Kernel).to receive(:sleep)
325
+ Yasuri.with_retry(0){ 42 }
326
+ expect(Kernel).to have_received(:sleep).once
327
+ end
207
328
 
329
+ it "wait interval for each runs" do
330
+ allow(Kernel).to receive(:sleep)
208
331
 
332
+ i = [1,1,0,0]
333
+ Yasuri.with_retry(2){42 / i.pop } # 3 times in max
334
+ expect(Kernel).to have_received(:sleep).exactly(3).times
335
+ end
209
336
  end
210
337
 
338
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
339
+ tree = Yasuri::StructNode.new('/html/body/table', "tables", [
340
+ Yasuri::StructNode.new('./tr', "table", [
341
+ Yasuri::TextNode.new('./td[1]', "title"),
342
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
343
+ ])
344
+ ])
345
+ json = Yasuri.tree2json(tree)
346
+ expected_src = %q|
347
+ {
348
+ "struct_tables": {
349
+ "path": "/html/body/table",
350
+ "struct_table": {
351
+ "path": "./tr",
352
+ "text_title": "./td[1]",
353
+ "text_pub_date": "./td[2]"
354
+ }
355
+ }
356
+ }|
357
+ expected = JSON.parse(expected_src)
358
+ actual = JSON.parse(json)
359
+ expect(actual).to match expected
360
+ end
211
361
 
212
362
  it 'has a version number' do
213
363
  expect(Yasuri::VERSION).not_to be nil