yasuri 2.0.12 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ before do
7
+ @uri = uri
8
+ end
9
+
10
+ describe '::MapNode' do
11
+ it "multi scrape in singe page" do
12
+ map = Yasuri.map_sample do
13
+ text_title '/html/head/title'
14
+ text_body_p '/html/body/p[1]'
15
+ end
16
+ actual = map.scrape(@uri)
17
+
18
+ expected = {
19
+ "title" => "Yasuri Test",
20
+ "body_p" => "Hello,Yasuri"
21
+ }
22
+ expect(actual).to include expected
23
+ end
24
+
25
+ it "nested multi scrape in singe page" do
26
+ map = Yasuri.map_sample do
27
+ map_group1 { text_child01 '/html/body/a[1]' }
28
+ map_group2 do
29
+ text_child01 '/html/body/a[1]'
30
+ text_child03 '/html/body/a[3]'
31
+ end
32
+ end
33
+ actual = map.scrape(@uri)
34
+
35
+ expected = {
36
+ "group1" => {
37
+ "child01" => "child01"
38
+ },
39
+ "group2" => {
40
+ "child01" => "child01",
41
+ "child03" => "child03"
42
+ }
43
+ }
44
+ expect(actual).to include expected
45
+ end
46
+
47
+ it "scrape with links node" do
48
+ map = Yasuri.map_sample do
49
+ map_group1 do
50
+ links_a '/html/body/a' do
51
+ text_content '/html/body/p'
52
+ end
53
+ text_child01 '/html/body/a[1]'
54
+ end
55
+ map_group2 do
56
+ text_child03 '/html/body/a[3]'
57
+ end
58
+ end
59
+ actual = map.scrape(@uri)
60
+
61
+ expected = {
62
+ "group1" => {
63
+ "a" => [
64
+ {"content" => "Child 01 page."},
65
+ {"content" => "Child 02 page."},
66
+ {"content" => "Child 03 page."},
67
+ ],
68
+ "child01" => "child01"
69
+ },
70
+ "group2" => { "child03" => "child03" }
71
+ }
72
+ expect(actual).to include expected
73
+ end
74
+ end
75
+ end
@@ -11,16 +11,14 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::PaginateNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri + "/pagination/page01.html"
16
- @page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it "scrape each paginated pages" do
20
18
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
- actual = root_node.inject(@agent, @page)
21
+ actual = root_node.scrape(@uri)
24
22
  expected = [
25
23
  {"content" => "PaginationTest01"},
26
24
  {"content" => "PaginationTest02"},
@@ -37,7 +35,7 @@ describe 'Yasuri' do
37
35
  Yasuri::TextNode.new('./a', "text"),
38
36
  ]),
39
37
  ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
38
+ actual = root_node.scrape(@uri)
41
39
  expected = [
42
40
  "PaginationTest01",
43
41
  {"text"=>""},
@@ -77,7 +75,7 @@ describe 'Yasuri' do
77
75
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
76
  Yasuri::TextNode.new('/html/body/p', "content"),
79
77
  ], limit:3)
80
- actual = root_node.inject(@agent, @page)
78
+ actual = root_node.scrape(@uri)
81
79
  expected = [
82
80
  {"content" => "PaginationTest01"},
83
81
  {"content" => "PaginationTest02"},
@@ -91,7 +89,7 @@ describe 'Yasuri' do
91
89
  root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
92
90
  Yasuri::TextNode.new('/html/body/p', "content"),
93
91
  ])
94
- actual = root_node.inject(@agent, @page)
92
+ actual = root_node.scrape(@uri)
95
93
  expected = [ {"content" => "PaginationTest01"}, ]
96
94
  expect(actual).to match_array expected
97
95
  end
@@ -100,7 +98,7 @@ describe 'Yasuri' do
100
98
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
101
99
  Yasuri::TextNode.new('/html/body/hoge', "content"),
102
100
  ])
103
- actual = root_node.inject(@agent, @page)
101
+ actual = root_node.scrape(@uri)
104
102
  expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
105
103
  expect(actual).to match_array expected
106
104
  end
@@ -112,7 +110,7 @@ describe 'Yasuri' do
112
110
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
113
111
  Yasuri::TextNode.new('/html/body/p', "content"),
114
112
  ])
115
- compare_generated_vs_original(generated, original, @page)
113
+ compare_generated_vs_original(generated, original, @uri)
116
114
  end
117
115
 
118
116
  it 'can be defined by DSL, return single PaginateNode content limited' do
@@ -122,14 +120,14 @@ describe 'Yasuri' do
122
120
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
123
121
  Yasuri::TextNode.new('/html/body/p', "content"),
124
122
  ], limit: 2)
125
- compare_generated_vs_original(generated, original, @page)
123
+ compare_generated_vs_original(generated, original, @uri)
126
124
  end
127
125
 
128
126
  it "return child node as symbol" do
129
127
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
130
128
  Yasuri::TextNode.new('/html/body/p', "content"),
131
129
  ])
132
- actual = root_node.inject(@agent, @page, symbolize_names:true)
130
+ actual = root_node.scrape(@uri, symbolize_names:true)
133
131
  expected = [
134
132
  {:content => "PaginationTest01"},
135
133
  {:content => "PaginationTest02"},
@@ -138,5 +136,19 @@ describe 'Yasuri' do
138
136
  ]
139
137
  expect(actual).to match expected
140
138
  end
139
+
140
+ it "scrape with interval for each request" do
141
+ allow(Kernel).to receive(:sleep)
142
+
143
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
144
+ Yasuri::TextNode.new('/html/body/p', "content"),
145
+ ])
146
+ actual = root_node.scrape(@uri, interval_ms: 1000)
147
+ expect(actual.size).to match 4
148
+
149
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
150
+ expect(interval_sec).to match 1.0
151
+ end
152
+ end
141
153
  end
142
154
  end
data/spec/yasuri_spec.rb CHANGED
@@ -8,11 +8,78 @@ describe 'Yasuri' do
8
8
  include_context 'httpserver'
9
9
 
10
10
  before do
11
- @agent = Mechanize.new
12
11
  @uri = uri
13
- @index_page = @agent.get(@uri)
14
12
  end
15
13
 
14
+
15
+ #############
16
+ # yaml2tree #
17
+ #############
18
+ describe '.yaml2tree' do
19
+ it "fail if empty yaml" do
20
+ expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
21
+ end
22
+
23
+ it "return text node" do
24
+ src = <<-EOB
25
+ text_content: "/html/body/p[1]"
26
+ EOB
27
+ generated = Yasuri.yaml2tree(src)
28
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
29
+
30
+ compare_generated_vs_original(generated, original, @uri)
31
+ end
32
+
33
+ it "return text node as symbol" do
34
+ src = <<-EOB
35
+ :text_content:
36
+ :path: "/html/body/p[1]"
37
+ EOB
38
+ generated = Yasuri.yaml2tree(src)
39
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
40
+
41
+ compare_generated_vs_original(generated, original, @uri)
42
+ end
43
+
44
+ it "return LinksNode/TextNode" do
45
+
46
+ src = <<-EOB
47
+ links_root:
48
+ path: "/html/body/a"
49
+ text_content: "/html/body/p"
50
+ EOB
51
+ generated = Yasuri.yaml2tree(src)
52
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
53
+ Yasuri::TextNode.new('/html/body/p', "content"),
54
+ ])
55
+
56
+ compare_generated_vs_original(generated, original, @uri)
57
+ end
58
+
59
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
60
+ src = <<-EOB
61
+ struct_tables:
62
+ path: "/html/body/table"
63
+ struct_table:
64
+ path: "./tr"
65
+ text_title: "./td[1]"
66
+ text_pub_date: "./td[2]"
67
+ EOB
68
+
69
+ generated = Yasuri.yaml2tree(src)
70
+ original = Yasuri::StructNode.new('/html/body/table', "tables", [
71
+ Yasuri::StructNode.new('./tr', "table", [
72
+ Yasuri::TextNode.new('./td[1]', "title"),
73
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
74
+ ])
75
+ ])
76
+ uri = @uri + "/struct/structual_text.html"
77
+ compare_generated_vs_original(generated, original, uri)
78
+ end
79
+
80
+ end # end of describe '.yaml2tree'
81
+
82
+
16
83
  #############
17
84
  # json2tree #
18
85
  #############
@@ -22,103 +89,107 @@ describe 'Yasuri' do
22
89
  end
23
90
 
24
91
  it "return TextNode" do
25
- src = %q| { "node" : "text",
26
- "name" : "content",
27
- "path" : "/html/body/p[1]"
28
- }|
92
+ src = %q|
93
+ {
94
+ "text_content": "/html/body/p[1]"
95
+ }|
29
96
  generated = Yasuri.json2tree(src)
30
97
  original = Yasuri::TextNode.new('/html/body/p[1]', "content")
31
98
 
32
- compare_generated_vs_original(generated, original, @index_page)
99
+ compare_generated_vs_original(generated, original, @uri)
33
100
  end
34
101
 
35
102
  it "return TextNode with truncate_regexp" do
36
- src = %q| { "node" : "text",
37
- "name" : "content",
38
- "path" : "/html/body/p[1]",
39
- "truncate" : "^[^,]+"
40
- }|
103
+ src = %q|
104
+ {
105
+ "text_content": {
106
+ "path": "/html/body/p[1]",
107
+ "truncate" : "^[^,]+"
108
+ }
109
+ }|
41
110
  generated = Yasuri.json2tree(src)
42
111
  original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
43
- compare_generated_vs_original(generated, original, @index_page)
112
+ compare_generated_vs_original(generated, original, @uri)
44
113
  end
45
114
 
115
+ it "return MapNode with TextNodes" do
116
+ src = %q|
117
+ {
118
+ "text_content01": "/html/body/p[1]",
119
+ "text_content02": "/html/body/p[2]"
120
+ }|
121
+ generated = Yasuri.json2tree(src)
122
+ original = Yasuri::MapNode.new('parent', [
123
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
124
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
125
+ ])
126
+ compare_generated_vs_original(generated, original, @uri)
127
+ end
46
128
 
47
129
  it "return LinksNode/TextNode" do
48
- src = %q| { "node" : "links",
49
- "name" : "root",
50
- "path" : "/html/body/a",
51
- "children" : [ { "node" : "text",
52
- "name" : "content",
53
- "path" : "/html/body/p"
54
- } ]
55
- }|
130
+ src = %q|
131
+ {
132
+ "links_root": {
133
+ "path": "/html/body/a",
134
+ "text_content": "/html/body/p"
135
+ }
136
+ }|
137
+
56
138
  generated = Yasuri.json2tree(src)
57
139
  original = Yasuri::LinksNode.new('/html/body/a', "root", [
58
140
  Yasuri::TextNode.new('/html/body/p', "content"),
59
141
  ])
60
142
 
61
- compare_generated_vs_original(generated, original, @index_page)
143
+ compare_generated_vs_original(generated, original, @uri)
62
144
  end
63
145
 
64
146
  it "return PaginateNode/TextNode" do
65
- src = %q|{ "node" : "pages",
66
- "name" : "root",
67
- "path" : "/html/body/nav/span/a[@class=\'next\']",
68
- "children" : [ { "node" : "text",
69
- "name" : "content",
70
- "path" : "/html/body/p"
71
- } ]
72
- }|
147
+ src = %q|
148
+ {
149
+ "pages_root": {
150
+ "path": "/html/body/nav/span/a[@class=\'next\']",
151
+ "text_content": "/html/body/p"
152
+ }
153
+ }|
73
154
  generated = Yasuri.json2tree(src)
74
155
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
75
156
  Yasuri::TextNode.new('/html/body/p', "content"),
76
157
  ])
77
158
 
78
- paginate_test_uri = @uri + "/pagination/page01.html"
79
- paginate_test_page = @agent.get(paginate_test_uri)
80
- compare_generated_vs_original(generated, original, paginate_test_page)
159
+ uri = @uri + "/pagination/page01.html"
160
+ compare_generated_vs_original(generated, original, uri)
81
161
  end
82
162
 
83
163
  it "return PaginateNode/TextNode with limit" do
84
- src = %q|{ "node" : "pages",
85
- "name" : "root",
86
- "path" : "/html/body/nav/span/a[@class=\'next\']",
87
- "limit" : 2,
88
- "children" : [ { "node" : "text",
89
- "name" : "content",
90
- "path" : "/html/body/p"
91
- } ]
92
- }|
164
+ src = %q|
165
+ {
166
+ "pages_root": {
167
+ "path": "/html/body/nav/span/a[@class=\'next\']",
168
+ "limit": 2,
169
+ "text_content": "/html/body/p"
170
+ }
171
+ }|
93
172
  generated = Yasuri.json2tree(src)
94
173
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
95
174
  Yasuri::TextNode.new('/html/body/p', "content"),
96
175
  ], limit:2)
97
176
 
98
- paginate_test_uri = @uri + "/pagination/page01.html"
99
- paginate_test_page = @agent.get(paginate_test_uri)
100
- compare_generated_vs_original(generated, original, paginate_test_page)
177
+ uri = @uri + "/pagination/page01.html"
178
+ compare_generated_vs_original(generated, original, uri)
101
179
  end
102
180
 
103
181
  it "return StructNode/StructNode/[TextNode,TextNode]" do
104
- src = %q| { "node" : "struct",
105
- "name" : "tables",
106
- "path" : "/html/body/table",
107
- "children" : [
108
- { "node" : "struct",
109
- "name" : "table",
110
- "path" : "./tr",
111
- "children" : [
112
- { "node" : "text",
113
- "name" : "title",
114
- "path" : "./td[1]"
115
- },
116
- { "node" : "text",
117
- "name" : "pub_date",
118
- "path" : "./td[2]"
119
- }]
120
- }]
121
- }|
182
+ src = %q|
183
+ {
184
+ "struct_tables": {
185
+ "path": "/html/body/table",
186
+ "struct_table": {
187
+ "path": "./tr",
188
+ "text_title": "./td[1]",
189
+ "text_pub_date": "./td[2]"
190
+ }
191
+ }
192
+ }|
122
193
  generated = Yasuri.json2tree(src)
123
194
  original = Yasuri::StructNode.new('/html/body/table', "tables", [
124
195
  Yasuri::StructNode.new('./tr', "table", [
@@ -126,27 +197,27 @@ describe 'Yasuri' do
126
197
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
198
  ])
128
199
  ])
129
- page = @agent.get(@uri + "/struct/structual_text.html")
130
- compare_generated_vs_original(generated, original, page)
200
+ uri = @uri + "/struct/structual_text.html"
201
+ compare_generated_vs_original(generated, original, uri)
131
202
  end
132
203
  end
133
204
 
205
+
134
206
  #############
135
207
  # tree2json #
136
208
  #############
137
209
  describe '.tree2json' do
138
210
  it "return empty json" do
139
- json = Yasuri.tree2json(nil)
140
- expect(json).to match "{}"
211
+ expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
141
212
  end
142
213
 
143
214
  it "return text node" do
144
215
  node = Yasuri::TextNode.new("/html/head/title", "title")
145
216
  json = Yasuri.tree2json(node)
146
- expected_str = %q| { "node": "text",
147
- "name": "title",
148
- "path": "/html/head/title"
149
- } |
217
+ expected_str = %q|
218
+ {
219
+ "text_title": "/html/head/title"
220
+ }|
150
221
  expected = JSON.parse(expected_str)
151
222
  actual = JSON.parse(json)
152
223
  expect(actual).to match expected
@@ -155,29 +226,49 @@ describe 'Yasuri' do
155
226
  it "return text node with truncate_regexp" do
156
227
  node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
157
228
  json = Yasuri.tree2json(node)
158
- expected_str = %q| { "node": "text",
159
- "name": "title",
160
- "path": "/html/head/title",
161
- "truncate": "^[^,]+"
162
- } |
229
+ expected_str = %q|
230
+ {
231
+ "text_title": {
232
+ "path": "/html/head/title",
233
+ "truncate": "^[^,]+"
234
+ }
235
+ }|
163
236
  expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
164
237
  actual = Yasuri.tree2json(Yasuri.json2tree(json))
165
238
  expect(actual).to match expected
166
239
  end
167
240
 
241
+ it "return map node with text nodes" do
242
+ tree = Yasuri::MapNode.new('parent', [
243
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
244
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
245
+ ])
246
+ actual_json = Yasuri.tree2json(tree)
247
+
248
+ expected_json = %q|
249
+ {
250
+ "text_content01": "/html/body/p[1]",
251
+ "text_content02": "/html/body/p[2]"
252
+ }|
253
+
254
+ expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
255
+ actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
256
+ expect(actual).to match expected
257
+ end
258
+
168
259
  it "return LinksNode/TextNode" do
169
260
  tree = Yasuri::LinksNode.new('/html/body/a', "root", [
170
261
  Yasuri::TextNode.new('/html/body/p', "content"),
171
262
  ])
172
263
  json = Yasuri.tree2json(tree)
173
- expected_src = %q| { "node" : "links",
174
- "name" : "root",
175
- "path" : "/html/body/a",
176
- "children" : [ { "node" : "text",
177
- "name" : "content",
178
- "path" : "/html/body/p"
179
- } ]
180
- }|
264
+
265
+ expected_src = %q|
266
+ {
267
+ "links_root": {
268
+ "path": "/html/body/a",
269
+ "text_content":"/html/body/p"
270
+ }
271
+ }|
181
272
  expected = JSON.parse(expected_src)
182
273
  actual = JSON.parse(json)
183
274
  expect(actual).to match expected
@@ -189,25 +280,84 @@ describe 'Yasuri' do
189
280
  ], limit:10)
190
281
 
191
282
  json = Yasuri.tree2json(tree)
192
- expected_src = %q| { "node" : "pages",
193
- "name" : "root",
194
- "path" : "/html/body/nav/span/a[@class='next']",
195
- "limit" : 10,
196
- "flatten" : false,
197
- "children" : [ { "node" : "text",
198
- "name" : "content",
199
- "path" : "/html/body/p"
200
- } ]
201
- }|
283
+ expected_src = %q|
284
+ {
285
+ "pages_root": {
286
+ "path": "/html/body/nav/span/a[@class='next']",
287
+ "limit": 10,
288
+ "flatten": false,
289
+ "text_content": "/html/body/p"
290
+ }
291
+ }|
202
292
  expected = JSON.parse(expected_src)
203
293
  actual = JSON.parse(json)
204
294
  expect(actual).to match expected
205
295
  end
296
+ end
297
+
298
+ describe '.with_retry' do
299
+ it "call once if success" do
300
+ actual = Yasuri.with_retry(0){ 42 }
301
+ expect(actual).to match 42
302
+ end
303
+
304
+ it "call untile success" do
305
+ i = [1,1,0,0]
306
+ actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
307
+ expect(actual).to match 42/1
308
+ end
309
+
310
+ it "raise error when exceed retry count" do
311
+ i = [1,0,0,0]
312
+ expect {
313
+ Yasuri.with_retry(2){42 / i.pop } # do this 3 times
314
+ }.to raise_error(Exception)
315
+ end
316
+
317
+ it "wait interval before run" do
318
+ allow(Kernel).to receive(:sleep)
319
+ Yasuri.with_retry(0){ 42 }
320
+ expect(Kernel).to have_received(:sleep).once
321
+ end
206
322
 
323
+ it "wait interval before run" do
324
+ allow(Kernel).to receive(:sleep)
325
+ Yasuri.with_retry(0){ 42 }
326
+ expect(Kernel).to have_received(:sleep).once
327
+ end
207
328
 
329
+ it "wait interval for each runs" do
330
+ allow(Kernel).to receive(:sleep)
208
331
 
332
+ i = [1,1,0,0]
333
+ Yasuri.with_retry(2){42 / i.pop } # 3 times in max
334
+ expect(Kernel).to have_received(:sleep).exactly(3).times
335
+ end
209
336
  end
210
337
 
338
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
339
+ tree = Yasuri::StructNode.new('/html/body/table', "tables", [
340
+ Yasuri::StructNode.new('./tr', "table", [
341
+ Yasuri::TextNode.new('./td[1]', "title"),
342
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
343
+ ])
344
+ ])
345
+ json = Yasuri.tree2json(tree)
346
+ expected_src = %q|
347
+ {
348
+ "struct_tables": {
349
+ "path": "/html/body/table",
350
+ "struct_table": {
351
+ "path": "./tr",
352
+ "text_title": "./td[1]",
353
+ "text_pub_date": "./td[2]"
354
+ }
355
+ }
356
+ }|
357
+ expected = JSON.parse(expected_src)
358
+ actual = JSON.parse(json)
359
+ expect(actual).to match expected
360
+ end
211
361
 
212
362
  it 'has a version number' do
213
363
  expect(Yasuri::VERSION).not_to be nil