yasuri 3.2.0 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/spec/yasuri_spec.rb CHANGED
@@ -8,9 +8,7 @@ describe 'Yasuri' do
8
8
  include_context 'httpserver'
9
9
 
10
10
  before do
11
- @agent = Mechanize.new
12
11
  @uri = uri
13
- @index_page = @agent.get(@uri)
14
12
  end
15
13
 
16
14
 
@@ -29,7 +27,7 @@ describe 'Yasuri' do
29
27
  generated = Yasuri.yaml2tree(src)
30
28
  original = Yasuri::TextNode.new('/html/body/p[1]', "content")
31
29
 
32
- compare_generated_vs_original(generated, original, @index_page)
30
+ compare_generated_vs_original(generated, original, @uri)
33
31
  end
34
32
 
35
33
  it "return text node as symbol" do
@@ -40,7 +38,7 @@ describe 'Yasuri' do
40
38
  generated = Yasuri.yaml2tree(src)
41
39
  original = Yasuri::TextNode.new('/html/body/p[1]', "content")
42
40
 
43
- compare_generated_vs_original(generated, original, @index_page)
41
+ compare_generated_vs_original(generated, original, @uri)
44
42
  end
45
43
 
46
44
  it "return LinksNode/TextNode" do
@@ -55,7 +53,7 @@ describe 'Yasuri' do
55
53
  Yasuri::TextNode.new('/html/body/p', "content"),
56
54
  ])
57
55
 
58
- compare_generated_vs_original(generated, original, @index_page)
56
+ compare_generated_vs_original(generated, original, @uri)
59
57
  end
60
58
 
61
59
  it "return StructNode/StructNode/[TextNode,TextNode]" do
@@ -75,8 +73,8 @@ describe 'Yasuri' do
75
73
  Yasuri::TextNode.new('./td[2]', "pub_date"),
76
74
  ])
77
75
  ])
78
- page = @agent.get(@uri + "/struct/structual_text.html")
79
- compare_generated_vs_original(generated, original, page)
76
+ uri = @uri + "/struct/structual_text.html"
77
+ compare_generated_vs_original(generated, original, uri)
80
78
  end
81
79
 
82
80
  end # end of describe '.yaml2tree'
@@ -98,7 +96,7 @@ describe 'Yasuri' do
98
96
  generated = Yasuri.json2tree(src)
99
97
  original = Yasuri::TextNode.new('/html/body/p[1]', "content")
100
98
 
101
- compare_generated_vs_original(generated, original, @index_page)
99
+ compare_generated_vs_original(generated, original, @uri)
102
100
  end
103
101
 
104
102
  it "return TextNode with truncate_regexp" do
@@ -111,7 +109,7 @@ describe 'Yasuri' do
111
109
  }|
112
110
  generated = Yasuri.json2tree(src)
113
111
  original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
114
- compare_generated_vs_original(generated, original, @index_page)
112
+ compare_generated_vs_original(generated, original, @uri)
115
113
  end
116
114
 
117
115
  it "return MapNode with TextNodes" do
@@ -125,7 +123,7 @@ describe 'Yasuri' do
125
123
  Yasuri::TextNode.new('/html/body/p[1]', "content01"),
126
124
  Yasuri::TextNode.new('/html/body/p[2]', "content02"),
127
125
  ])
128
- compare_generated_vs_original(generated, original, @index_page)
126
+ compare_generated_vs_original(generated, original, @uri)
129
127
  end
130
128
 
131
129
  it "return LinksNode/TextNode" do
@@ -142,7 +140,7 @@ describe 'Yasuri' do
142
140
  Yasuri::TextNode.new('/html/body/p', "content"),
143
141
  ])
144
142
 
145
- compare_generated_vs_original(generated, original, @index_page)
143
+ compare_generated_vs_original(generated, original, @uri)
146
144
  end
147
145
 
148
146
  it "return PaginateNode/TextNode" do
@@ -158,9 +156,8 @@ describe 'Yasuri' do
158
156
  Yasuri::TextNode.new('/html/body/p', "content"),
159
157
  ])
160
158
 
161
- paginate_test_uri = @uri + "/pagination/page01.html"
162
- paginate_test_page = @agent.get(paginate_test_uri)
163
- compare_generated_vs_original(generated, original, paginate_test_page)
159
+ uri = @uri + "/pagination/page01.html"
160
+ compare_generated_vs_original(generated, original, uri)
164
161
  end
165
162
 
166
163
  it "return PaginateNode/TextNode with limit" do
@@ -177,9 +174,8 @@ describe 'Yasuri' do
177
174
  Yasuri::TextNode.new('/html/body/p', "content"),
178
175
  ], limit:2)
179
176
 
180
- paginate_test_uri = @uri + "/pagination/page01.html"
181
- paginate_test_page = @agent.get(paginate_test_uri)
182
- compare_generated_vs_original(generated, original, paginate_test_page)
177
+ uri = @uri + "/pagination/page01.html"
178
+ compare_generated_vs_original(generated, original, uri)
183
179
  end
184
180
 
185
181
  it "return StructNode/StructNode/[TextNode,TextNode]" do
@@ -201,8 +197,8 @@ describe 'Yasuri' do
201
197
  Yasuri::TextNode.new('./td[2]', "pub_date"),
202
198
  ])
203
199
  ])
204
- page = @agent.get(@uri + "/struct/structual_text.html")
205
- compare_generated_vs_original(generated, original, page)
200
+ uri = @uri + "/struct/structual_text.html"
201
+ compare_generated_vs_original(generated, original, uri)
206
202
  end
207
203
  end
208
204
 
@@ -299,6 +295,46 @@ describe 'Yasuri' do
299
295
  end
300
296
  end
301
297
 
298
+ describe '.with_retry' do
299
+ it "call once if success" do
300
+ actual = Yasuri.with_retry(0){ 42 }
301
+ expect(actual).to match 42
302
+ end
303
+
304
+ it "call untile success" do
305
+ i = [1,1,0,0]
306
+ actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
307
+ expect(actual).to match 42/1
308
+ end
309
+
310
+ it "raise error when exceed retry count" do
311
+ i = [1,0,0,0]
312
+ expect {
313
+ Yasuri.with_retry(2){42 / i.pop } # do this 3 times
314
+ }.to raise_error(Exception)
315
+ end
316
+
317
+ it "wait interval before run" do
318
+ allow(Kernel).to receive(:sleep)
319
+ Yasuri.with_retry(0){ 42 }
320
+ expect(Kernel).to have_received(:sleep).once
321
+ end
322
+
323
+ it "wait interval before run" do
324
+ allow(Kernel).to receive(:sleep)
325
+ Yasuri.with_retry(0){ 42 }
326
+ expect(Kernel).to have_received(:sleep).once
327
+ end
328
+
329
+ it "wait interval for each runs" do
330
+ allow(Kernel).to receive(:sleep)
331
+
332
+ i = [1,1,0,0]
333
+ Yasuri.with_retry(2){42 / i.pop } # 3 times in max
334
+ expect(Kernel).to have_received(:sleep).exactly(3).times
335
+ end
336
+ end
337
+
302
338
  it "return StructNode/StructNode/[TextNode,TextNode]" do
303
339
  tree = Yasuri::StructNode.new('/html/body/table', "tables", [
304
340
  Yasuri::StructNode.new('./tr', "table", [
@@ -11,8 +11,7 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::StructNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
14
+ @uri = uri + "/struct/structual_text.html"
16
15
 
17
16
  @table_1996 = [
18
17
  { "title" => "The Perfect Insider",
@@ -53,7 +52,7 @@ describe 'Yasuri' do
53
52
  Yasuri::TextNode.new('./td[2]', "pub_date"),
54
53
  ])
55
54
  expected = @table_1996
56
- actual = node.inject(@agent, @page)
55
+ actual = node.scrape(@uri)
57
56
  expect(actual).to match expected
58
57
  end
59
58
 
@@ -63,7 +62,7 @@ describe 'Yasuri' do
63
62
  Yasuri::TextNode.new('./td[2]', "pub_date"),
64
63
  ])
65
64
  expected = @table_1996.first
66
- actual = node.inject(@agent, @page)
65
+ actual = node.scrape(@uri)
67
66
  expect(actual).to match expected
68
67
  end
69
68
 
@@ -72,7 +71,7 @@ describe 'Yasuri' do
72
71
  node = Yasuri::StructNode.new(no_match_xpath, "table", [
73
72
  Yasuri::TextNode.new('./td[1]', "title")
74
73
  ])
75
- actual = node.inject(@agent, @page)
74
+ actual = node.scrape(@uri)
76
75
  expect(actual).to be_empty
77
76
  end
78
77
 
@@ -81,7 +80,7 @@ describe 'Yasuri' do
81
80
  node = Yasuri::StructNode.new(invalid_xpath, "table", [
82
81
  Yasuri::TextNode.new('./td[1]', "title")
83
82
  ])
84
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
83
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
85
84
  end
86
85
 
87
86
  it 'fail with invalid xpath in children' do
@@ -90,7 +89,7 @@ describe 'Yasuri' do
90
89
  Yasuri::TextNode.new(invalid_xpath, "title"),
91
90
  Yasuri::TextNode.new('./td[2]', "pub_date"),
92
91
  ])
93
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
92
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
94
93
  end
95
94
 
96
95
  it 'scrape all tables' do
@@ -101,7 +100,7 @@ describe 'Yasuri' do
101
100
  ])
102
101
  ])
103
102
  expected = @all_tables
104
- actual = node.inject(@agent, @page)
103
+ actual = node.scrape(@uri)
105
104
  expect(actual).to match expected
106
105
  end
107
106
 
@@ -118,7 +117,7 @@ describe 'Yasuri' do
118
117
  Yasuri::TextNode.new('./td[2]', "pub_date"),
119
118
  ])
120
119
  ])
121
- compare_generated_vs_original(generated, original, @page)
120
+ compare_generated_vs_original(generated, original, @uri)
122
121
  end
123
122
 
124
123
  it 'return child node as symbol' do
@@ -127,7 +126,7 @@ describe 'Yasuri' do
127
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
128
127
  ])
129
128
  expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
- actual = node.inject(@agent, @page, symbolize_names:true)
129
+ actual = node.scrape(@uri, symbolize_names:true)
131
130
  expect(actual).to match expected
132
131
  end
133
132
 
@@ -135,9 +134,7 @@ describe 'Yasuri' do
135
134
 
136
135
  describe '::StructNode::Links' do
137
136
  before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
137
+ @uri = uri + "/struct/structual_links.html"
141
138
  @table = [
142
139
  { "title" => "Child01,02",
143
140
  "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
@@ -155,22 +152,21 @@ describe 'Yasuri' do
155
152
  ])
156
153
  ])
157
154
  expected = @table
158
- actual = node.inject(@agent, @page)
155
+ actual = node.scrape(@uri)
159
156
  expect(actual).to match expected
160
157
  end
161
158
  end # descrive
162
159
 
163
160
  describe '::StructNode::Pages' do
164
161
  before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
162
+ @uri = uri + "/struct/structual_text.html"
167
163
  end
168
164
 
169
165
  it 'not supported' do
170
166
  node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
167
  Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
168
  ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
169
+ expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
170
  end
175
171
  end
176
172
  end
@@ -10,69 +10,68 @@ describe 'Yasuri' do
10
10
  include_context 'httpserver'
11
11
 
12
12
  before do
13
- @agent = Mechanize.new
14
- @index_page = @agent.get(uri)
13
+ @uri = uri
15
14
  end
16
15
 
17
16
  describe '::TextNode' do
18
17
  before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
19
18
 
20
19
  it 'scrape text text <p>Hello,Yasuri</p>' do
21
- actual = @node.inject(@agent, @index_page)
20
+ actual = @node.scrape(@uri)
22
21
  expect(actual).to eq "Hello,Yasuri"
23
22
  end
24
23
 
25
24
  it 'return empty text if no match node' do
26
25
  no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
27
- actual = no_match_node.inject(@agent, @index_page)
26
+ actual = no_match_node.scrape(@uri)
28
27
  expect(actual).to be_empty
29
28
  end
30
29
 
31
30
  it 'fail with invalid xpath' do
32
31
  invalid_xpath = '/html/body/no_match_node['
33
32
  node = Yasuri::TextNode.new(invalid_xpath, "title")
34
- expect { node.inject(@agent, @index_page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
33
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
35
34
  end
36
35
 
37
36
  it "can be defined by DSL, return single TextNode title" do
38
37
  generated = Yasuri.text_title '/html/body/p[1]'
39
38
  original = Yasuri::TextNode.new('/html/body/p[1]', "title")
40
- compare_generated_vs_original(generated, original, @index_page)
39
+ compare_generated_vs_original(generated, original, @uri)
41
40
  end
42
41
 
43
42
  it "can be truncated with regexp" do
44
43
  node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
45
- actual = node.inject(@agent, @index_page)
44
+ actual = node.scrape(@uri)
46
45
  expect(actual).to eq "Hello"
47
46
  end
48
47
 
49
48
  it "return first captured if matched given capture pattern" do
50
49
  node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
51
- actual = node.inject(@agent, @index_page)
50
+ actual = node.scrape(@uri)
52
51
  expect(actual).to eq "ello,Yasur"
53
52
  end
54
53
 
55
54
  it "can be truncated with regexp" do
56
55
  node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
57
- actual = node.inject(@agent, @index_page)
56
+ actual = node.scrape(@uri)
58
57
  expect(actual).to eq "Yasuri"
59
58
  end
60
59
 
61
60
  it "return empty string if truncated with no match to regexp" do
62
61
  node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
63
- actual = node.inject(@agent, @index_page)
62
+ actual = node.scrape(@uri)
64
63
  expect(actual).to be_empty
65
64
  end
66
65
 
67
66
  it "return symbol method applied string" do
68
67
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
69
- actual = node.inject(@agent, @index_page)
68
+ actual = node.scrape(@uri)
70
69
  expect(actual).to eq "HELLO,YASURI"
71
70
  end
72
71
 
73
72
  it "return apply multi arguments" do
74
73
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
75
- actual = node.inject(@agent, @index_page)
74
+ actual = node.scrape(@uri)
76
75
  expect(actual).to eq "ELLO,YASUR"
77
76
  end
78
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-03-25 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -170,7 +170,10 @@ files:
170
170
  - Rakefile
171
171
  - USAGE.ja.md
172
172
  - USAGE.md
173
- - app.rb
173
+ - examples/example.rb
174
+ - examples/github.yml
175
+ - examples/sample.json
176
+ - examples/sample.yml
174
177
  - exe/yasuri
175
178
  - lib/yasuri.rb
176
179
  - lib/yasuri/version.rb
data/app.rb DELETED
@@ -1,52 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- # Author:: TAC (tac@tac42.net)
5
-
6
- require 'pp'
7
- require 'time'
8
- require 'mechanize'
9
-
10
- require_relative 'lib/yasuri/yasuri'
11
-
12
- agent = Mechanize.new
13
-
14
- uri = "http://www.asahi.com/"
15
-
16
- # Node tree constructing by DSL
17
- root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
18
- text_title '//*[@id="MainInner"]/div[1]/div/h1'
19
- text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
20
- end
21
-
22
- # Node tree constructing by JSON
23
- src = <<-EOJSON
24
- { "node" : "links",
25
- "name" : "root",
26
- "path" : "//*[@id='MainInner']/div[1]/ul/li/a",
27
- "children" : [
28
- { "node" : "text",
29
- "name" : "title",
30
- "path" : "//*[@id='MainInner']/div[1]/div/h1"
31
- },
32
- { "node" : "text",
33
- "name" : "article",
34
- "path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
35
- }
36
- ]
37
- }
38
- EOJSON
39
- root = Yasuri.json2tree(src)
40
-
41
- # Access to parsed resources
42
- page = agent.get(uri)
43
- contents = root.inject(agent, page)
44
-
45
- contents.each do |h|
46
- t = h['title']
47
- a = h['article']
48
-
49
- puts t
50
- puts a
51
- puts "=" * 100
52
- end