yasuri 3.2.0 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -7
- data/USAGE.ja.md +107 -86
- data/USAGE.md +106 -87
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +6 -2
- data/lib/yasuri/yasuri_cli.rb +6 -6
- data/lib/yasuri/yasuri_links_node.rb +3 -1
- data/lib/yasuri/yasuri_map_node.rb +1 -0
- data/lib/yasuri/yasuri_node.rb +14 -0
- data/lib/yasuri/yasuri_paginate_node.rb +2 -1
- data/spec/spec_helper.rb +3 -3
- data/spec/yasuri_cli_spec.rb +17 -4
- data/spec/yasuri_links_node_spec.rb +24 -10
- data/spec/yasuri_map_spec.rb +4 -5
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +55 -19
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- metadata +6 -3
- data/app.rb +0 -52
data/spec/yasuri_spec.rb
CHANGED
@@ -8,9 +8,7 @@ describe 'Yasuri' do
|
|
8
8
|
include_context 'httpserver'
|
9
9
|
|
10
10
|
before do
|
11
|
-
@agent = Mechanize.new
|
12
11
|
@uri = uri
|
13
|
-
@index_page = @agent.get(@uri)
|
14
12
|
end
|
15
13
|
|
16
14
|
|
@@ -29,7 +27,7 @@ describe 'Yasuri' do
|
|
29
27
|
generated = Yasuri.yaml2tree(src)
|
30
28
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
29
|
|
32
|
-
compare_generated_vs_original(generated, original, @
|
30
|
+
compare_generated_vs_original(generated, original, @uri)
|
33
31
|
end
|
34
32
|
|
35
33
|
it "return text node as symbol" do
|
@@ -40,7 +38,7 @@ describe 'Yasuri' do
|
|
40
38
|
generated = Yasuri.yaml2tree(src)
|
41
39
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
42
40
|
|
43
|
-
compare_generated_vs_original(generated, original, @
|
41
|
+
compare_generated_vs_original(generated, original, @uri)
|
44
42
|
end
|
45
43
|
|
46
44
|
it "return LinksNode/TextNode" do
|
@@ -55,7 +53,7 @@ describe 'Yasuri' do
|
|
55
53
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
56
54
|
])
|
57
55
|
|
58
|
-
compare_generated_vs_original(generated, original, @
|
56
|
+
compare_generated_vs_original(generated, original, @uri)
|
59
57
|
end
|
60
58
|
|
61
59
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -75,8 +73,8 @@ describe 'Yasuri' do
|
|
75
73
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
76
74
|
])
|
77
75
|
])
|
78
|
-
|
79
|
-
compare_generated_vs_original(generated, original,
|
76
|
+
uri = @uri + "/struct/structual_text.html"
|
77
|
+
compare_generated_vs_original(generated, original, uri)
|
80
78
|
end
|
81
79
|
|
82
80
|
end # end of describe '.yaml2tree'
|
@@ -98,7 +96,7 @@ describe 'Yasuri' do
|
|
98
96
|
generated = Yasuri.json2tree(src)
|
99
97
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
100
98
|
|
101
|
-
compare_generated_vs_original(generated, original, @
|
99
|
+
compare_generated_vs_original(generated, original, @uri)
|
102
100
|
end
|
103
101
|
|
104
102
|
it "return TextNode with truncate_regexp" do
|
@@ -111,7 +109,7 @@ describe 'Yasuri' do
|
|
111
109
|
}|
|
112
110
|
generated = Yasuri.json2tree(src)
|
113
111
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
114
|
-
compare_generated_vs_original(generated, original, @
|
112
|
+
compare_generated_vs_original(generated, original, @uri)
|
115
113
|
end
|
116
114
|
|
117
115
|
it "return MapNode with TextNodes" do
|
@@ -125,7 +123,7 @@ describe 'Yasuri' do
|
|
125
123
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
126
124
|
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
127
125
|
])
|
128
|
-
compare_generated_vs_original(generated, original, @
|
126
|
+
compare_generated_vs_original(generated, original, @uri)
|
129
127
|
end
|
130
128
|
|
131
129
|
it "return LinksNode/TextNode" do
|
@@ -142,7 +140,7 @@ describe 'Yasuri' do
|
|
142
140
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
143
141
|
])
|
144
142
|
|
145
|
-
compare_generated_vs_original(generated, original, @
|
143
|
+
compare_generated_vs_original(generated, original, @uri)
|
146
144
|
end
|
147
145
|
|
148
146
|
it "return PaginateNode/TextNode" do
|
@@ -158,9 +156,8 @@ describe 'Yasuri' do
|
|
158
156
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
159
157
|
])
|
160
158
|
|
161
|
-
|
162
|
-
|
163
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
159
|
+
uri = @uri + "/pagination/page01.html"
|
160
|
+
compare_generated_vs_original(generated, original, uri)
|
164
161
|
end
|
165
162
|
|
166
163
|
it "return PaginateNode/TextNode with limit" do
|
@@ -177,9 +174,8 @@ describe 'Yasuri' do
|
|
177
174
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
178
175
|
], limit:2)
|
179
176
|
|
180
|
-
|
181
|
-
|
182
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
177
|
+
uri = @uri + "/pagination/page01.html"
|
178
|
+
compare_generated_vs_original(generated, original, uri)
|
183
179
|
end
|
184
180
|
|
185
181
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -201,8 +197,8 @@ describe 'Yasuri' do
|
|
201
197
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
202
198
|
])
|
203
199
|
])
|
204
|
-
|
205
|
-
compare_generated_vs_original(generated, original,
|
200
|
+
uri = @uri + "/struct/structual_text.html"
|
201
|
+
compare_generated_vs_original(generated, original, uri)
|
206
202
|
end
|
207
203
|
end
|
208
204
|
|
@@ -299,6 +295,46 @@ describe 'Yasuri' do
|
|
299
295
|
end
|
300
296
|
end
|
301
297
|
|
298
|
+
describe '.with_retry' do
|
299
|
+
it "call once if success" do
|
300
|
+
actual = Yasuri.with_retry(0){ 42 }
|
301
|
+
expect(actual).to match 42
|
302
|
+
end
|
303
|
+
|
304
|
+
it "call untile success" do
|
305
|
+
i = [1,1,0,0]
|
306
|
+
actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
307
|
+
expect(actual).to match 42/1
|
308
|
+
end
|
309
|
+
|
310
|
+
it "raise error when exceed retry count" do
|
311
|
+
i = [1,0,0,0]
|
312
|
+
expect {
|
313
|
+
Yasuri.with_retry(2){42 / i.pop } # do this 3 times
|
314
|
+
}.to raise_error(Exception)
|
315
|
+
end
|
316
|
+
|
317
|
+
it "wait interval before run" do
|
318
|
+
allow(Kernel).to receive(:sleep)
|
319
|
+
Yasuri.with_retry(0){ 42 }
|
320
|
+
expect(Kernel).to have_received(:sleep).once
|
321
|
+
end
|
322
|
+
|
323
|
+
it "wait interval before run" do
|
324
|
+
allow(Kernel).to receive(:sleep)
|
325
|
+
Yasuri.with_retry(0){ 42 }
|
326
|
+
expect(Kernel).to have_received(:sleep).once
|
327
|
+
end
|
328
|
+
|
329
|
+
it "wait interval for each runs" do
|
330
|
+
allow(Kernel).to receive(:sleep)
|
331
|
+
|
332
|
+
i = [1,1,0,0]
|
333
|
+
Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
334
|
+
expect(Kernel).to have_received(:sleep).exactly(3).times
|
335
|
+
end
|
336
|
+
end
|
337
|
+
|
302
338
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
339
|
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
340
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -11,8 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
|
-
@
|
15
|
-
@page = @agent.get(uri + "/struct/structual_text.html")
|
14
|
+
@uri = uri + "/struct/structual_text.html"
|
16
15
|
|
17
16
|
@table_1996 = [
|
18
17
|
{ "title" => "The Perfect Insider",
|
@@ -53,7 +52,7 @@ describe 'Yasuri' do
|
|
53
52
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
53
|
])
|
55
54
|
expected = @table_1996
|
56
|
-
actual = node.
|
55
|
+
actual = node.scrape(@uri)
|
57
56
|
expect(actual).to match expected
|
58
57
|
end
|
59
58
|
|
@@ -63,7 +62,7 @@ describe 'Yasuri' do
|
|
63
62
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
64
63
|
])
|
65
64
|
expected = @table_1996.first
|
66
|
-
actual = node.
|
65
|
+
actual = node.scrape(@uri)
|
67
66
|
expect(actual).to match expected
|
68
67
|
end
|
69
68
|
|
@@ -72,7 +71,7 @@ describe 'Yasuri' do
|
|
72
71
|
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
73
72
|
Yasuri::TextNode.new('./td[1]', "title")
|
74
73
|
])
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to be_empty
|
77
76
|
end
|
78
77
|
|
@@ -81,7 +80,7 @@ describe 'Yasuri' do
|
|
81
80
|
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
82
81
|
Yasuri::TextNode.new('./td[1]', "title")
|
83
82
|
])
|
84
|
-
expect { node.
|
83
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
85
84
|
end
|
86
85
|
|
87
86
|
it 'fail with invalid xpath in children' do
|
@@ -90,7 +89,7 @@ describe 'Yasuri' do
|
|
90
89
|
Yasuri::TextNode.new(invalid_xpath, "title"),
|
91
90
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
92
91
|
])
|
93
|
-
expect { node.
|
92
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
94
93
|
end
|
95
94
|
|
96
95
|
it 'scrape all tables' do
|
@@ -101,7 +100,7 @@ describe 'Yasuri' do
|
|
101
100
|
])
|
102
101
|
])
|
103
102
|
expected = @all_tables
|
104
|
-
actual = node.
|
103
|
+
actual = node.scrape(@uri)
|
105
104
|
expect(actual).to match expected
|
106
105
|
end
|
107
106
|
|
@@ -118,7 +117,7 @@ describe 'Yasuri' do
|
|
118
117
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
119
118
|
])
|
120
119
|
])
|
121
|
-
compare_generated_vs_original(generated, original, @
|
120
|
+
compare_generated_vs_original(generated, original, @uri)
|
122
121
|
end
|
123
122
|
|
124
123
|
it 'return child node as symbol' do
|
@@ -127,7 +126,7 @@ describe 'Yasuri' do
|
|
127
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
127
|
])
|
129
128
|
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
|
-
actual = node.
|
129
|
+
actual = node.scrape(@uri, symbolize_names:true)
|
131
130
|
expect(actual).to match expected
|
132
131
|
end
|
133
132
|
|
@@ -135,9 +134,7 @@ describe 'Yasuri' do
|
|
135
134
|
|
136
135
|
describe '::StructNode::Links' do
|
137
136
|
before do
|
138
|
-
@
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
137
|
+
@uri = uri + "/struct/structual_links.html"
|
141
138
|
@table = [
|
142
139
|
{ "title" => "Child01,02",
|
143
140
|
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
@@ -155,22 +152,21 @@ describe 'Yasuri' do
|
|
155
152
|
])
|
156
153
|
])
|
157
154
|
expected = @table
|
158
|
-
actual = node.
|
155
|
+
actual = node.scrape(@uri)
|
159
156
|
expect(actual).to match expected
|
160
157
|
end
|
161
158
|
end # descrive
|
162
159
|
|
163
160
|
describe '::StructNode::Pages' do
|
164
161
|
before do
|
165
|
-
@
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
162
|
+
@uri = uri + "/struct/structual_text.html"
|
167
163
|
end
|
168
164
|
|
169
165
|
it 'not supported' do
|
170
166
|
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
167
|
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
168
|
])
|
173
|
-
expect{ node.
|
169
|
+
expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
170
|
end
|
175
171
|
end
|
176
172
|
end
|
@@ -10,69 +10,68 @@ describe 'Yasuri' do
|
|
10
10
|
include_context 'httpserver'
|
11
11
|
|
12
12
|
before do
|
13
|
-
@
|
14
|
-
@index_page = @agent.get(uri)
|
13
|
+
@uri = uri
|
15
14
|
end
|
16
15
|
|
17
16
|
describe '::TextNode' do
|
18
17
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
18
|
|
20
19
|
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
-
actual = @node.
|
20
|
+
actual = @node.scrape(@uri)
|
22
21
|
expect(actual).to eq "Hello,Yasuri"
|
23
22
|
end
|
24
23
|
|
25
24
|
it 'return empty text if no match node' do
|
26
25
|
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
-
actual = no_match_node.
|
26
|
+
actual = no_match_node.scrape(@uri)
|
28
27
|
expect(actual).to be_empty
|
29
28
|
end
|
30
29
|
|
31
30
|
it 'fail with invalid xpath' do
|
32
31
|
invalid_xpath = '/html/body/no_match_node['
|
33
32
|
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
-
expect { node.
|
33
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
35
34
|
end
|
36
35
|
|
37
36
|
it "can be defined by DSL, return single TextNode title" do
|
38
37
|
generated = Yasuri.text_title '/html/body/p[1]'
|
39
38
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
-
compare_generated_vs_original(generated, original, @
|
39
|
+
compare_generated_vs_original(generated, original, @uri)
|
41
40
|
end
|
42
41
|
|
43
42
|
it "can be truncated with regexp" do
|
44
43
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
45
|
-
actual = node.
|
44
|
+
actual = node.scrape(@uri)
|
46
45
|
expect(actual).to eq "Hello"
|
47
46
|
end
|
48
47
|
|
49
48
|
it "return first captured if matched given capture pattern" do
|
50
49
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
|
51
|
-
actual = node.
|
50
|
+
actual = node.scrape(@uri)
|
52
51
|
expect(actual).to eq "ello,Yasur"
|
53
52
|
end
|
54
53
|
|
55
54
|
it "can be truncated with regexp" do
|
56
55
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
|
57
|
-
actual = node.
|
56
|
+
actual = node.scrape(@uri)
|
58
57
|
expect(actual).to eq "Yasuri"
|
59
58
|
end
|
60
59
|
|
61
60
|
it "return empty string if truncated with no match to regexp" do
|
62
61
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
|
63
|
-
actual = node.
|
62
|
+
actual = node.scrape(@uri)
|
64
63
|
expect(actual).to be_empty
|
65
64
|
end
|
66
65
|
|
67
66
|
it "return symbol method applied string" do
|
68
67
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
69
|
-
actual = node.
|
68
|
+
actual = node.scrape(@uri)
|
70
69
|
expect(actual).to eq "HELLO,YASURI"
|
71
70
|
end
|
72
71
|
|
73
72
|
it "return apply multi arguments" do
|
74
73
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to eq "ELLO,YASUR"
|
77
76
|
end
|
78
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -170,7 +170,10 @@ files:
|
|
170
170
|
- Rakefile
|
171
171
|
- USAGE.ja.md
|
172
172
|
- USAGE.md
|
173
|
-
-
|
173
|
+
- examples/example.rb
|
174
|
+
- examples/github.yml
|
175
|
+
- examples/sample.json
|
176
|
+
- examples/sample.yml
|
174
177
|
- exe/yasuri
|
175
178
|
- lib/yasuri.rb
|
176
179
|
- lib/yasuri/version.rb
|
data/app.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
# Author:: TAC (tac@tac42.net)
|
5
|
-
|
6
|
-
require 'pp'
|
7
|
-
require 'time'
|
8
|
-
require 'mechanize'
|
9
|
-
|
10
|
-
require_relative 'lib/yasuri/yasuri'
|
11
|
-
|
12
|
-
agent = Mechanize.new
|
13
|
-
|
14
|
-
uri = "http://www.asahi.com/"
|
15
|
-
|
16
|
-
# Node tree constructing by DSL
|
17
|
-
root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
|
18
|
-
text_title '//*[@id="MainInner"]/div[1]/div/h1'
|
19
|
-
text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
|
20
|
-
end
|
21
|
-
|
22
|
-
# Node tree constructing by JSON
|
23
|
-
src = <<-EOJSON
|
24
|
-
{ "node" : "links",
|
25
|
-
"name" : "root",
|
26
|
-
"path" : "//*[@id='MainInner']/div[1]/ul/li/a",
|
27
|
-
"children" : [
|
28
|
-
{ "node" : "text",
|
29
|
-
"name" : "title",
|
30
|
-
"path" : "//*[@id='MainInner']/div[1]/div/h1"
|
31
|
-
},
|
32
|
-
{ "node" : "text",
|
33
|
-
"name" : "article",
|
34
|
-
"path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
|
35
|
-
}
|
36
|
-
]
|
37
|
-
}
|
38
|
-
EOJSON
|
39
|
-
root = Yasuri.json2tree(src)
|
40
|
-
|
41
|
-
# Access to parsed resources
|
42
|
-
page = agent.get(uri)
|
43
|
-
contents = root.inject(agent, page)
|
44
|
-
|
45
|
-
contents.each do |h|
|
46
|
-
t = h['title']
|
47
|
-
a = h['article']
|
48
|
-
|
49
|
-
puts t
|
50
|
-
puts a
|
51
|
-
puts "=" * 100
|
52
|
-
end
|