yasuri 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -7
- data/USAGE.ja.md +107 -86
- data/USAGE.md +106 -87
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +6 -2
- data/lib/yasuri/yasuri_cli.rb +6 -6
- data/lib/yasuri/yasuri_links_node.rb +3 -1
- data/lib/yasuri/yasuri_map_node.rb +1 -0
- data/lib/yasuri/yasuri_node.rb +14 -0
- data/lib/yasuri/yasuri_paginate_node.rb +2 -1
- data/spec/spec_helper.rb +3 -3
- data/spec/yasuri_cli_spec.rb +17 -4
- data/spec/yasuri_links_node_spec.rb +24 -10
- data/spec/yasuri_map_spec.rb +4 -5
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +55 -19
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- metadata +6 -3
- data/app.rb +0 -52
data/spec/yasuri_spec.rb
CHANGED
@@ -8,9 +8,7 @@ describe 'Yasuri' do
|
|
8
8
|
include_context 'httpserver'
|
9
9
|
|
10
10
|
before do
|
11
|
-
@agent = Mechanize.new
|
12
11
|
@uri = uri
|
13
|
-
@index_page = @agent.get(@uri)
|
14
12
|
end
|
15
13
|
|
16
14
|
|
@@ -29,7 +27,7 @@ describe 'Yasuri' do
|
|
29
27
|
generated = Yasuri.yaml2tree(src)
|
30
28
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
29
|
|
32
|
-
compare_generated_vs_original(generated, original, @index_page)
|
30
|
+
compare_generated_vs_original(generated, original, @uri)
|
33
31
|
end
|
34
32
|
|
35
33
|
it "return text node as symbol" do
|
@@ -40,7 +38,7 @@ describe 'Yasuri' do
|
|
40
38
|
generated = Yasuri.yaml2tree(src)
|
41
39
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
42
40
|
|
43
|
-
compare_generated_vs_original(generated, original, @index_page)
|
41
|
+
compare_generated_vs_original(generated, original, @uri)
|
44
42
|
end
|
45
43
|
|
46
44
|
it "return LinksNode/TextNode" do
|
@@ -55,7 +53,7 @@ describe 'Yasuri' do
|
|
55
53
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
56
54
|
])
|
57
55
|
|
58
|
-
compare_generated_vs_original(generated, original, @index_page)
|
56
|
+
compare_generated_vs_original(generated, original, @uri)
|
59
57
|
end
|
60
58
|
|
61
59
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -75,8 +73,8 @@ describe 'Yasuri' do
|
|
75
73
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
76
74
|
])
|
77
75
|
])
|
78
|
-
|
79
|
-
compare_generated_vs_original(generated, original,
|
76
|
+
uri = @uri + "/struct/structual_text.html"
|
77
|
+
compare_generated_vs_original(generated, original, uri)
|
80
78
|
end
|
81
79
|
|
82
80
|
end # end of describe '.yaml2tree'
|
@@ -98,7 +96,7 @@ describe 'Yasuri' do
|
|
98
96
|
generated = Yasuri.json2tree(src)
|
99
97
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
100
98
|
|
101
|
-
compare_generated_vs_original(generated, original, @index_page)
|
99
|
+
compare_generated_vs_original(generated, original, @uri)
|
102
100
|
end
|
103
101
|
|
104
102
|
it "return TextNode with truncate_regexp" do
|
@@ -111,7 +109,7 @@ describe 'Yasuri' do
|
|
111
109
|
}|
|
112
110
|
generated = Yasuri.json2tree(src)
|
113
111
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
114
|
-
compare_generated_vs_original(generated, original, @index_page)
|
112
|
+
compare_generated_vs_original(generated, original, @uri)
|
115
113
|
end
|
116
114
|
|
117
115
|
it "return MapNode with TextNodes" do
|
@@ -125,7 +123,7 @@ describe 'Yasuri' do
|
|
125
123
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
126
124
|
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
127
125
|
])
|
128
|
-
compare_generated_vs_original(generated, original, @index_page)
|
126
|
+
compare_generated_vs_original(generated, original, @uri)
|
129
127
|
end
|
130
128
|
|
131
129
|
it "return LinksNode/TextNode" do
|
@@ -142,7 +140,7 @@ describe 'Yasuri' do
|
|
142
140
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
143
141
|
])
|
144
142
|
|
145
|
-
compare_generated_vs_original(generated, original, @index_page)
|
143
|
+
compare_generated_vs_original(generated, original, @uri)
|
146
144
|
end
|
147
145
|
|
148
146
|
it "return PaginateNode/TextNode" do
|
@@ -158,9 +156,8 @@ describe 'Yasuri' do
|
|
158
156
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
159
157
|
])
|
160
158
|
|
161
|
-
|
162
|
-
|
163
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
159
|
+
uri = @uri + "/pagination/page01.html"
|
160
|
+
compare_generated_vs_original(generated, original, uri)
|
164
161
|
end
|
165
162
|
|
166
163
|
it "return PaginateNode/TextNode with limit" do
|
@@ -177,9 +174,8 @@ describe 'Yasuri' do
|
|
177
174
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
178
175
|
], limit:2)
|
179
176
|
|
180
|
-
|
181
|
-
|
182
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
177
|
+
uri = @uri + "/pagination/page01.html"
|
178
|
+
compare_generated_vs_original(generated, original, uri)
|
183
179
|
end
|
184
180
|
|
185
181
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
@@ -201,8 +197,8 @@ describe 'Yasuri' do
|
|
201
197
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
202
198
|
])
|
203
199
|
])
|
204
|
-
|
205
|
-
compare_generated_vs_original(generated, original,
|
200
|
+
uri = @uri + "/struct/structual_text.html"
|
201
|
+
compare_generated_vs_original(generated, original, uri)
|
206
202
|
end
|
207
203
|
end
|
208
204
|
|
@@ -299,6 +295,46 @@ describe 'Yasuri' do
|
|
299
295
|
end
|
300
296
|
end
|
301
297
|
|
298
|
+
describe '.with_retry' do
|
299
|
+
it "call once if success" do
|
300
|
+
actual = Yasuri.with_retry(0){ 42 }
|
301
|
+
expect(actual).to match 42
|
302
|
+
end
|
303
|
+
|
304
|
+
it "call untile success" do
|
305
|
+
i = [1,1,0,0]
|
306
|
+
actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
307
|
+
expect(actual).to match 42/1
|
308
|
+
end
|
309
|
+
|
310
|
+
it "raise error when exceed retry count" do
|
311
|
+
i = [1,0,0,0]
|
312
|
+
expect {
|
313
|
+
Yasuri.with_retry(2){42 / i.pop } # do this 3 times
|
314
|
+
}.to raise_error(Exception)
|
315
|
+
end
|
316
|
+
|
317
|
+
it "wait interval before run" do
|
318
|
+
allow(Kernel).to receive(:sleep)
|
319
|
+
Yasuri.with_retry(0){ 42 }
|
320
|
+
expect(Kernel).to have_received(:sleep).once
|
321
|
+
end
|
322
|
+
|
323
|
+
it "wait interval before run" do
|
324
|
+
allow(Kernel).to receive(:sleep)
|
325
|
+
Yasuri.with_retry(0){ 42 }
|
326
|
+
expect(Kernel).to have_received(:sleep).once
|
327
|
+
end
|
328
|
+
|
329
|
+
it "wait interval for each runs" do
|
330
|
+
allow(Kernel).to receive(:sleep)
|
331
|
+
|
332
|
+
i = [1,1,0,0]
|
333
|
+
Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
334
|
+
expect(Kernel).to have_received(:sleep).exactly(3).times
|
335
|
+
end
|
336
|
+
end
|
337
|
+
|
302
338
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
339
|
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
340
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -11,8 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/struct/structual_text.html")
|
14
|
+
@uri = uri + "/struct/structual_text.html"
|
16
15
|
|
17
16
|
@table_1996 = [
|
18
17
|
{ "title" => "The Perfect Insider",
|
@@ -53,7 +52,7 @@ describe 'Yasuri' do
|
|
53
52
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
53
|
])
|
55
54
|
expected = @table_1996
|
56
|
-
actual = node.
|
55
|
+
actual = node.scrape(@uri)
|
57
56
|
expect(actual).to match expected
|
58
57
|
end
|
59
58
|
|
@@ -63,7 +62,7 @@ describe 'Yasuri' do
|
|
63
62
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
64
63
|
])
|
65
64
|
expected = @table_1996.first
|
66
|
-
actual = node.
|
65
|
+
actual = node.scrape(@uri)
|
67
66
|
expect(actual).to match expected
|
68
67
|
end
|
69
68
|
|
@@ -72,7 +71,7 @@ describe 'Yasuri' do
|
|
72
71
|
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
73
72
|
Yasuri::TextNode.new('./td[1]', "title")
|
74
73
|
])
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to be_empty
|
77
76
|
end
|
78
77
|
|
@@ -81,7 +80,7 @@ describe 'Yasuri' do
|
|
81
80
|
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
82
81
|
Yasuri::TextNode.new('./td[1]', "title")
|
83
82
|
])
|
84
|
-
expect { node.
|
83
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
85
84
|
end
|
86
85
|
|
87
86
|
it 'fail with invalid xpath in children' do
|
@@ -90,7 +89,7 @@ describe 'Yasuri' do
|
|
90
89
|
Yasuri::TextNode.new(invalid_xpath, "title"),
|
91
90
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
92
91
|
])
|
93
|
-
expect { node.
|
92
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
94
93
|
end
|
95
94
|
|
96
95
|
it 'scrape all tables' do
|
@@ -101,7 +100,7 @@ describe 'Yasuri' do
|
|
101
100
|
])
|
102
101
|
])
|
103
102
|
expected = @all_tables
|
104
|
-
actual = node.
|
103
|
+
actual = node.scrape(@uri)
|
105
104
|
expect(actual).to match expected
|
106
105
|
end
|
107
106
|
|
@@ -118,7 +117,7 @@ describe 'Yasuri' do
|
|
118
117
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
119
118
|
])
|
120
119
|
])
|
121
|
-
compare_generated_vs_original(generated, original, @page)
|
120
|
+
compare_generated_vs_original(generated, original, @uri)
|
122
121
|
end
|
123
122
|
|
124
123
|
it 'return child node as symbol' do
|
@@ -127,7 +126,7 @@ describe 'Yasuri' do
|
|
127
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
127
|
])
|
129
128
|
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
|
-
actual = node.
|
129
|
+
actual = node.scrape(@uri, symbolize_names:true)
|
131
130
|
expect(actual).to match expected
|
132
131
|
end
|
133
132
|
|
@@ -135,9 +134,7 @@ describe 'Yasuri' do
|
|
135
134
|
|
136
135
|
describe '::StructNode::Links' do
|
137
136
|
before do
|
138
|
-
@agent = Mechanize.new
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
137
|
+
@uri = uri + "/struct/structual_links.html"
|
141
138
|
@table = [
|
142
139
|
{ "title" => "Child01,02",
|
143
140
|
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
@@ -155,22 +152,21 @@ describe 'Yasuri' do
|
|
155
152
|
])
|
156
153
|
])
|
157
154
|
expected = @table
|
158
|
-
actual = node.
|
155
|
+
actual = node.scrape(@uri)
|
159
156
|
expect(actual).to match expected
|
160
157
|
end
|
161
158
|
end # descrive
|
162
159
|
|
163
160
|
describe '::StructNode::Pages' do
|
164
161
|
before do
|
165
|
-
@agent = Mechanize.new
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
162
|
+
@uri = uri + "/struct/structual_text.html"
|
167
163
|
end
|
168
164
|
|
169
165
|
it 'not supported' do
|
170
166
|
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
167
|
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
168
|
])
|
173
|
-
expect{ node.
|
169
|
+
expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
170
|
end
|
175
171
|
end
|
176
172
|
end
|
@@ -10,69 +10,68 @@ describe 'Yasuri' do
|
|
10
10
|
include_context 'httpserver'
|
11
11
|
|
12
12
|
before do
|
13
|
-
@agent = Mechanize.new
|
14
|
-
@index_page = @agent.get(uri)
|
13
|
+
@uri = uri
|
15
14
|
end
|
16
15
|
|
17
16
|
describe '::TextNode' do
|
18
17
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
18
|
|
20
19
|
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
-
actual = @node.
|
20
|
+
actual = @node.scrape(@uri)
|
22
21
|
expect(actual).to eq "Hello,Yasuri"
|
23
22
|
end
|
24
23
|
|
25
24
|
it 'return empty text if no match node' do
|
26
25
|
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
-
actual = no_match_node.
|
26
|
+
actual = no_match_node.scrape(@uri)
|
28
27
|
expect(actual).to be_empty
|
29
28
|
end
|
30
29
|
|
31
30
|
it 'fail with invalid xpath' do
|
32
31
|
invalid_xpath = '/html/body/no_match_node['
|
33
32
|
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
-
expect { node.
|
33
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
35
34
|
end
|
36
35
|
|
37
36
|
it "can be defined by DSL, return single TextNode title" do
|
38
37
|
generated = Yasuri.text_title '/html/body/p[1]'
|
39
38
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
-
compare_generated_vs_original(generated, original, @index_page)
|
39
|
+
compare_generated_vs_original(generated, original, @uri)
|
41
40
|
end
|
42
41
|
|
43
42
|
it "can be truncated with regexp" do
|
44
43
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
45
|
-
actual = node.
|
44
|
+
actual = node.scrape(@uri)
|
46
45
|
expect(actual).to eq "Hello"
|
47
46
|
end
|
48
47
|
|
49
48
|
it "return first captured if matched given capture pattern" do
|
50
49
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
|
51
|
-
actual = node.
|
50
|
+
actual = node.scrape(@uri)
|
52
51
|
expect(actual).to eq "ello,Yasur"
|
53
52
|
end
|
54
53
|
|
55
54
|
it "can be truncated with regexp" do
|
56
55
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
|
57
|
-
actual = node.
|
56
|
+
actual = node.scrape(@uri)
|
58
57
|
expect(actual).to eq "Yasuri"
|
59
58
|
end
|
60
59
|
|
61
60
|
it "return empty string if truncated with no match to regexp" do
|
62
61
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
|
63
|
-
actual = node.
|
62
|
+
actual = node.scrape(@uri)
|
64
63
|
expect(actual).to be_empty
|
65
64
|
end
|
66
65
|
|
67
66
|
it "return symbol method applied string" do
|
68
67
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
69
|
-
actual = node.
|
68
|
+
actual = node.scrape(@uri)
|
70
69
|
expect(actual).to eq "HELLO,YASURI"
|
71
70
|
end
|
72
71
|
|
73
72
|
it "return apply multi arguments" do
|
74
73
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to eq "ELLO,YASUR"
|
77
76
|
end
|
78
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.2.0
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -170,7 +170,10 @@ files:
|
|
170
170
|
- Rakefile
|
171
171
|
- USAGE.ja.md
|
172
172
|
- USAGE.md
|
173
|
-
- app.rb
|
173
|
+
- examples/example.rb
|
174
|
+
- examples/github.yml
|
175
|
+
- examples/sample.json
|
176
|
+
- examples/sample.yml
|
174
177
|
- exe/yasuri
|
175
178
|
- lib/yasuri.rb
|
176
179
|
- lib/yasuri/version.rb
|
data/app.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
# Author:: TAC (tac@tac42.net)
|
5
|
-
|
6
|
-
require 'pp'
|
7
|
-
require 'time'
|
8
|
-
require 'mechanize'
|
9
|
-
|
10
|
-
require_relative 'lib/yasuri/yasuri'
|
11
|
-
|
12
|
-
agent = Mechanize.new
|
13
|
-
|
14
|
-
uri = "http://www.asahi.com/"
|
15
|
-
|
16
|
-
# Node tree constructing by DSL
|
17
|
-
root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
|
18
|
-
text_title '//*[@id="MainInner"]/div[1]/div/h1'
|
19
|
-
text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
|
20
|
-
end
|
21
|
-
|
22
|
-
# Node tree constructing by JSON
|
23
|
-
src = <<-EOJSON
|
24
|
-
{ "node" : "links",
|
25
|
-
"name" : "root",
|
26
|
-
"path" : "//*[@id='MainInner']/div[1]/ul/li/a",
|
27
|
-
"children" : [
|
28
|
-
{ "node" : "text",
|
29
|
-
"name" : "title",
|
30
|
-
"path" : "//*[@id='MainInner']/div[1]/div/h1"
|
31
|
-
},
|
32
|
-
{ "node" : "text",
|
33
|
-
"name" : "article",
|
34
|
-
"path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
|
35
|
-
}
|
36
|
-
]
|
37
|
-
}
|
38
|
-
EOJSON
|
39
|
-
root = Yasuri.json2tree(src)
|
40
|
-
|
41
|
-
# Access to parsed resources
|
42
|
-
page = agent.get(uri)
|
43
|
-
contents = root.inject(agent, page)
|
44
|
-
|
45
|
-
contents.each do |h|
|
46
|
-
t = h['title']
|
47
|
-
a = h['article']
|
48
|
-
|
49
|
-
puts t
|
50
|
-
puts a
|
51
|
-
puts "=" * 100
|
52
|
-
end
|