yasuri 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +32 -10
- data/spec/yasuri_spec.rb +100 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66449daf368bdf42e7406cdf1b7db1eedc9625c9
|
4
|
+
data.tar.gz: d0bbef804af0da5228594df407f5e47edf5cf14e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2fce89a90d0d878d12d48b1e803d58581e1b3442efc76cfe2e3b938c048ada3fd558099634a3d25a2f48851771ee1b047e3c8303eed51ebf6dd9e5bf1eae122
|
7
|
+
data.tar.gz: 58f748c39156abade472ee3c8c0b9332c5321fc6562dad5a35cb3a01e80c9e7cafb9abb327ae5626b50aaf4afd4d4f2e67e2da5b341ca575bf60514b924ad19e
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -21,19 +21,27 @@ module Yasuri
|
|
21
21
|
|
22
22
|
class TextNode
|
23
23
|
include Node
|
24
|
-
def inject(agent, page)
|
24
|
+
def initialize(xpath, name, children = [])
|
25
|
+
super(xpath, name, children)
|
26
|
+
@truncate_regexp, dummy = *children
|
27
|
+
end
|
28
|
+
def inject(agent, page, retry_count = 5)
|
25
29
|
node = page.search(@xpath)
|
26
|
-
node.text.to_s
|
30
|
+
text = node.text.to_s
|
31
|
+
|
32
|
+
text = text[@truncate_regexp, 0] if @truncate_regexp
|
33
|
+
|
34
|
+
text.to_s
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
30
38
|
class StructNode
|
31
39
|
include Node
|
32
|
-
def inject(agent, page)
|
40
|
+
def inject(agent, page, retry_count = 5)
|
33
41
|
sub_tags = page.search(@xpath)
|
34
42
|
sub_tags.map do |sub_tag|
|
35
43
|
child_results_kv = @children.map do |child_node|
|
36
|
-
[child_node.name, child_node.inject(agent, sub_tag)]
|
44
|
+
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
37
45
|
end
|
38
46
|
Hash[child_results_kv]
|
39
47
|
end
|
@@ -42,14 +50,14 @@ module Yasuri
|
|
42
50
|
|
43
51
|
class LinksNode
|
44
52
|
include Node
|
45
|
-
def inject(agent, page)
|
53
|
+
def inject(agent, page, retry_count = 5)
|
46
54
|
links = page.search(@xpath) || [] # links expected
|
47
55
|
links.map do |link|
|
48
56
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
49
|
-
child_page = link_button.click
|
57
|
+
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
50
58
|
|
51
59
|
child_results_kv = @children.map do |child_node|
|
52
|
-
[child_node.name, child_node.inject(agent, child_page)]
|
60
|
+
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
53
61
|
end
|
54
62
|
|
55
63
|
Hash[child_results_kv]
|
@@ -59,12 +67,12 @@ module Yasuri
|
|
59
67
|
|
60
68
|
class PaginateNode
|
61
69
|
include Node
|
62
|
-
def inject(agent, page)
|
70
|
+
def inject(agent, page, retry_count = 5)
|
63
71
|
|
64
72
|
child_results = []
|
65
73
|
while page
|
66
74
|
child_results_kv = @children.map do |child_node|
|
67
|
-
[child_node.name, child_node.inject(agent, page)]
|
75
|
+
[child_node.name, child_node.inject(agent, page, retry_count)]
|
68
76
|
end
|
69
77
|
child_results << Hash[child_results_kv]
|
70
78
|
|
@@ -72,7 +80,7 @@ module Yasuri
|
|
72
80
|
break if link == nil
|
73
81
|
|
74
82
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
75
|
-
page = link_button.click
|
83
|
+
page = Yasuri.with_retry(retry_count) { link_button.click }
|
76
84
|
end
|
77
85
|
|
78
86
|
child_results
|
@@ -98,6 +106,7 @@ module Yasuri
|
|
98
106
|
|
99
107
|
case name
|
100
108
|
when /^text_(.+)$/
|
109
|
+
truncate_regexp, dummy = children
|
101
110
|
Yasuri::TextNode.new(xpath, $1, children || [])
|
102
111
|
when /^struct_(.+)$/
|
103
112
|
Yasuri::StructNode.new(xpath, $1, children || [])
|
@@ -134,6 +143,19 @@ module Yasuri
|
|
134
143
|
klass = Text2Node[node]
|
135
144
|
klass ? klass.new(path, name, childnodes) : nil
|
136
145
|
end
|
146
|
+
|
147
|
+
def self.with_retry(retry_count = 5)
|
148
|
+
begin
|
149
|
+
return yield() if block_given?
|
150
|
+
rescue => e
|
151
|
+
if retry_count > 0
|
152
|
+
pp "retry #{retry_count}"
|
153
|
+
retry_count -= 1
|
154
|
+
retry
|
155
|
+
end
|
156
|
+
fail e
|
157
|
+
end
|
158
|
+
end
|
137
159
|
end
|
138
160
|
|
139
161
|
# alias for DSL
|
data/spec/yasuri_spec.rb
CHANGED
@@ -24,6 +24,9 @@ describe 'Yasuri' do
|
|
24
24
|
expect(actual).to match expected
|
25
25
|
end
|
26
26
|
|
27
|
+
########
|
28
|
+
# Text #
|
29
|
+
########
|
27
30
|
describe '::TextNode' do
|
28
31
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
29
32
|
|
@@ -32,13 +35,46 @@ describe 'Yasuri' do
|
|
32
35
|
expect(actual).to eq "Hello,Yasuri"
|
33
36
|
end
|
34
37
|
|
38
|
+
it 'return empty text if no match node' do
|
39
|
+
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
40
|
+
actual = no_match_node.inject(@agent, @index_page)
|
41
|
+
expect(actual).to be_empty
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'fail with invalid xpath' do
|
45
|
+
invalid_xpath = '/html/body/no_match_node['
|
46
|
+
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
47
|
+
expect { node.inject(@agent, @index_page) }.to raise_error
|
48
|
+
end
|
49
|
+
|
35
50
|
it "can be defined by DSL, return single TextNode title" do
|
36
51
|
generated = text_title '/html/body/p[1]'
|
37
52
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
38
53
|
compare_generated_vs_original(generated, original)
|
39
54
|
end
|
55
|
+
|
56
|
+
it "can be truncated with regexp" do
|
57
|
+
node = text_title '/html/body/p[1]', /^[^,]+/
|
58
|
+
actual = node.inject(@agent, @index_page)
|
59
|
+
expect(actual).to eq "Hello"
|
60
|
+
end
|
61
|
+
|
62
|
+
it "can be truncated with regexp" do
|
63
|
+
node = text_title '/html/body/p[1]', /[^,]+$/
|
64
|
+
actual = node.inject(@agent, @index_page)
|
65
|
+
expect(actual).to eq "Yasuri"
|
66
|
+
end
|
67
|
+
|
68
|
+
it "return empty string if truncated with no match to regexp" do
|
69
|
+
node = text_title '/html/body/p[1]', /^hoge/
|
70
|
+
actual = node.inject(@agent, @index_page)
|
71
|
+
expect(actual).to be_empty
|
72
|
+
end
|
40
73
|
end
|
41
74
|
|
75
|
+
##########
|
76
|
+
# Struct #
|
77
|
+
##########
|
42
78
|
describe '::StructNode' do
|
43
79
|
before do
|
44
80
|
@page = @agent.get(@uri + "/structual_text.html")
|
@@ -84,6 +120,32 @@ describe 'Yasuri' do
|
|
84
120
|
expect(actual).to match expected
|
85
121
|
end
|
86
122
|
|
123
|
+
it 'return empty text if no match node' do
|
124
|
+
no_match_xpath = '/html/body/table[1]/t'
|
125
|
+
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
126
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
127
|
+
])
|
128
|
+
actual = node.inject(@agent, @page)
|
129
|
+
expect(actual).to be_empty
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'fail with invalid xpath' do
|
133
|
+
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
134
|
+
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
135
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
136
|
+
])
|
137
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'fail with invalid xpath in children' do
|
141
|
+
invalid_xpath = './td[1]['
|
142
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
143
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
144
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
145
|
+
])
|
146
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
147
|
+
end
|
148
|
+
|
87
149
|
it 'scrape all tables' do
|
88
150
|
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
89
151
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -113,6 +175,9 @@ describe 'Yasuri' do
|
|
113
175
|
end
|
114
176
|
end
|
115
177
|
|
178
|
+
#########
|
179
|
+
# Links #
|
180
|
+
#########
|
116
181
|
describe '::LinksNode' do
|
117
182
|
it 'scrape links' do
|
118
183
|
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
@@ -128,6 +193,16 @@ describe 'Yasuri' do
|
|
128
193
|
expect(actual).to match expected
|
129
194
|
end
|
130
195
|
|
196
|
+
it 'return empty set if no match node' do
|
197
|
+
missing_xpath = '/html/body/b'
|
198
|
+
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
199
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
200
|
+
])
|
201
|
+
|
202
|
+
actual = root_node.inject(@agent, @index_page)
|
203
|
+
expect(actual).to be_empty
|
204
|
+
end
|
205
|
+
|
131
206
|
it 'scrape links, recursive' do
|
132
207
|
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
133
208
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -180,6 +255,9 @@ describe 'Yasuri' do
|
|
180
255
|
end
|
181
256
|
end
|
182
257
|
|
258
|
+
############
|
259
|
+
# Paginate #
|
260
|
+
############
|
183
261
|
describe '::PaginateNode' do
|
184
262
|
before do
|
185
263
|
@uri += "/pagination/page01.html"
|
@@ -200,6 +278,25 @@ describe 'Yasuri' do
|
|
200
278
|
expect(actual).to match expected
|
201
279
|
end
|
202
280
|
|
281
|
+
it 'return first content if paginate link node is not found' do
|
282
|
+
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
283
|
+
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
284
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
285
|
+
])
|
286
|
+
actual = root_node.inject(@agent, @page)
|
287
|
+
expected = [ {"content" => "PaginationTest01"}, ]
|
288
|
+
expect(actual).to match_array expected
|
289
|
+
end
|
290
|
+
|
291
|
+
it 'return empty hashes if content node is not found' do
|
292
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
293
|
+
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
294
|
+
])
|
295
|
+
actual = root_node.inject(@agent, @page)
|
296
|
+
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
297
|
+
expect(actual).to match_array expected
|
298
|
+
end
|
299
|
+
|
203
300
|
it 'can be defined by DSL, return single PaginateNode content' do
|
204
301
|
generated = pages_next "/html/body/nav/span/a[@class='next']" do
|
205
302
|
text_content '/html/body/p'
|
@@ -211,6 +308,9 @@ describe 'Yasuri' do
|
|
211
308
|
end
|
212
309
|
end
|
213
310
|
|
311
|
+
#############
|
312
|
+
# json2tree #
|
313
|
+
#############
|
214
314
|
describe '.json2tree' do
|
215
315
|
it "return empty tree" do
|
216
316
|
tree = Yasuri.json2tree("{}")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|