yasuri 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +32 -10
- data/spec/yasuri_spec.rb +100 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66449daf368bdf42e7406cdf1b7db1eedc9625c9
|
4
|
+
data.tar.gz: d0bbef804af0da5228594df407f5e47edf5cf14e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2fce89a90d0d878d12d48b1e803d58581e1b3442efc76cfe2e3b938c048ada3fd558099634a3d25a2f48851771ee1b047e3c8303eed51ebf6dd9e5bf1eae122
|
7
|
+
data.tar.gz: 58f748c39156abade472ee3c8c0b9332c5321fc6562dad5a35cb3a01e80c9e7cafb9abb327ae5626b50aaf4afd4d4f2e67e2da5b341ca575bf60514b924ad19e
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -21,19 +21,27 @@ module Yasuri
|
|
21
21
|
|
22
22
|
class TextNode
|
23
23
|
include Node
|
24
|
-
def inject(agent, page)
|
24
|
+
def initialize(xpath, name, children = [])
|
25
|
+
super(xpath, name, children)
|
26
|
+
@truncate_regexp, dummy = *children
|
27
|
+
end
|
28
|
+
def inject(agent, page, retry_count = 5)
|
25
29
|
node = page.search(@xpath)
|
26
|
-
node.text.to_s
|
30
|
+
text = node.text.to_s
|
31
|
+
|
32
|
+
text = text[@truncate_regexp, 0] if @truncate_regexp
|
33
|
+
|
34
|
+
text.to_s
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
30
38
|
class StructNode
|
31
39
|
include Node
|
32
|
-
def inject(agent, page)
|
40
|
+
def inject(agent, page, retry_count = 5)
|
33
41
|
sub_tags = page.search(@xpath)
|
34
42
|
sub_tags.map do |sub_tag|
|
35
43
|
child_results_kv = @children.map do |child_node|
|
36
|
-
[child_node.name, child_node.inject(agent, sub_tag)]
|
44
|
+
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
37
45
|
end
|
38
46
|
Hash[child_results_kv]
|
39
47
|
end
|
@@ -42,14 +50,14 @@ module Yasuri
|
|
42
50
|
|
43
51
|
class LinksNode
|
44
52
|
include Node
|
45
|
-
def inject(agent, page)
|
53
|
+
def inject(agent, page, retry_count = 5)
|
46
54
|
links = page.search(@xpath) || [] # links expected
|
47
55
|
links.map do |link|
|
48
56
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
49
|
-
child_page = link_button.click
|
57
|
+
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
50
58
|
|
51
59
|
child_results_kv = @children.map do |child_node|
|
52
|
-
[child_node.name, child_node.inject(agent, child_page)]
|
60
|
+
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
53
61
|
end
|
54
62
|
|
55
63
|
Hash[child_results_kv]
|
@@ -59,12 +67,12 @@ module Yasuri
|
|
59
67
|
|
60
68
|
class PaginateNode
|
61
69
|
include Node
|
62
|
-
def inject(agent, page)
|
70
|
+
def inject(agent, page, retry_count = 5)
|
63
71
|
|
64
72
|
child_results = []
|
65
73
|
while page
|
66
74
|
child_results_kv = @children.map do |child_node|
|
67
|
-
[child_node.name, child_node.inject(agent, page)]
|
75
|
+
[child_node.name, child_node.inject(agent, page, retry_count)]
|
68
76
|
end
|
69
77
|
child_results << Hash[child_results_kv]
|
70
78
|
|
@@ -72,7 +80,7 @@ module Yasuri
|
|
72
80
|
break if link == nil
|
73
81
|
|
74
82
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
75
|
-
page = link_button.click
|
83
|
+
page = Yasuri.with_retry(retry_count) { link_button.click }
|
76
84
|
end
|
77
85
|
|
78
86
|
child_results
|
@@ -98,6 +106,7 @@ module Yasuri
|
|
98
106
|
|
99
107
|
case name
|
100
108
|
when /^text_(.+)$/
|
109
|
+
truncate_regexp, dummy = children
|
101
110
|
Yasuri::TextNode.new(xpath, $1, children || [])
|
102
111
|
when /^struct_(.+)$/
|
103
112
|
Yasuri::StructNode.new(xpath, $1, children || [])
|
@@ -134,6 +143,19 @@ module Yasuri
|
|
134
143
|
klass = Text2Node[node]
|
135
144
|
klass ? klass.new(path, name, childnodes) : nil
|
136
145
|
end
|
146
|
+
|
147
|
+
def self.with_retry(retry_count = 5)
|
148
|
+
begin
|
149
|
+
return yield() if block_given?
|
150
|
+
rescue => e
|
151
|
+
if retry_count > 0
|
152
|
+
pp "retry #{retry_count}"
|
153
|
+
retry_count -= 1
|
154
|
+
retry
|
155
|
+
end
|
156
|
+
fail e
|
157
|
+
end
|
158
|
+
end
|
137
159
|
end
|
138
160
|
|
139
161
|
# alias for DSL
|
data/spec/yasuri_spec.rb
CHANGED
@@ -24,6 +24,9 @@ describe 'Yasuri' do
|
|
24
24
|
expect(actual).to match expected
|
25
25
|
end
|
26
26
|
|
27
|
+
########
|
28
|
+
# Text #
|
29
|
+
########
|
27
30
|
describe '::TextNode' do
|
28
31
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
29
32
|
|
@@ -32,13 +35,46 @@ describe 'Yasuri' do
|
|
32
35
|
expect(actual).to eq "Hello,Yasuri"
|
33
36
|
end
|
34
37
|
|
38
|
+
it 'return empty text if no match node' do
|
39
|
+
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
40
|
+
actual = no_match_node.inject(@agent, @index_page)
|
41
|
+
expect(actual).to be_empty
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'fail with invalid xpath' do
|
45
|
+
invalid_xpath = '/html/body/no_match_node['
|
46
|
+
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
47
|
+
expect { node.inject(@agent, @index_page) }.to raise_error
|
48
|
+
end
|
49
|
+
|
35
50
|
it "can be defined by DSL, return single TextNode title" do
|
36
51
|
generated = text_title '/html/body/p[1]'
|
37
52
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
38
53
|
compare_generated_vs_original(generated, original)
|
39
54
|
end
|
55
|
+
|
56
|
+
it "can be truncated with regexp" do
|
57
|
+
node = text_title '/html/body/p[1]', /^[^,]+/
|
58
|
+
actual = node.inject(@agent, @index_page)
|
59
|
+
expect(actual).to eq "Hello"
|
60
|
+
end
|
61
|
+
|
62
|
+
it "can be truncated with regexp" do
|
63
|
+
node = text_title '/html/body/p[1]', /[^,]+$/
|
64
|
+
actual = node.inject(@agent, @index_page)
|
65
|
+
expect(actual).to eq "Yasuri"
|
66
|
+
end
|
67
|
+
|
68
|
+
it "return empty string if truncated with no match to regexp" do
|
69
|
+
node = text_title '/html/body/p[1]', /^hoge/
|
70
|
+
actual = node.inject(@agent, @index_page)
|
71
|
+
expect(actual).to be_empty
|
72
|
+
end
|
40
73
|
end
|
41
74
|
|
75
|
+
##########
|
76
|
+
# Struct #
|
77
|
+
##########
|
42
78
|
describe '::StructNode' do
|
43
79
|
before do
|
44
80
|
@page = @agent.get(@uri + "/structual_text.html")
|
@@ -84,6 +120,32 @@ describe 'Yasuri' do
|
|
84
120
|
expect(actual).to match expected
|
85
121
|
end
|
86
122
|
|
123
|
+
it 'return empty text if no match node' do
|
124
|
+
no_match_xpath = '/html/body/table[1]/t'
|
125
|
+
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
126
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
127
|
+
])
|
128
|
+
actual = node.inject(@agent, @page)
|
129
|
+
expect(actual).to be_empty
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'fail with invalid xpath' do
|
133
|
+
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
134
|
+
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
135
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
136
|
+
])
|
137
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'fail with invalid xpath in children' do
|
141
|
+
invalid_xpath = './td[1]['
|
142
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
143
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
144
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
145
|
+
])
|
146
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
147
|
+
end
|
148
|
+
|
87
149
|
it 'scrape all tables' do
|
88
150
|
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
89
151
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -113,6 +175,9 @@ describe 'Yasuri' do
|
|
113
175
|
end
|
114
176
|
end
|
115
177
|
|
178
|
+
#########
|
179
|
+
# Links #
|
180
|
+
#########
|
116
181
|
describe '::LinksNode' do
|
117
182
|
it 'scrape links' do
|
118
183
|
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
@@ -128,6 +193,16 @@ describe 'Yasuri' do
|
|
128
193
|
expect(actual).to match expected
|
129
194
|
end
|
130
195
|
|
196
|
+
it 'return empty set if no match node' do
|
197
|
+
missing_xpath = '/html/body/b'
|
198
|
+
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
199
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
200
|
+
])
|
201
|
+
|
202
|
+
actual = root_node.inject(@agent, @index_page)
|
203
|
+
expect(actual).to be_empty
|
204
|
+
end
|
205
|
+
|
131
206
|
it 'scrape links, recursive' do
|
132
207
|
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
133
208
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -180,6 +255,9 @@ describe 'Yasuri' do
|
|
180
255
|
end
|
181
256
|
end
|
182
257
|
|
258
|
+
############
|
259
|
+
# Paginate #
|
260
|
+
############
|
183
261
|
describe '::PaginateNode' do
|
184
262
|
before do
|
185
263
|
@uri += "/pagination/page01.html"
|
@@ -200,6 +278,25 @@ describe 'Yasuri' do
|
|
200
278
|
expect(actual).to match expected
|
201
279
|
end
|
202
280
|
|
281
|
+
it 'return first content if paginate link node is not found' do
|
282
|
+
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
283
|
+
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
284
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
285
|
+
])
|
286
|
+
actual = root_node.inject(@agent, @page)
|
287
|
+
expected = [ {"content" => "PaginationTest01"}, ]
|
288
|
+
expect(actual).to match_array expected
|
289
|
+
end
|
290
|
+
|
291
|
+
it 'return empty hashes if content node is not found' do
|
292
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
293
|
+
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
294
|
+
])
|
295
|
+
actual = root_node.inject(@agent, @page)
|
296
|
+
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
297
|
+
expect(actual).to match_array expected
|
298
|
+
end
|
299
|
+
|
203
300
|
it 'can be defined by DSL, return single PaginateNode content' do
|
204
301
|
generated = pages_next "/html/body/nav/span/a[@class='next']" do
|
205
302
|
text_content '/html/body/p'
|
@@ -211,6 +308,9 @@ describe 'Yasuri' do
|
|
211
308
|
end
|
212
309
|
end
|
213
310
|
|
311
|
+
#############
|
312
|
+
# json2tree #
|
313
|
+
#############
|
214
314
|
describe '.json2tree' do
|
215
315
|
it "return empty tree" do
|
216
316
|
tree = Yasuri.json2tree("{}")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|