yasuri 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1573790f94cdd8bf8621b1178f7c5ef5e00eee15
4
- data.tar.gz: b37d3a38cf679b38a1d5d9dba2bc785e54d6d2b3
3
+ metadata.gz: 66449daf368bdf42e7406cdf1b7db1eedc9625c9
4
+ data.tar.gz: d0bbef804af0da5228594df407f5e47edf5cf14e
5
5
  SHA512:
6
- metadata.gz: 8c8da39e14541e21ed520c051a4e425d6a5aed2f530b7b50e1652894ead3678d4717db7aa686a64f6c6f905d3df1e1bbfe71d1226381b0f08778a043fafaa2ba
7
- data.tar.gz: 1ee0447e8757ea26ff7b4dc36cc089dda9003be8de746259e8f430d27c608e0519e2558daf37b145f009e69b909c6647990f1e46876c47d47df7ba24b75f3862
6
+ metadata.gz: f2fce89a90d0d878d12d48b1e803d58581e1b3442efc76cfe2e3b938c048ada3fd558099634a3d25a2f48851771ee1b047e3c8303eed51ebf6dd9e5bf1eae122
7
+ data.tar.gz: 58f748c39156abade472ee3c8c0b9332c5321fc6562dad5a35cb3a01e80c9e7cafb9abb327ae5626b50aaf4afd4d4f2e67e2da5b341ca575bf60514b924ad19e
data/lib/yasuri/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -21,19 +21,27 @@ module Yasuri
21
21
 
22
22
  class TextNode
23
23
  include Node
24
- def inject(agent, page)
24
+ def initialize(xpath, name, children = [])
25
+ super(xpath, name, children)
26
+ @truncate_regexp, dummy = *children
27
+ end
28
+ def inject(agent, page, retry_count = 5)
25
29
  node = page.search(@xpath)
26
- node.text.to_s
30
+ text = node.text.to_s
31
+
32
+ text = text[@truncate_regexp, 0] if @truncate_regexp
33
+
34
+ text.to_s
27
35
  end
28
36
  end
29
37
 
30
38
  class StructNode
31
39
  include Node
32
- def inject(agent, page)
40
+ def inject(agent, page, retry_count = 5)
33
41
  sub_tags = page.search(@xpath)
34
42
  sub_tags.map do |sub_tag|
35
43
  child_results_kv = @children.map do |child_node|
36
- [child_node.name, child_node.inject(agent, sub_tag)]
44
+ [child_node.name, child_node.inject(agent, sub_tag, retry_count)]
37
45
  end
38
46
  Hash[child_results_kv]
39
47
  end
@@ -42,14 +50,14 @@ module Yasuri
42
50
 
43
51
  class LinksNode
44
52
  include Node
45
- def inject(agent, page)
53
+ def inject(agent, page, retry_count = 5)
46
54
  links = page.search(@xpath) || [] # links expected
47
55
  links.map do |link|
48
56
  link_button = Mechanize::Page::Link.new(link, agent, page)
49
- child_page = link_button.click
57
+ child_page = Yasuri.with_retry(retry_count) { link_button.click }
50
58
 
51
59
  child_results_kv = @children.map do |child_node|
52
- [child_node.name, child_node.inject(agent, child_page)]
60
+ [child_node.name, child_node.inject(agent, child_page, retry_count)]
53
61
  end
54
62
 
55
63
  Hash[child_results_kv]
@@ -59,12 +67,12 @@ module Yasuri
59
67
 
60
68
  class PaginateNode
61
69
  include Node
62
- def inject(agent, page)
70
+ def inject(agent, page, retry_count = 5)
63
71
 
64
72
  child_results = []
65
73
  while page
66
74
  child_results_kv = @children.map do |child_node|
67
- [child_node.name, child_node.inject(agent, page)]
75
+ [child_node.name, child_node.inject(agent, page, retry_count)]
68
76
  end
69
77
  child_results << Hash[child_results_kv]
70
78
 
@@ -72,7 +80,7 @@ module Yasuri
72
80
  break if link == nil
73
81
 
74
82
  link_button = Mechanize::Page::Link.new(link, agent, page)
75
- page = link_button.click
83
+ page = Yasuri.with_retry(retry_count) { link_button.click }
76
84
  end
77
85
 
78
86
  child_results
@@ -98,6 +106,7 @@ module Yasuri
98
106
 
99
107
  case name
100
108
  when /^text_(.+)$/
109
+ truncate_regexp, dummy = children
101
110
  Yasuri::TextNode.new(xpath, $1, children || [])
102
111
  when /^struct_(.+)$/
103
112
  Yasuri::StructNode.new(xpath, $1, children || [])
@@ -134,6 +143,19 @@ module Yasuri
134
143
  klass = Text2Node[node]
135
144
  klass ? klass.new(path, name, childnodes) : nil
136
145
  end
146
+
147
+ def self.with_retry(retry_count = 5)
148
+ begin
149
+ return yield() if block_given?
150
+ rescue => e
151
+ if retry_count > 0
152
+ pp "retry #{retry_count}"
153
+ retry_count -= 1
154
+ retry
155
+ end
156
+ fail e
157
+ end
158
+ end
137
159
  end
138
160
 
139
161
  # alias for DSL
data/spec/yasuri_spec.rb CHANGED
@@ -24,6 +24,9 @@ describe 'Yasuri' do
24
24
  expect(actual).to match expected
25
25
  end
26
26
 
27
+ ########
28
+ # Text #
29
+ ########
27
30
  describe '::TextNode' do
28
31
  before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
29
32
 
@@ -32,13 +35,46 @@ describe 'Yasuri' do
32
35
  expect(actual).to eq "Hello,Yasuri"
33
36
  end
34
37
 
38
+ it 'return empty text if no match node' do
39
+ no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
40
+ actual = no_match_node.inject(@agent, @index_page)
41
+ expect(actual).to be_empty
42
+ end
43
+
44
+ it 'fail with invalid xpath' do
45
+ invalid_xpath = '/html/body/no_match_node['
46
+ node = Yasuri::TextNode.new(invalid_xpath, "title")
47
+ expect { node.inject(@agent, @index_page) }.to raise_error
48
+ end
49
+
35
50
  it "can be defined by DSL, return single TextNode title" do
36
51
  generated = text_title '/html/body/p[1]'
37
52
  original = Yasuri::TextNode.new('/html/body/p[1]', "title")
38
53
  compare_generated_vs_original(generated, original)
39
54
  end
55
+
56
+ it "can be truncated with regexp" do
57
+ node = text_title '/html/body/p[1]', /^[^,]+/
58
+ actual = node.inject(@agent, @index_page)
59
+ expect(actual).to eq "Hello"
60
+ end
61
+
62
+ it "can be truncated with regexp" do
63
+ node = text_title '/html/body/p[1]', /[^,]+$/
64
+ actual = node.inject(@agent, @index_page)
65
+ expect(actual).to eq "Yasuri"
66
+ end
67
+
68
+ it "return empty string if truncated with no match to regexp" do
69
+ node = text_title '/html/body/p[1]', /^hoge/
70
+ actual = node.inject(@agent, @index_page)
71
+ expect(actual).to be_empty
72
+ end
40
73
  end
41
74
 
75
+ ##########
76
+ # Struct #
77
+ ##########
42
78
  describe '::StructNode' do
43
79
  before do
44
80
  @page = @agent.get(@uri + "/structual_text.html")
@@ -84,6 +120,32 @@ describe 'Yasuri' do
84
120
  expect(actual).to match expected
85
121
  end
86
122
 
123
+ it 'return empty text if no match node' do
124
+ no_match_xpath = '/html/body/table[1]/t'
125
+ node = Yasuri::StructNode.new(no_match_xpath, "table", [
126
+ Yasuri::TextNode.new('./td[1]', "title")
127
+ ])
128
+ actual = node.inject(@agent, @page)
129
+ expect(actual).to be_empty
130
+ end
131
+
132
+ it 'fail with invalid xpath' do
133
+ invalid_xpath = '/html/body/table[1]/table[1]/tr['
134
+ node = Yasuri::StructNode.new(invalid_xpath, "table", [
135
+ Yasuri::TextNode.new('./td[1]', "title")
136
+ ])
137
+ expect { node.inject(@agent, @page) }.to raise_error
138
+ end
139
+
140
+ it 'fail with invalid xpath in children' do
141
+ invalid_xpath = './td[1]['
142
+ node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
143
+ Yasuri::TextNode.new(invalid_xpath, "title"),
144
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
145
+ ])
146
+ expect { node.inject(@agent, @page) }.to raise_error
147
+ end
148
+
87
149
  it 'scrape all tables' do
88
150
  node = Yasuri::StructNode.new('/html/body/table', "tables", [
89
151
  Yasuri::StructNode.new('./tr', "table", [
@@ -113,6 +175,9 @@ describe 'Yasuri' do
113
175
  end
114
176
  end
115
177
 
178
+ #########
179
+ # Links #
180
+ #########
116
181
  describe '::LinksNode' do
117
182
  it 'scrape links' do
118
183
  root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
@@ -128,6 +193,16 @@ describe 'Yasuri' do
128
193
  expect(actual).to match expected
129
194
  end
130
195
 
196
+ it 'return empty set if no match node' do
197
+ missing_xpath = '/html/body/b'
198
+ root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
199
+ Yasuri::TextNode.new('/html/body/p', "content"),
200
+ ])
201
+
202
+ actual = root_node.inject(@agent, @index_page)
203
+ expect(actual).to be_empty
204
+ end
205
+
131
206
  it 'scrape links, recursive' do
132
207
  root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
133
208
  Yasuri::TextNode.new('/html/body/p', "content"),
@@ -180,6 +255,9 @@ describe 'Yasuri' do
180
255
  end
181
256
  end
182
257
 
258
+ ############
259
+ # Paginate #
260
+ ############
183
261
  describe '::PaginateNode' do
184
262
  before do
185
263
  @uri += "/pagination/page01.html"
@@ -200,6 +278,25 @@ describe 'Yasuri' do
200
278
  expect(actual).to match expected
201
279
  end
202
280
 
281
+ it 'return first content if paginate link node is not found' do
282
+ missing_xpath = "/html/body/nav/span/b[@class='next']"
283
+ root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
284
+ Yasuri::TextNode.new('/html/body/p', "content"),
285
+ ])
286
+ actual = root_node.inject(@agent, @page)
287
+ expected = [ {"content" => "PaginationTest01"}, ]
288
+ expect(actual).to match_array expected
289
+ end
290
+
291
+ it 'return empty hashes if content node is not found' do
292
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
293
+ Yasuri::TextNode.new('/html/body/hoge', "content"),
294
+ ])
295
+ actual = root_node.inject(@agent, @page)
296
+ expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
297
+ expect(actual).to match_array expected
298
+ end
299
+
203
300
  it 'can be defined by DSL, return single PaginateNode content' do
204
301
  generated = pages_next "/html/body/nav/span/a[@class='next']" do
205
302
  text_content '/html/body/p'
@@ -211,6 +308,9 @@ describe 'Yasuri' do
211
308
  end
212
309
  end
213
310
 
311
+ #############
312
+ # json2tree #
313
+ #############
214
314
  describe '.json2tree' do
215
315
  it "return empty tree" do
216
316
  tree = Yasuri.json2tree("{}")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-21 00:00:00.000000000 Z
11
+ date: 2015-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler