yasuri 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1573790f94cdd8bf8621b1178f7c5ef5e00eee15
4
- data.tar.gz: b37d3a38cf679b38a1d5d9dba2bc785e54d6d2b3
3
+ metadata.gz: 66449daf368bdf42e7406cdf1b7db1eedc9625c9
4
+ data.tar.gz: d0bbef804af0da5228594df407f5e47edf5cf14e
5
5
  SHA512:
6
- metadata.gz: 8c8da39e14541e21ed520c051a4e425d6a5aed2f530b7b50e1652894ead3678d4717db7aa686a64f6c6f905d3df1e1bbfe71d1226381b0f08778a043fafaa2ba
7
- data.tar.gz: 1ee0447e8757ea26ff7b4dc36cc089dda9003be8de746259e8f430d27c608e0519e2558daf37b145f009e69b909c6647990f1e46876c47d47df7ba24b75f3862
6
+ metadata.gz: f2fce89a90d0d878d12d48b1e803d58581e1b3442efc76cfe2e3b938c048ada3fd558099634a3d25a2f48851771ee1b047e3c8303eed51ebf6dd9e5bf1eae122
7
+ data.tar.gz: 58f748c39156abade472ee3c8c0b9332c5321fc6562dad5a35cb3a01e80c9e7cafb9abb327ae5626b50aaf4afd4d4f2e67e2da5b341ca575bf60514b924ad19e
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -21,19 +21,27 @@ module Yasuri
21
21
 
22
22
  class TextNode
23
23
  include Node
24
- def inject(agent, page)
24
+ def initialize(xpath, name, children = [])
25
+ super(xpath, name, children)
26
+ @truncate_regexp, dummy = *children
27
+ end
28
+ def inject(agent, page, retry_count = 5)
25
29
  node = page.search(@xpath)
26
- node.text.to_s
30
+ text = node.text.to_s
31
+
32
+ text = text[@truncate_regexp, 0] if @truncate_regexp
33
+
34
+ text.to_s
27
35
  end
28
36
  end
29
37
 
30
38
  class StructNode
31
39
  include Node
32
- def inject(agent, page)
40
+ def inject(agent, page, retry_count = 5)
33
41
  sub_tags = page.search(@xpath)
34
42
  sub_tags.map do |sub_tag|
35
43
  child_results_kv = @children.map do |child_node|
36
- [child_node.name, child_node.inject(agent, sub_tag)]
44
+ [child_node.name, child_node.inject(agent, sub_tag, retry_count)]
37
45
  end
38
46
  Hash[child_results_kv]
39
47
  end
@@ -42,14 +50,14 @@ module Yasuri
42
50
 
43
51
  class LinksNode
44
52
  include Node
45
- def inject(agent, page)
53
+ def inject(agent, page, retry_count = 5)
46
54
  links = page.search(@xpath) || [] # links expected
47
55
  links.map do |link|
48
56
  link_button = Mechanize::Page::Link.new(link, agent, page)
49
- child_page = link_button.click
57
+ child_page = Yasuri.with_retry(retry_count) { link_button.click }
50
58
 
51
59
  child_results_kv = @children.map do |child_node|
52
- [child_node.name, child_node.inject(agent, child_page)]
60
+ [child_node.name, child_node.inject(agent, child_page, retry_count)]
53
61
  end
54
62
 
55
63
  Hash[child_results_kv]
@@ -59,12 +67,12 @@ module Yasuri
59
67
 
60
68
  class PaginateNode
61
69
  include Node
62
- def inject(agent, page)
70
+ def inject(agent, page, retry_count = 5)
63
71
 
64
72
  child_results = []
65
73
  while page
66
74
  child_results_kv = @children.map do |child_node|
67
- [child_node.name, child_node.inject(agent, page)]
75
+ [child_node.name, child_node.inject(agent, page, retry_count)]
68
76
  end
69
77
  child_results << Hash[child_results_kv]
70
78
 
@@ -72,7 +80,7 @@ module Yasuri
72
80
  break if link == nil
73
81
 
74
82
  link_button = Mechanize::Page::Link.new(link, agent, page)
75
- page = link_button.click
83
+ page = Yasuri.with_retry(retry_count) { link_button.click }
76
84
  end
77
85
 
78
86
  child_results
@@ -98,6 +106,7 @@ module Yasuri
98
106
 
99
107
  case name
100
108
  when /^text_(.+)$/
109
+ truncate_regexp, dummy = children
101
110
  Yasuri::TextNode.new(xpath, $1, children || [])
102
111
  when /^struct_(.+)$/
103
112
  Yasuri::StructNode.new(xpath, $1, children || [])
@@ -134,6 +143,19 @@ module Yasuri
134
143
  klass = Text2Node[node]
135
144
  klass ? klass.new(path, name, childnodes) : nil
136
145
  end
146
+
147
+ def self.with_retry(retry_count = 5)
148
+ begin
149
+ return yield() if block_given?
150
+ rescue => e
151
+ if retry_count > 0
152
+ pp "retry #{retry_count}"
153
+ retry_count -= 1
154
+ retry
155
+ end
156
+ fail e
157
+ end
158
+ end
137
159
  end
138
160
 
139
161
  # alias for DSL
data/spec/yasuri_spec.rb CHANGED
@@ -24,6 +24,9 @@ describe 'Yasuri' do
24
24
  expect(actual).to match expected
25
25
  end
26
26
 
27
+ ########
28
+ # Text #
29
+ ########
27
30
  describe '::TextNode' do
28
31
  before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
29
32
 
@@ -32,13 +35,46 @@ describe 'Yasuri' do
32
35
  expect(actual).to eq "Hello,Yasuri"
33
36
  end
34
37
 
38
+ it 'return empty text if no match node' do
39
+ no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
40
+ actual = no_match_node.inject(@agent, @index_page)
41
+ expect(actual).to be_empty
42
+ end
43
+
44
+ it 'fail with invalid xpath' do
45
+ invalid_xpath = '/html/body/no_match_node['
46
+ node = Yasuri::TextNode.new(invalid_xpath, "title")
47
+ expect { node.inject(@agent, @index_page) }.to raise_error
48
+ end
49
+
35
50
  it "can be defined by DSL, return single TextNode title" do
36
51
  generated = text_title '/html/body/p[1]'
37
52
  original = Yasuri::TextNode.new('/html/body/p[1]', "title")
38
53
  compare_generated_vs_original(generated, original)
39
54
  end
55
+
56
+ it "can be truncated with regexp" do
57
+ node = text_title '/html/body/p[1]', /^[^,]+/
58
+ actual = node.inject(@agent, @index_page)
59
+ expect(actual).to eq "Hello"
60
+ end
61
+
62
+ it "can be truncated with regexp" do
63
+ node = text_title '/html/body/p[1]', /[^,]+$/
64
+ actual = node.inject(@agent, @index_page)
65
+ expect(actual).to eq "Yasuri"
66
+ end
67
+
68
+ it "return empty string if truncated with no match to regexp" do
69
+ node = text_title '/html/body/p[1]', /^hoge/
70
+ actual = node.inject(@agent, @index_page)
71
+ expect(actual).to be_empty
72
+ end
40
73
  end
41
74
 
75
+ ##########
76
+ # Struct #
77
+ ##########
42
78
  describe '::StructNode' do
43
79
  before do
44
80
  @page = @agent.get(@uri + "/structual_text.html")
@@ -84,6 +120,32 @@ describe 'Yasuri' do
84
120
  expect(actual).to match expected
85
121
  end
86
122
 
123
+ it 'return empty text if no match node' do
124
+ no_match_xpath = '/html/body/table[1]/t'
125
+ node = Yasuri::StructNode.new(no_match_xpath, "table", [
126
+ Yasuri::TextNode.new('./td[1]', "title")
127
+ ])
128
+ actual = node.inject(@agent, @page)
129
+ expect(actual).to be_empty
130
+ end
131
+
132
+ it 'fail with invalid xpath' do
133
+ invalid_xpath = '/html/body/table[1]/table[1]/tr['
134
+ node = Yasuri::StructNode.new(invalid_xpath, "table", [
135
+ Yasuri::TextNode.new('./td[1]', "title")
136
+ ])
137
+ expect { node.inject(@agent, @page) }.to raise_error
138
+ end
139
+
140
+ it 'fail with invalid xpath in children' do
141
+ invalid_xpath = './td[1]['
142
+ node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
143
+ Yasuri::TextNode.new(invalid_xpath, "title"),
144
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
145
+ ])
146
+ expect { node.inject(@agent, @page) }.to raise_error
147
+ end
148
+
87
149
  it 'scrape all tables' do
88
150
  node = Yasuri::StructNode.new('/html/body/table', "tables", [
89
151
  Yasuri::StructNode.new('./tr', "table", [
@@ -113,6 +175,9 @@ describe 'Yasuri' do
113
175
  end
114
176
  end
115
177
 
178
+ #########
179
+ # Links #
180
+ #########
116
181
  describe '::LinksNode' do
117
182
  it 'scrape links' do
118
183
  root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
@@ -128,6 +193,16 @@ describe 'Yasuri' do
128
193
  expect(actual).to match expected
129
194
  end
130
195
 
196
+ it 'return empty set if no match node' do
197
+ missing_xpath = '/html/body/b'
198
+ root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
199
+ Yasuri::TextNode.new('/html/body/p', "content"),
200
+ ])
201
+
202
+ actual = root_node.inject(@agent, @index_page)
203
+ expect(actual).to be_empty
204
+ end
205
+
131
206
  it 'scrape links, recursive' do
132
207
  root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
133
208
  Yasuri::TextNode.new('/html/body/p', "content"),
@@ -180,6 +255,9 @@ describe 'Yasuri' do
180
255
  end
181
256
  end
182
257
 
258
+ ############
259
+ # Paginate #
260
+ ############
183
261
  describe '::PaginateNode' do
184
262
  before do
185
263
  @uri += "/pagination/page01.html"
@@ -200,6 +278,25 @@ describe 'Yasuri' do
200
278
  expect(actual).to match expected
201
279
  end
202
280
 
281
+ it 'return first content if paginate link node is not found' do
282
+ missing_xpath = "/html/body/nav/span/b[@class='next']"
283
+ root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
284
+ Yasuri::TextNode.new('/html/body/p', "content"),
285
+ ])
286
+ actual = root_node.inject(@agent, @page)
287
+ expected = [ {"content" => "PaginationTest01"}, ]
288
+ expect(actual).to match_array expected
289
+ end
290
+
291
+ it 'return empty hashes if content node is not found' do
292
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
293
+ Yasuri::TextNode.new('/html/body/hoge', "content"),
294
+ ])
295
+ actual = root_node.inject(@agent, @page)
296
+ expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
297
+ expect(actual).to match_array expected
298
+ end
299
+
203
300
  it 'can be defined by DSL, return single PaginateNode content' do
204
301
  generated = pages_next "/html/body/nav/span/a[@class='next']" do
205
302
  text_content '/html/body/p'
@@ -211,6 +308,9 @@ describe 'Yasuri' do
211
308
  end
212
309
  end
213
310
 
311
+ #############
312
+ # json2tree #
313
+ #############
214
314
  describe '.json2tree' do
215
315
  it "return empty tree" do
216
316
  tree = Yasuri.json2tree("{}")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-21 00:00:00.000000000 Z
11
+ date: 2015-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler