yasuri 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +12 -4
- data/spec/yasuri_spec.rb +25 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39e4792feb25676d0f6e6f524d93ece20fb83530
|
4
|
+
data.tar.gz: fefe743c8e4807b69535f998e53d71784685aff6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e532b3ad29681aebaec403bec9c34bcba96f14e6f5b8b6909d89d00938ec1988a4a26396a2fc058abe30a8f04e1c2fdfb8f755daee868c3b9eff6d4daf6c79a2
|
7
|
+
data.tar.gz: 469ac8b9a30715322e2efab60b2665b3da6eab66666bfbfc7af3cfd05d75750aa47e26b8763a373c5691395c93af5a7b35a71704358cf35f96e8662880ada5de
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -67,6 +67,12 @@ module Yasuri
|
|
67
67
|
|
68
68
|
class PaginateNode
|
69
69
|
include Node
|
70
|
+
|
71
|
+
def initialize(xpath, name, children = [], limit = Float::INFINITY)
|
72
|
+
super(xpath, name, children)
|
73
|
+
@limit = limit
|
74
|
+
end
|
75
|
+
|
70
76
|
def inject(agent, page, retry_count = 5)
|
71
77
|
|
72
78
|
child_results = []
|
@@ -81,6 +87,7 @@ module Yasuri
|
|
81
87
|
|
82
88
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
83
89
|
page = Yasuri.with_retry(retry_count) { link_button.click }
|
90
|
+
break if (@limit -= 1) <= 0
|
84
91
|
end
|
85
92
|
|
86
93
|
child_results
|
@@ -101,19 +108,20 @@ module Yasuri
|
|
101
108
|
end
|
102
109
|
|
103
110
|
def self.gen(name, *args, &block)
|
104
|
-
xpath,
|
111
|
+
xpath, opt = *args
|
105
112
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
106
113
|
|
107
114
|
case name
|
108
115
|
when /^text_(.+)$/
|
109
|
-
truncate_regexp
|
110
|
-
Yasuri::TextNode.new(xpath, $1,
|
116
|
+
truncate_regexp = opt
|
117
|
+
Yasuri::TextNode.new(xpath, $1, truncate_regexp)
|
111
118
|
when /^struct_(.+)$/
|
112
119
|
Yasuri::StructNode.new(xpath, $1, children || [])
|
113
120
|
when /^links_(.+)$/
|
114
121
|
Yasuri::LinksNode.new(xpath, $1, children || [])
|
115
122
|
when /^pages_(.+)$/
|
116
|
-
|
123
|
+
limit = opt || Float::INFINITY
|
124
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit)
|
117
125
|
else
|
118
126
|
nil
|
119
127
|
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -278,6 +278,20 @@ describe 'Yasuri' do
|
|
278
278
|
expect(actual).to match expected
|
279
279
|
end
|
280
280
|
|
281
|
+
it "scrape each paginated pages limited" do
|
282
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
+
], 3)
|
285
|
+
actual = root_node.inject(@agent, @page)
|
286
|
+
expected = [
|
287
|
+
{"content" => "PaginationTest01"},
|
288
|
+
{"content" => "PaginationTest02"},
|
289
|
+
{"content" => "PaginationTest03"},
|
290
|
+
]
|
291
|
+
expect(actual).to match expected
|
292
|
+
end
|
293
|
+
|
294
|
+
|
281
295
|
it 'return first content if paginate link node is not found' do
|
282
296
|
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
283
297
|
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
@@ -304,7 +318,17 @@ describe 'Yasuri' do
|
|
304
318
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
305
319
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
306
320
|
])
|
307
|
-
|
321
|
+
compare_generated_vs_original(generated, original, @page)
|
322
|
+
end
|
323
|
+
|
324
|
+
it 'can be defined by DSL, return single PaginateNode content limited' do
|
325
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
326
|
+
text_content '/html/body/p'
|
327
|
+
end
|
328
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
+
], 2)
|
331
|
+
compare_generated_vs_original(generated, original, @page)
|
308
332
|
end
|
309
333
|
end
|
310
334
|
|