yasuri 0.0.4 → 0.0.5
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +12 -4
- data/spec/yasuri_spec.rb +25 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39e4792feb25676d0f6e6f524d93ece20fb83530
|
4
|
+
data.tar.gz: fefe743c8e4807b69535f998e53d71784685aff6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e532b3ad29681aebaec403bec9c34bcba96f14e6f5b8b6909d89d00938ec1988a4a26396a2fc058abe30a8f04e1c2fdfb8f755daee868c3b9eff6d4daf6c79a2
|
7
|
+
data.tar.gz: 469ac8b9a30715322e2efab60b2665b3da6eab66666bfbfc7af3cfd05d75750aa47e26b8763a373c5691395c93af5a7b35a71704358cf35f96e8662880ada5de
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -67,6 +67,12 @@ module Yasuri
|
|
67
67
|
|
68
68
|
class PaginateNode
|
69
69
|
include Node
|
70
|
+
|
71
|
+
def initialize(xpath, name, children = [], limit = Float::INFINITY)
|
72
|
+
super(xpath, name, children)
|
73
|
+
@limit = limit
|
74
|
+
end
|
75
|
+
|
70
76
|
def inject(agent, page, retry_count = 5)
|
71
77
|
|
72
78
|
child_results = []
|
@@ -81,6 +87,7 @@ module Yasuri
|
|
81
87
|
|
82
88
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
83
89
|
page = Yasuri.with_retry(retry_count) { link_button.click }
|
90
|
+
break if (@limit -= 1) <= 0
|
84
91
|
end
|
85
92
|
|
86
93
|
child_results
|
@@ -101,19 +108,20 @@ module Yasuri
|
|
101
108
|
end
|
102
109
|
|
103
110
|
def self.gen(name, *args, &block)
|
104
|
-
xpath,
|
111
|
+
xpath, opt = *args
|
105
112
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
106
113
|
|
107
114
|
case name
|
108
115
|
when /^text_(.+)$/
|
109
|
-
truncate_regexp
|
110
|
-
Yasuri::TextNode.new(xpath, $1,
|
116
|
+
truncate_regexp = opt
|
117
|
+
Yasuri::TextNode.new(xpath, $1, truncate_regexp)
|
111
118
|
when /^struct_(.+)$/
|
112
119
|
Yasuri::StructNode.new(xpath, $1, children || [])
|
113
120
|
when /^links_(.+)$/
|
114
121
|
Yasuri::LinksNode.new(xpath, $1, children || [])
|
115
122
|
when /^pages_(.+)$/
|
116
|
-
|
123
|
+
limit = opt || Float::INFINITY
|
124
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit)
|
117
125
|
else
|
118
126
|
nil
|
119
127
|
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -278,6 +278,20 @@ describe 'Yasuri' do
|
|
278
278
|
expect(actual).to match expected
|
279
279
|
end
|
280
280
|
|
281
|
+
it "scrape each paginated pages limited" do
|
282
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
+
], 3)
|
285
|
+
actual = root_node.inject(@agent, @page)
|
286
|
+
expected = [
|
287
|
+
{"content" => "PaginationTest01"},
|
288
|
+
{"content" => "PaginationTest02"},
|
289
|
+
{"content" => "PaginationTest03"},
|
290
|
+
]
|
291
|
+
expect(actual).to match expected
|
292
|
+
end
|
293
|
+
|
294
|
+
|
281
295
|
it 'return first content if paginate link node is not found' do
|
282
296
|
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
283
297
|
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
@@ -304,7 +318,17 @@ describe 'Yasuri' do
|
|
304
318
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
305
319
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
306
320
|
])
|
307
|
-
|
321
|
+
compare_generated_vs_original(generated, original, @page)
|
322
|
+
end
|
323
|
+
|
324
|
+
it 'can be defined by DSL, return single PaginateNode content limited' do
|
325
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
326
|
+
text_content '/html/body/p'
|
327
|
+
end
|
328
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
+
], 2)
|
331
|
+
compare_generated_vs_original(generated, original, @page)
|
308
332
|
end
|
309
333
|
end
|
310
334
|
|