yasuri 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +13 -9
- data/spec/yasuri_spec.rb +25 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61a2aa3974c697ecc14b991521961ec54f3ff5c0
|
4
|
+
data.tar.gz: 03cf5b02e7a646175183725d38d50a9538df7eed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2108d6b78c8704fa4d99491c1684dfd686c656d7bd039b8c80e52fa2150837958c25f37afabdcf198b10153b2f32216408735182fb0d58b3007f4808ac2226c9
|
7
|
+
data.tar.gz: c956e589ab7676e844110870e18c8f842afb6fd566a5ddd4758ec9e328961ff1b485bac9eebecdac6456f7ed28a9793d9dd4dd9bc2f89dd8f9131003c6db9a5d
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -10,7 +10,7 @@ module Yasuri
|
|
10
10
|
module Node
|
11
11
|
attr_reader :url, :xpath, :name
|
12
12
|
|
13
|
-
def initialize(xpath, name, children = [])
|
13
|
+
def initialize(xpath, name, children = [], opt: {})
|
14
14
|
@xpath, @name, @children = xpath, name, children
|
15
15
|
end
|
16
16
|
|
@@ -21,9 +21,9 @@ module Yasuri
|
|
21
21
|
|
22
22
|
class TextNode
|
23
23
|
include Node
|
24
|
-
def initialize(xpath, name, children = [])
|
24
|
+
def initialize(xpath, name, children = [], truncate_regexp: nil, opt: {})
|
25
25
|
super(xpath, name, children)
|
26
|
-
@truncate_regexp
|
26
|
+
@truncate_regexp = truncate_regexp
|
27
27
|
end
|
28
28
|
def inject(agent, page, retry_count = 5)
|
29
29
|
node = page.search(@xpath)
|
@@ -68,9 +68,9 @@ module Yasuri
|
|
68
68
|
class PaginateNode
|
69
69
|
include Node
|
70
70
|
|
71
|
-
def initialize(xpath, name, children = [], limit
|
71
|
+
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
72
72
|
super(xpath, name, children)
|
73
|
-
@limit = limit
|
73
|
+
@limit = limit || opt["limit"] || Float::MAX
|
74
74
|
end
|
75
75
|
|
76
76
|
def inject(agent, page, retry_count = 5)
|
@@ -120,8 +120,9 @@ module Yasuri
|
|
120
120
|
when /^links_(.+)$/
|
121
121
|
Yasuri::LinksNode.new(xpath, $1, children || [])
|
122
122
|
when /^pages_(.+)$/
|
123
|
-
limit =
|
124
|
-
|
123
|
+
xpath, limit = *args
|
124
|
+
limit = limit || Float::MAX
|
125
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
125
126
|
else
|
126
127
|
nil
|
127
128
|
end
|
@@ -146,16 +147,19 @@ module Yasuri
|
|
146
147
|
"links" => LinksNode,
|
147
148
|
"pages" => PaginateNode
|
148
149
|
}
|
150
|
+
ReservedKeys = %w|node name path children|
|
149
151
|
def self.hash2node(node_h)
|
150
|
-
node, name, path, children =
|
152
|
+
node, name, path, children = ReservedKeys.map do |key|
|
151
153
|
node_h[key]
|
152
154
|
end
|
153
155
|
children ||= []
|
154
156
|
|
155
157
|
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
158
|
+
ReservedKeys.each{|key| node_h.delete(key)}
|
159
|
+
opt = node_h
|
156
160
|
|
157
161
|
klass = Text2Node[node]
|
158
|
-
klass ? klass.new(path, name, childnodes) : nil
|
162
|
+
klass ? klass.new(path, name, childnodes, opt: opt) : nil
|
159
163
|
end
|
160
164
|
|
161
165
|
def self.with_retry(retry_count = 5)
|
data/spec/yasuri_spec.rb
CHANGED
@@ -54,19 +54,19 @@ describe 'Yasuri' do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it "can be truncated with regexp" do
|
57
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
57
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^[^,]+/
|
58
58
|
actual = node.inject(@agent, @index_page)
|
59
59
|
expect(actual).to eq "Hello"
|
60
60
|
end
|
61
61
|
|
62
62
|
it "can be truncated with regexp" do
|
63
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
63
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/[^,]+$/
|
64
64
|
actual = node.inject(@agent, @index_page)
|
65
65
|
expect(actual).to eq "Yasuri"
|
66
66
|
end
|
67
67
|
|
68
68
|
it "return empty string if truncated with no match to regexp" do
|
69
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
69
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^hoge/
|
70
70
|
actual = node.inject(@agent, @index_page)
|
71
71
|
expect(actual).to be_empty
|
72
72
|
end
|
@@ -281,7 +281,7 @@ describe 'Yasuri' do
|
|
281
281
|
it "scrape each paginated pages limited" do
|
282
282
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
283
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
-
], 3)
|
284
|
+
], limit:3)
|
285
285
|
actual = root_node.inject(@agent, @page)
|
286
286
|
expected = [
|
287
287
|
{"content" => "PaginationTest01"},
|
@@ -327,7 +327,7 @@ describe 'Yasuri' do
|
|
327
327
|
end
|
328
328
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
329
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
-
], 2)
|
330
|
+
], limit: 2)
|
331
331
|
compare_generated_vs_original(generated, original, @page)
|
332
332
|
end
|
333
333
|
end
|
@@ -386,6 +386,26 @@ describe 'Yasuri' do
|
|
386
386
|
compare_generated_vs_original(generated, original, paginate_test_page)
|
387
387
|
end
|
388
388
|
|
389
|
+
it "return PaginateNode/TextNode with limit" do
|
390
|
+
src = %q|{ "node" : "pages",
|
391
|
+
"name" : "root",
|
392
|
+
"path" : "/html/body/nav/span/a[@class=\'next\']",
|
393
|
+
"limit" : 2,
|
394
|
+
"children" : [ { "node" : "text",
|
395
|
+
"name" : "content",
|
396
|
+
"path" : "/html/body/p"
|
397
|
+
} ]
|
398
|
+
}|
|
399
|
+
generated = Yasuri.json2tree(src)
|
400
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
401
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
402
|
+
], limit:2)
|
403
|
+
|
404
|
+
paginate_test_uri = @uri + "/pagination/page01.html"
|
405
|
+
paginate_test_page = @agent.get(paginate_test_uri)
|
406
|
+
compare_generated_vs_original(generated, original, paginate_test_page)
|
407
|
+
end
|
408
|
+
|
389
409
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
390
410
|
src = %q| { "node" : "struct",
|
391
411
|
"name" : "tables",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|