yasuri 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +13 -9
- data/spec/yasuri_spec.rb +25 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61a2aa3974c697ecc14b991521961ec54f3ff5c0
|
4
|
+
data.tar.gz: 03cf5b02e7a646175183725d38d50a9538df7eed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2108d6b78c8704fa4d99491c1684dfd686c656d7bd039b8c80e52fa2150837958c25f37afabdcf198b10153b2f32216408735182fb0d58b3007f4808ac2226c9
|
7
|
+
data.tar.gz: c956e589ab7676e844110870e18c8f842afb6fd566a5ddd4758ec9e328961ff1b485bac9eebecdac6456f7ed28a9793d9dd4dd9bc2f89dd8f9131003c6db9a5d
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -10,7 +10,7 @@ module Yasuri
|
|
10
10
|
module Node
|
11
11
|
attr_reader :url, :xpath, :name
|
12
12
|
|
13
|
-
def initialize(xpath, name, children = [])
|
13
|
+
def initialize(xpath, name, children = [], opt: {})
|
14
14
|
@xpath, @name, @children = xpath, name, children
|
15
15
|
end
|
16
16
|
|
@@ -21,9 +21,9 @@ module Yasuri
|
|
21
21
|
|
22
22
|
class TextNode
|
23
23
|
include Node
|
24
|
-
def initialize(xpath, name, children = [])
|
24
|
+
def initialize(xpath, name, children = [], truncate_regexp: nil, opt: {})
|
25
25
|
super(xpath, name, children)
|
26
|
-
@truncate_regexp
|
26
|
+
@truncate_regexp = truncate_regexp
|
27
27
|
end
|
28
28
|
def inject(agent, page, retry_count = 5)
|
29
29
|
node = page.search(@xpath)
|
@@ -68,9 +68,9 @@ module Yasuri
|
|
68
68
|
class PaginateNode
|
69
69
|
include Node
|
70
70
|
|
71
|
-
def initialize(xpath, name, children = [], limit
|
71
|
+
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
72
72
|
super(xpath, name, children)
|
73
|
-
@limit = limit
|
73
|
+
@limit = limit || opt["limit"] || Float::MAX
|
74
74
|
end
|
75
75
|
|
76
76
|
def inject(agent, page, retry_count = 5)
|
@@ -120,8 +120,9 @@ module Yasuri
|
|
120
120
|
when /^links_(.+)$/
|
121
121
|
Yasuri::LinksNode.new(xpath, $1, children || [])
|
122
122
|
when /^pages_(.+)$/
|
123
|
-
limit =
|
124
|
-
|
123
|
+
xpath, limit = *args
|
124
|
+
limit = limit || Float::MAX
|
125
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
125
126
|
else
|
126
127
|
nil
|
127
128
|
end
|
@@ -146,16 +147,19 @@ module Yasuri
|
|
146
147
|
"links" => LinksNode,
|
147
148
|
"pages" => PaginateNode
|
148
149
|
}
|
150
|
+
ReservedKeys = %w|node name path children|
|
149
151
|
def self.hash2node(node_h)
|
150
|
-
node, name, path, children = %w|node name path children|.map do |key|
|
152
|
+
node, name, path, children = ReservedKeys.map do |key|
|
151
153
|
node_h[key]
|
152
154
|
end
|
153
155
|
children ||= []
|
154
156
|
|
155
157
|
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
158
|
+
ReservedKeys.each{|key| node_h.delete(key)}
|
159
|
+
opt = node_h
|
156
160
|
|
157
161
|
klass = Text2Node[node]
|
158
|
-
klass ? klass.new(path, name, childnodes) : nil
|
162
|
+
klass ? klass.new(path, name, childnodes, opt: opt) : nil
|
159
163
|
end
|
160
164
|
|
161
165
|
def self.with_retry(retry_count = 5)
|
data/spec/yasuri_spec.rb
CHANGED
@@ -54,19 +54,19 @@ describe 'Yasuri' do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it "can be truncated with regexp" do
|
57
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
57
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^[^,]+/
|
58
58
|
actual = node.inject(@agent, @index_page)
|
59
59
|
expect(actual).to eq "Hello"
|
60
60
|
end
|
61
61
|
|
62
62
|
it "can be truncated with regexp" do
|
63
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
63
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/[^,]+$/
|
64
64
|
actual = node.inject(@agent, @index_page)
|
65
65
|
expect(actual).to eq "Yasuri"
|
66
66
|
end
|
67
67
|
|
68
68
|
it "return empty string if truncated with no match to regexp" do
|
69
|
-
node = Yasuri.text_title '/html/body/p[1]',
|
69
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^hoge/
|
70
70
|
actual = node.inject(@agent, @index_page)
|
71
71
|
expect(actual).to be_empty
|
72
72
|
end
|
@@ -281,7 +281,7 @@ describe 'Yasuri' do
|
|
281
281
|
it "scrape each paginated pages limited" do
|
282
282
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
283
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
-
], 3)
|
284
|
+
], limit:3)
|
285
285
|
actual = root_node.inject(@agent, @page)
|
286
286
|
expected = [
|
287
287
|
{"content" => "PaginationTest01"},
|
@@ -327,7 +327,7 @@ describe 'Yasuri' do
|
|
327
327
|
end
|
328
328
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
329
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
-
], 2)
|
330
|
+
], limit: 2)
|
331
331
|
compare_generated_vs_original(generated, original, @page)
|
332
332
|
end
|
333
333
|
end
|
@@ -386,6 +386,26 @@ describe 'Yasuri' do
|
|
386
386
|
compare_generated_vs_original(generated, original, paginate_test_page)
|
387
387
|
end
|
388
388
|
|
389
|
+
it "return PaginateNode/TextNode with limit" do
|
390
|
+
src = %q|{ "node" : "pages",
|
391
|
+
"name" : "root",
|
392
|
+
"path" : "/html/body/nav/span/a[@class=\'next\']",
|
393
|
+
"limit" : 2,
|
394
|
+
"children" : [ { "node" : "text",
|
395
|
+
"name" : "content",
|
396
|
+
"path" : "/html/body/p"
|
397
|
+
} ]
|
398
|
+
}|
|
399
|
+
generated = Yasuri.json2tree(src)
|
400
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
401
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
402
|
+
], limit:2)
|
403
|
+
|
404
|
+
paginate_test_uri = @uri + "/pagination/page01.html"
|
405
|
+
paginate_test_page = @agent.get(paginate_test_uri)
|
406
|
+
compare_generated_vs_original(generated, original, paginate_test_page)
|
407
|
+
end
|
408
|
+
|
389
409
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
390
410
|
src = %q| { "node" : "struct",
|
391
411
|
"name" : "tables",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|