yasuri 2.0.11 → 2.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/USAGE.ja.md +35 -0
- data/USAGE.md +35 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri_links_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +1 -1
- data/lib/yasuri/yasuri_paginate_node.rb +10 -3
- data/lib/yasuri/yasuri_struct_node.rb +3 -3
- data/lib/yasuri/yasuri_text_node.rb +2 -2
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +2 -1
- data/spec/yasuri_struct_node_spec.rb +42 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4fed4a13bb125758515e3c0ced665b1ca3d20b6
|
4
|
+
data.tar.gz: e9dfb2ed6256a367db2e5b6a78d23fa097c422d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b9d6345f3f49b1f7d9445ce18bca736b8cbeedc69979a45d541b59af4e09092d7c1d12886801a24296e9e3d73f39a7c2d53a7c2de12e1a0ff890623b47cfe84
|
7
|
+
data.tar.gz: 6d755f266062052dd5244599deefefea85f7570c827a898e48eee22c44510dde287b0554ed2cae85e3b94b44fe4eb6f74b512c44047e6cc1bb43fe27a93143b0
|
data/USAGE.ja.md
CHANGED
@@ -431,3 +431,38 @@ node.inject(agent, page)
|
|
431
431
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
432
432
|
```
|
433
433
|
この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
|
434
|
+
|
435
|
+
##### `flatten`
|
436
|
+
取得した各ページの結果を展開します.
|
437
|
+
|
438
|
+
```ruby
|
439
|
+
agent = Mechanize.new
|
440
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
441
|
+
|
442
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
443
|
+
text_title '/html/head/title'
|
444
|
+
text_content '/html/body/p'
|
445
|
+
end
|
446
|
+
node.inject(agent, page)
|
447
|
+
|
448
|
+
#=> [ {"title" => "Page01",
|
449
|
+
"content" => "Pagination01"},
|
450
|
+
{"title" => "Page02",
|
451
|
+
"content" => "Pagination02"},
|
452
|
+
{"title" => "Page03",
|
453
|
+
"content" => "Pagination03"}]
|
454
|
+
|
455
|
+
|
456
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
457
|
+
text_title '/html/head/title'
|
458
|
+
text_content '/html/body/p'
|
459
|
+
end
|
460
|
+
node.inject(agent, page)
|
461
|
+
|
462
|
+
#=> [ "Page01",
|
463
|
+
"Pagination01",
|
464
|
+
"Page02",
|
465
|
+
"Pagination02",
|
466
|
+
"Page03",
|
467
|
+
"Pagination03"]
|
468
|
+
```
|
data/USAGE.md
CHANGED
@@ -429,3 +429,38 @@ node.inject(agent, page)
|
|
429
429
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
430
430
|
```
|
431
431
|
PaginateNode opens at most 2 pages, as specified by `limit`. In this example the pagination has 4 pages, but the result Array contains only 2 entries because `limit:2` was given.
|
432
|
+
|
433
|
+
##### `flatten`
|
434
|
+
The `flatten` option expands the results of each page into a single flat Array.
|
435
|
+
|
436
|
+
```ruby
|
437
|
+
agent = Mechanize.new
|
438
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
439
|
+
|
440
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
441
|
+
text_title '/html/head/title'
|
442
|
+
text_content '/html/body/p'
|
443
|
+
end
|
444
|
+
node.inject(agent, page)
|
445
|
+
|
446
|
+
#=> [ {"title" => "Page01",
|
447
|
+
"content" => "Pagination01"},
|
448
|
+
{"title" => "Page02",
|
449
|
+
"content" => "Pagination02"},
|
450
|
+
{"title" => "Page03",
|
451
|
+
"content" => "Pagination03"}]
|
452
|
+
|
453
|
+
|
454
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
455
|
+
text_title '/html/head/title'
|
456
|
+
text_content '/html/body/p'
|
457
|
+
end
|
458
|
+
node.inject(agent, page)
|
459
|
+
|
460
|
+
#=> [ "Page01",
|
461
|
+
"Pagination01",
|
462
|
+
"Page02",
|
463
|
+
"Pagination02",
|
464
|
+
"Page03",
|
465
|
+
"Pagination03"]
|
466
|
+
```
|
data/lib/yasuri/version.rb
CHANGED
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
10
|
retry_count = opt[:retry_count] || 5
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,14 +7,17 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], limit: nil)
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, flatten: false)
|
11
11
|
super(xpath, name, children)
|
12
|
+
@flatten = flatten
|
12
13
|
@limit = limit
|
13
14
|
end
|
14
15
|
|
15
|
-
def inject(agent, page, opt = {})
|
16
|
+
def inject(agent, page, opt = {}, element = page)
|
16
17
|
retry_count = opt[:retry_count] || 5
|
17
18
|
|
19
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
|
+
|
18
21
|
child_results = []
|
19
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
20
23
|
while page
|
@@ -32,10 +35,14 @@ module Yasuri
|
|
32
35
|
break if (limit -= 1) <= 0
|
33
36
|
end
|
34
37
|
|
38
|
+
if @flatten == true
|
39
|
+
return child_results.map{|h| h.values}.flatten
|
40
|
+
end
|
41
|
+
|
35
42
|
child_results
|
36
43
|
end
|
37
44
|
def opts
|
38
|
-
{limit:@limit}
|
45
|
+
{limit:@limit, flatten:@flatten}
|
39
46
|
end
|
40
47
|
end
|
41
48
|
end
|
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
13
|
child_name = Yasuri.NodeName(child_node.name, opt)
|
14
|
-
[child_name, child_node.inject(agent,
|
14
|
+
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>StructualLinksTest</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
|
7
|
+
<table>
|
8
|
+
<thead>
|
9
|
+
<tr>
|
10
|
+
<th>Title</th>
|
11
|
+
<th>Links</th>
|
12
|
+
</tr>
|
13
|
+
</thead>
|
14
|
+
<tr>
|
15
|
+
<td>Child01,02</td>
|
16
|
+
<td><a href="../child01.html">Child01</a></td>
|
17
|
+
<td><a href="../child02.html">Child02</a></td>
|
18
|
+
<td>../child02.html</td>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td>Child01,02,03</td>
|
23
|
+
<td><a href="../child01.html">Child01</a></td>
|
24
|
+
<td><a href="../child02.html">Child02</a></td>
|
25
|
+
<td><a href="../child03.html">Child03</a></td>
|
26
|
+
</tr>
|
27
|
+
</table>
|
28
|
+
|
29
|
+
</body>
|
30
|
+
</html>
|
File without changes
|
@@ -30,6 +30,49 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
+
it "scrape each paginated pages with flatten" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
+
Yasuri::TextNode.new('./a', "text"),
|
38
|
+
]),
|
39
|
+
], flatten: true)
|
40
|
+
actual = root_node.inject(@agent, @page)
|
41
|
+
expected = [
|
42
|
+
"PaginationTest01",
|
43
|
+
{"text"=>""},
|
44
|
+
{"text"=>""},
|
45
|
+
{"text" => "2"},
|
46
|
+
{"text" => "3"},
|
47
|
+
{"text" => "4"},
|
48
|
+
{"text"=>"NextPage »"},
|
49
|
+
"PaginationTest02",
|
50
|
+
{"text"=>"« PreviousPage"},
|
51
|
+
{"text" => "1"},
|
52
|
+
{"text"=>""},
|
53
|
+
{"text" => "3"},
|
54
|
+
{"text" => "4"},
|
55
|
+
{"text"=>"NextPage »"},
|
56
|
+
"PaginationTest03",
|
57
|
+
{"text"=>"« PreviousPage"},
|
58
|
+
{"text" => "1"},
|
59
|
+
{"text" => "2"},
|
60
|
+
{"text"=>""},
|
61
|
+
{"text" => "4"},
|
62
|
+
{"text"=>"NextPage »"},
|
63
|
+
"PaginationTest04",
|
64
|
+
{"text"=>"« PreviousPage"},
|
65
|
+
{"text" => "1"},
|
66
|
+
{"text" => "2"},
|
67
|
+
{"text" => "3"},
|
68
|
+
{"text"=>""},
|
69
|
+
{"text"=>""},
|
70
|
+
]
|
71
|
+
|
72
|
+
expect(actual).to match expected
|
73
|
+
end
|
74
|
+
|
75
|
+
|
33
76
|
it "scrape each paginated pages limited" do
|
34
77
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
78
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -126,7 +126,7 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
127
|
])
|
128
128
|
])
|
129
|
-
page = @agent.get(@uri + "/structual_text.html")
|
129
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
130
130
|
compare_generated_vs_original(generated, original, page)
|
131
131
|
end
|
132
132
|
end
|
@@ -193,6 +193,7 @@ describe 'Yasuri' do
|
|
193
193
|
"name" : "root",
|
194
194
|
"path" : "/html/body/nav/span/a[@class='next']",
|
195
195
|
"limit" : 10,
|
196
|
+
"flatten" : false,
|
196
197
|
"children" : [ { "node" : "text",
|
197
198
|
"name" : "content",
|
198
199
|
"path" : "/html/body/p"
|
@@ -12,7 +12,7 @@ describe 'Yasuri' do
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
14
|
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/structual_text.html")
|
15
|
+
@page = @agent.get(uri + "/struct/structual_text.html")
|
16
16
|
|
17
17
|
@table_1996 = [
|
18
18
|
{ "title" => "The Perfect Insider",
|
@@ -132,4 +132,45 @@ describe 'Yasuri' do
|
|
132
132
|
end
|
133
133
|
|
134
134
|
end
|
135
|
+
|
136
|
+
describe '::StructNode::Links' do
|
137
|
+
before do
|
138
|
+
@agent = Mechanize.new
|
139
|
+
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
+
|
141
|
+
@table = [
|
142
|
+
{ "title" => "Child01,02",
|
143
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
144
|
+
|
145
|
+
{ "title" => "Child01,02,03",
|
146
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
147
|
+
]
|
148
|
+
end
|
149
|
+
|
150
|
+
it 'return child node in links inside struct' do
|
151
|
+
node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
|
152
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
+
Yasuri::LinksNode.new('./td/a', "child", [
|
154
|
+
Yasuri::TextNode.new('/html/body/p', "p"),
|
155
|
+
])
|
156
|
+
])
|
157
|
+
expected = @table
|
158
|
+
actual = node.inject(@agent, @page)
|
159
|
+
expect(actual).to match expected
|
160
|
+
end
|
161
|
+
end # describe
|
162
|
+
|
163
|
+
describe '::StructNode::Pages' do
|
164
|
+
before do
|
165
|
+
@agent = Mechanize.new
|
166
|
+
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
+
end
|
168
|
+
|
169
|
+
it 'not supported' do
|
170
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
|
+
])
|
173
|
+
expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
|
+
end
|
175
|
+
end
|
135
176
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.11
|
4
|
+
version: 2.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-12-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -174,7 +174,8 @@ files:
|
|
174
174
|
- spec/htdocs/pagination/page02.html
|
175
175
|
- spec/htdocs/pagination/page03.html
|
176
176
|
- spec/htdocs/pagination/page04.html
|
177
|
-
- spec/htdocs/structual_text.html
|
177
|
+
- spec/htdocs/struct/structual_links.html
|
178
|
+
- spec/htdocs/struct/structual_text.html
|
178
179
|
- spec/servers/httpserver.rb
|
179
180
|
- spec/spec_helper.rb
|
180
181
|
- spec/yasuri_links_node_spec.rb
|
@@ -204,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
204
205
|
version: '0'
|
205
206
|
requirements: []
|
206
207
|
rubyforge_project:
|
207
|
-
rubygems_version: 2.
|
208
|
+
rubygems_version: 2.5.2
|
208
209
|
signing_key:
|
209
210
|
specification_version: 4
|
210
211
|
summary: Yasuri is easy scraping library.
|
@@ -220,7 +221,8 @@ test_files:
|
|
220
221
|
- spec/htdocs/pagination/page02.html
|
221
222
|
- spec/htdocs/pagination/page03.html
|
222
223
|
- spec/htdocs/pagination/page04.html
|
223
|
-
- spec/htdocs/structual_text.html
|
224
|
+
- spec/htdocs/struct/structual_links.html
|
225
|
+
- spec/htdocs/struct/structual_text.html
|
224
226
|
- spec/servers/httpserver.rb
|
225
227
|
- spec/spec_helper.rb
|
226
228
|
- spec/yasuri_links_node_spec.rb
|