yasuri 2.0.11 → 2.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/USAGE.ja.md +35 -0
- data/USAGE.md +35 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri_links_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +1 -1
- data/lib/yasuri/yasuri_paginate_node.rb +10 -3
- data/lib/yasuri/yasuri_struct_node.rb +3 -3
- data/lib/yasuri/yasuri_text_node.rb +2 -2
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +2 -1
- data/spec/yasuri_struct_node_spec.rb +42 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4fed4a13bb125758515e3c0ced665b1ca3d20b6
|
4
|
+
data.tar.gz: e9dfb2ed6256a367db2e5b6a78d23fa097c422d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b9d6345f3f49b1f7d9445ce18bca736b8cbeedc69979a45d541b59af4e09092d7c1d12886801a24296e9e3d73f39a7c2d53a7c2de12e1a0ff890623b47cfe84
|
7
|
+
data.tar.gz: 6d755f266062052dd5244599deefefea85f7570c827a898e48eee22c44510dde287b0554ed2cae85e3b94b44fe4eb6f74b512c44047e6cc1bb43fe27a93143b0
|
data/USAGE.ja.md
CHANGED
@@ -431,3 +431,38 @@ node.inject(agent, page)
|
|
431
431
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
432
432
|
```
|
433
433
|
この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
|
434
|
+
|
435
|
+
##### `flatten`
|
436
|
+
取得した各ページの結果を展開します.
|
437
|
+
|
438
|
+
```ruby
|
439
|
+
agent = Mechanize.new
|
440
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
441
|
+
|
442
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:false do
|
443
|
+
text_title '/html/head/title'
|
444
|
+
text_content '/html/body/p'
|
445
|
+
end
|
446
|
+
node.inject(agent, page)
|
447
|
+
|
448
|
+
#=> [ {"title" => "Page01",
|
449
|
+
"content" => "Pagination01"},
|
450
|
+
{"title" => "Page02",
|
450
|
+
"content" => "Pagination02"},
|
452
|
+
{"title" => "Page03",
|
452
|
+
"content" => "Pagination03"}]
|
454
|
+
|
455
|
+
|
456
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
457
|
+
text_title '/html/head/title'
|
458
|
+
text_content '/html/body/p'
|
459
|
+
end
|
460
|
+
node.inject(agent, page)
|
461
|
+
|
462
|
+
#=> [ "Page01",
|
463
|
+
"Pagination01",
|
464
|
+
"Page02",
|
465
|
+
"Pagination02",
|
466
|
+
"Page03",
|
467
|
+
"Pagination03"]
|
468
|
+
```
|
data/USAGE.md
CHANGED
@@ -429,3 +429,38 @@ node.inject(agent, page)
|
|
429
429
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
430
430
|
```
|
431
431
|
PaginateNode opens at most the number of pages given by `limit` (here, 2). In this situation the pagination has 4 pages, but the result Array contains only 2 results because `limit:2` was given.
|
432
|
+
|
433
|
+
##### `flatten`
|
434
|
+
The `flatten` option expands the results of each page into a single flat array of values.
|
435
|
+
|
436
|
+
```ruby
|
437
|
+
agent = Mechanize.new
|
438
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
439
|
+
|
440
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:false do
|
441
|
+
text_title '/html/head/title'
|
442
|
+
text_content '/html/body/p'
|
443
|
+
end
|
444
|
+
node.inject(agent, page)
|
445
|
+
|
446
|
+
#=> [ {"title" => "Page01",
|
447
|
+
"content" => "Pagination01"},
|
448
|
+
{"title" => "Page02",
|
448
|
+
"content" => "Pagination02"},
|
450
|
+
{"title" => "Page03",
|
450
|
+
"content" => "Pagination03"}]
|
452
|
+
|
453
|
+
|
454
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
455
|
+
text_title '/html/head/title'
|
456
|
+
text_content '/html/body/p'
|
457
|
+
end
|
458
|
+
node.inject(agent, page)
|
459
|
+
|
460
|
+
#=> [ "Page01",
|
461
|
+
"Pagination01",
|
462
|
+
"Page02",
|
463
|
+
"Pagination02",
|
464
|
+
"Page03",
|
465
|
+
"Pagination03"]
|
466
|
+
```
|
data/lib/yasuri/version.rb
CHANGED
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
10
|
retry_count = opt[:retry_count] || 5
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,14 +7,17 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], limit: nil)
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, flatten: false)
|
11
11
|
super(xpath, name, children)
|
12
|
+
@flatten = flatten
|
12
13
|
@limit = limit
|
13
14
|
end
|
14
15
|
|
15
|
-
def inject(agent, page, opt = {})
|
16
|
+
def inject(agent, page, opt = {}, element = page)
|
16
17
|
retry_count = opt[:retry_count] || 5
|
17
18
|
|
19
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
|
+
|
18
21
|
child_results = []
|
19
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
20
23
|
while page
|
@@ -32,10 +35,14 @@ module Yasuri
|
|
32
35
|
break if (limit -= 1) <= 0
|
33
36
|
end
|
34
37
|
|
38
|
+
if @flatten == true
|
39
|
+
return child_results.map{|h| h.values}.flatten
|
40
|
+
end
|
41
|
+
|
35
42
|
child_results
|
36
43
|
end
|
37
44
|
def opts
|
38
|
-
{limit:@limit}
|
45
|
+
{limit:@limit, flatten:@flatten}
|
39
46
|
end
|
40
47
|
end
|
41
48
|
end
|
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
13
|
child_name = Yasuri.NodeName(child_node.name, opt)
|
14
|
-
[child_name, child_node.inject(agent,
|
14
|
+
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>StructualLinksTest</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
|
7
|
+
<table>
|
8
|
+
<thead>
|
9
|
+
<tr>
|
10
|
+
<th>Title</th>
|
11
|
+
<th>Links</th>
|
12
|
+
</tr>
|
13
|
+
</thead>
|
14
|
+
<tr>
|
15
|
+
<td>Child01,02</td>
|
16
|
+
<td><a href="../child01.html">Child01</a></td>
|
17
|
+
<td><a href="../child02.html">Child02</a></td>
|
18
|
+
<td>../child02.html</td>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td>Child01,02,03</td>
|
23
|
+
<td><a href="../child01.html">Child01</a></td>
|
24
|
+
<td><a href="../child02.html">Child02</a></td>
|
25
|
+
<td><a href="../child03.html">Child03</a></td>
|
26
|
+
</tr>
|
27
|
+
</table>
|
28
|
+
|
29
|
+
</body>
|
30
|
+
</html>
|
File without changes
|
@@ -30,6 +30,49 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
+
it "scrape each paginated pages with flatten" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
+
Yasuri::TextNode.new('./a', "text"),
|
38
|
+
]),
|
39
|
+
], flatten: true)
|
40
|
+
actual = root_node.inject(@agent, @page)
|
41
|
+
expected = [
|
42
|
+
"PaginationTest01",
|
43
|
+
{"text"=>""},
|
44
|
+
{"text"=>""},
|
45
|
+
{"text" => "2"},
|
46
|
+
{"text" => "3"},
|
47
|
+
{"text" => "4"},
|
48
|
+
{"text"=>"NextPage »"},
|
49
|
+
"PaginationTest02",
|
50
|
+
{"text"=>"« PreviousPage"},
|
51
|
+
{"text" => "1"},
|
52
|
+
{"text"=>""},
|
53
|
+
{"text" => "3"},
|
54
|
+
{"text" => "4"},
|
55
|
+
{"text"=>"NextPage »"},
|
56
|
+
"PaginationTest03",
|
57
|
+
{"text"=>"« PreviousPage"},
|
58
|
+
{"text" => "1"},
|
59
|
+
{"text" => "2"},
|
60
|
+
{"text"=>""},
|
61
|
+
{"text" => "4"},
|
62
|
+
{"text"=>"NextPage »"},
|
63
|
+
"PaginationTest04",
|
64
|
+
{"text"=>"« PreviousPage"},
|
65
|
+
{"text" => "1"},
|
66
|
+
{"text" => "2"},
|
67
|
+
{"text" => "3"},
|
68
|
+
{"text"=>""},
|
69
|
+
{"text"=>""},
|
70
|
+
]
|
71
|
+
|
72
|
+
expect(actual).to match expected
|
73
|
+
end
|
74
|
+
|
75
|
+
|
33
76
|
it "scrape each paginated pages limited" do
|
34
77
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
78
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -126,7 +126,7 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
127
|
])
|
128
128
|
])
|
129
|
-
page = @agent.get(@uri + "/structual_text.html")
|
129
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
130
130
|
compare_generated_vs_original(generated, original, page)
|
131
131
|
end
|
132
132
|
end
|
@@ -193,6 +193,7 @@ describe 'Yasuri' do
|
|
193
193
|
"name" : "root",
|
194
194
|
"path" : "/html/body/nav/span/a[@class='next']",
|
195
195
|
"limit" : 10,
|
196
|
+
"flatten" : false,
|
196
197
|
"children" : [ { "node" : "text",
|
197
198
|
"name" : "content",
|
198
199
|
"path" : "/html/body/p"
|
@@ -12,7 +12,7 @@ describe 'Yasuri' do
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
14
|
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/structual_text.html")
|
15
|
+
@page = @agent.get(uri + "/struct/structual_text.html")
|
16
16
|
|
17
17
|
@table_1996 = [
|
18
18
|
{ "title" => "The Perfect Insider",
|
@@ -132,4 +132,45 @@ describe 'Yasuri' do
|
|
132
132
|
end
|
133
133
|
|
134
134
|
end
|
135
|
+
|
136
|
+
describe '::StructNode::Links' do
|
137
|
+
before do
|
138
|
+
@agent = Mechanize.new
|
139
|
+
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
+
|
141
|
+
@table = [
|
142
|
+
{ "title" => "Child01,02",
|
143
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
144
|
+
|
145
|
+
{ "title" => "Child01,02,03",
|
146
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
147
|
+
]
|
148
|
+
end
|
149
|
+
|
150
|
+
it 'return child node in links inside struct' do
|
151
|
+
node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
|
152
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
+
Yasuri::LinksNode.new('./td/a', "child", [
|
154
|
+
Yasuri::TextNode.new('/html/body/p', "p"),
|
155
|
+
])
|
156
|
+
])
|
157
|
+
expected = @table
|
158
|
+
actual = node.inject(@agent, @page)
|
159
|
+
expect(actual).to match expected
|
160
|
+
end
|
161
|
+
end # describe
|
162
|
+
|
163
|
+
describe '::StructNode::Pages' do
|
164
|
+
before do
|
165
|
+
@agent = Mechanize.new
|
166
|
+
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
+
end
|
168
|
+
|
169
|
+
it 'not supported' do
|
170
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
|
+
])
|
173
|
+
expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
|
+
end
|
175
|
+
end
|
135
176
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-12-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -174,7 +174,8 @@ files:
|
|
174
174
|
- spec/htdocs/pagination/page02.html
|
175
175
|
- spec/htdocs/pagination/page03.html
|
176
176
|
- spec/htdocs/pagination/page04.html
|
177
|
-
- spec/htdocs/
|
177
|
+
- spec/htdocs/struct/structual_links.html
|
178
|
+
- spec/htdocs/struct/structual_text.html
|
178
179
|
- spec/servers/httpserver.rb
|
179
180
|
- spec/spec_helper.rb
|
180
181
|
- spec/yasuri_links_node_spec.rb
|
@@ -204,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
204
205
|
version: '0'
|
205
206
|
requirements: []
|
206
207
|
rubyforge_project:
|
207
|
-
rubygems_version: 2.
|
208
|
+
rubygems_version: 2.5.2
|
208
209
|
signing_key:
|
209
210
|
specification_version: 4
|
210
211
|
summary: Yasuri is easy scraping library.
|
@@ -220,7 +221,8 @@ test_files:
|
|
220
221
|
- spec/htdocs/pagination/page02.html
|
221
222
|
- spec/htdocs/pagination/page03.html
|
222
223
|
- spec/htdocs/pagination/page04.html
|
223
|
-
- spec/htdocs/
|
224
|
+
- spec/htdocs/struct/structual_links.html
|
225
|
+
- spec/htdocs/struct/structual_text.html
|
224
226
|
- spec/servers/httpserver.rb
|
225
227
|
- spec/spec_helper.rb
|
226
228
|
- spec/yasuri_links_node_spec.rb
|