yasuri 1.9.12 → 2.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/USAGE.ja.md +0 -35
- data/USAGE.md +0 -35
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +2 -3
- data/lib/yasuri/yasuri_links_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +2 -2
- data/lib/yasuri/yasuri_paginate_node.rb +4 -11
- data/lib/yasuri/yasuri_struct_node.rb +3 -3
- data/lib/yasuri/yasuri_text_node.rb +3 -6
- data/spec/htdocs/{struct/structual_text.html → structual_text.html} +0 -0
- data/spec/yasuri_paginate_node_spec.rb +0 -43
- data/spec/yasuri_spec.rb +3 -4
- data/spec/yasuri_struct_node_spec.rb +2 -43
- metadata +5 -7
- data/spec/htdocs/struct/structual_links.html +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
|
4
|
+
data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
|
7
|
+
data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
|
data/USAGE.ja.md
CHANGED
@@ -431,38 +431,3 @@ node.inject(agent, page)
|
|
431
431
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
432
432
|
```
|
433
433
|
この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
|
434
|
-
|
435
|
-
##### `flatten`
|
436
|
-
取得した各ページの結果を展開します.
|
437
|
-
|
438
|
-
```ruby
|
439
|
-
agent = Mechanize.new
|
440
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
441
|
-
|
442
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
443
|
-
text_title '/html/head/title'
|
444
|
-
text_content '/html/body/p'
|
445
|
-
end
|
446
|
-
node.inject(agent, page)
|
447
|
-
|
448
|
-
#=> [ {"title" => "Page01",
|
449
|
-
"content" => "Patination01"},
|
450
|
-
{"title" => "Page01",
|
451
|
-
"content" => "Patination02"},
|
452
|
-
{"title" => "Page01",
|
453
|
-
"content" => "Patination03"}]
|
454
|
-
|
455
|
-
|
456
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
457
|
-
text_title '/html/head/title'
|
458
|
-
text_content '/html/body/p'
|
459
|
-
end
|
460
|
-
node.inject(agent, page)
|
461
|
-
|
462
|
-
#=> [ "Page01",
|
463
|
-
"Patination01",
|
464
|
-
"Page02",
|
465
|
-
"Patination02",
|
466
|
-
"Page03",
|
467
|
-
"Patination03"]
|
468
|
-
```
|
data/USAGE.md
CHANGED
@@ -429,38 +429,3 @@ node.inject(agent, page)
|
|
429
429
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
430
430
|
```
|
431
431
|
Paginate Node open upto 2 given by `limit`. In this situation, pagination has 4 pages, but result Array has 2 texts because given `limit:2`.
|
432
|
-
|
433
|
-
##### `flatten`
|
434
|
-
`flatten` option expands each page results.
|
435
|
-
|
436
|
-
```ruby
|
437
|
-
agent = Mechanize.new
|
438
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
439
|
-
|
440
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
441
|
-
text_title '/html/head/title'
|
442
|
-
text_content '/html/body/p'
|
443
|
-
end
|
444
|
-
node.inject(agent, page)
|
445
|
-
|
446
|
-
#=> [ {"title" => "Page01",
|
447
|
-
"content" => "Patination01"},
|
448
|
-
{"title" => "Page01",
|
449
|
-
"content" => "Patination02"},
|
450
|
-
{"title" => "Page01",
|
451
|
-
"content" => "Patination03"}]
|
452
|
-
|
453
|
-
|
454
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
455
|
-
text_title '/html/head/title'
|
456
|
-
text_content '/html/body/p'
|
457
|
-
end
|
458
|
-
node.inject(agent, page)
|
459
|
-
|
460
|
-
#=> [ "Page01",
|
461
|
-
"Patination01",
|
462
|
-
"Page02",
|
463
|
-
"Patination02",
|
464
|
-
"Page03",
|
465
|
-
"Patination03"]
|
466
|
-
```
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -37,7 +37,7 @@ module Yasuri
|
|
37
37
|
}
|
38
38
|
Node2Text = Text2Node.invert
|
39
39
|
|
40
|
-
ReservedKeys =
|
40
|
+
ReservedKeys = %i|node name path children|
|
41
41
|
def self.hash2node(node_h)
|
42
42
|
node, name, path, children = ReservedKeys.map do |key|
|
43
43
|
node_h[key]
|
@@ -78,8 +78,7 @@ module Yasuri
|
|
78
78
|
json
|
79
79
|
end
|
80
80
|
|
81
|
-
def self.NodeName(name,
|
82
|
-
symbolize_names = hash[:symbolize_names] || false
|
81
|
+
def self.NodeName(name, symbolize_names:false)
|
83
82
|
symbolize_names ? name.to_sym : name
|
84
83
|
end
|
85
84
|
|
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {}
|
9
|
+
def inject(agent, page, opt = {})
|
10
10
|
retry_count = opt[:retry_count] || 5
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = page.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,11 +7,11 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
-
def inject(agent, page, opt = {}
|
14
|
+
def inject(agent, page, opt = {})
|
15
15
|
fail "#{Kernel.__method__} is not implemented."
|
16
16
|
end
|
17
17
|
def opts
|
@@ -7,17 +7,14 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], limit: nil)
|
11
11
|
super(xpath, name, children)
|
12
|
-
@limit =
|
13
|
-
@flatten = hash[:flatten] || false
|
12
|
+
@limit = limit
|
14
13
|
end
|
15
14
|
|
16
|
-
def inject(agent, page, opt = {}
|
15
|
+
def inject(agent, page, opt = {})
|
17
16
|
retry_count = opt[:retry_count] || 5
|
18
17
|
|
19
|
-
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
|
-
|
21
18
|
child_results = []
|
22
19
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
20
|
while page
|
@@ -35,14 +32,10 @@ module Yasuri
|
|
35
32
|
break if (limit -= 1) <= 0
|
36
33
|
end
|
37
34
|
|
38
|
-
if @flatten == true
|
39
|
-
return child_results.map{|h| h.values}.flatten
|
40
|
-
end
|
41
|
-
|
42
35
|
child_results
|
43
36
|
end
|
44
37
|
def opts
|
45
|
-
{limit:@limit
|
38
|
+
{limit:@limit}
|
46
39
|
end
|
47
40
|
end
|
48
41
|
end
|
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {}
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {})
|
10
|
+
sub_tags = page.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
13
|
child_name = Yasuri.NodeName(child_node.name, opt)
|
14
|
-
[child_name, child_node.inject(agent,
|
14
|
+
[child_name, child_node.inject(agent, sub_tag, opt)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
@@ -7,12 +7,9 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], truncate: nil, proc:nil)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
-
truncate = hash[:truncate]
|
14
|
-
proc = hash[:proc]
|
15
|
-
|
16
13
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
17
14
|
@truncate = truncate
|
18
15
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
@@ -21,8 +18,8 @@ module Yasuri
|
|
21
18
|
|
22
19
|
end
|
23
20
|
|
24
|
-
def inject(agent, page, opt = {}
|
25
|
-
node =
|
21
|
+
def inject(agent, page, opt = {})
|
22
|
+
node = page.search(@xpath)
|
26
23
|
text = node.text.to_s
|
27
24
|
|
28
25
|
if @truncate
|
File without changes
|
@@ -30,49 +30,6 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
-
it "scrape each paginated pages with flatten" do
|
34
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
-
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
-
Yasuri::TextNode.new('./a', "text"),
|
38
|
-
]),
|
39
|
-
], flatten: true)
|
40
|
-
actual = root_node.inject(@agent, @page)
|
41
|
-
expected = [
|
42
|
-
"PaginationTest01",
|
43
|
-
{"text"=>""},
|
44
|
-
{"text"=>""},
|
45
|
-
{"text" => "2"},
|
46
|
-
{"text" => "3"},
|
47
|
-
{"text" => "4"},
|
48
|
-
{"text"=>"NextPage »"},
|
49
|
-
"PaginationTest02",
|
50
|
-
{"text"=>"« PreviousPage"},
|
51
|
-
{"text" => "1"},
|
52
|
-
{"text"=>""},
|
53
|
-
{"text" => "3"},
|
54
|
-
{"text" => "4"},
|
55
|
-
{"text"=>"NextPage »"},
|
56
|
-
"PaginationTest03",
|
57
|
-
{"text"=>"« PreviousPage"},
|
58
|
-
{"text" => "1"},
|
59
|
-
{"text" => "2"},
|
60
|
-
{"text"=>""},
|
61
|
-
{"text" => "4"},
|
62
|
-
{"text"=>"NextPage »"},
|
63
|
-
"PaginationTest04",
|
64
|
-
{"text"=>"« PreviousPage"},
|
65
|
-
{"text" => "1"},
|
66
|
-
{"text" => "2"},
|
67
|
-
{"text" => "3"},
|
68
|
-
{"text"=>""},
|
69
|
-
{"text"=>""},
|
70
|
-
]
|
71
|
-
|
72
|
-
expect(actual).to match expected
|
73
|
-
end
|
74
|
-
|
75
|
-
|
76
33
|
it "scrape each paginated pages limited" do
|
77
34
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
78
35
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -39,7 +39,7 @@ describe 'Yasuri' do
|
|
39
39
|
"truncate" : "^[^,]+"
|
40
40
|
}|
|
41
41
|
generated = Yasuri.json2tree(src)
|
42
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "content",
|
42
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
43
|
compare_generated_vs_original(generated, original, @index_page)
|
44
44
|
end
|
45
45
|
|
@@ -126,7 +126,7 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
127
|
])
|
128
128
|
])
|
129
|
-
page = @agent.get(@uri + "/
|
129
|
+
page = @agent.get(@uri + "/structual_text.html")
|
130
130
|
compare_generated_vs_original(generated, original, page)
|
131
131
|
end
|
132
132
|
end
|
@@ -153,7 +153,7 @@ describe 'Yasuri' do
|
|
153
153
|
end
|
154
154
|
|
155
155
|
it "return text node with truncate_regexp" do
|
156
|
-
node = Yasuri::TextNode.new("/html/head/title", "title",
|
156
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
157
|
json = Yasuri.tree2json(node)
|
158
158
|
expected_str = %q| { "node": "text",
|
159
159
|
"name": "title",
|
@@ -193,7 +193,6 @@ describe 'Yasuri' do
|
|
193
193
|
"name" : "root",
|
194
194
|
"path" : "/html/body/nav/span/a[@class='next']",
|
195
195
|
"limit" : 10,
|
196
|
-
"flatten" : false,
|
197
196
|
"children" : [ { "node" : "text",
|
198
197
|
"name" : "content",
|
199
198
|
"path" : "/html/body/p"
|
@@ -12,7 +12,7 @@ describe 'Yasuri' do
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
14
|
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/
|
15
|
+
@page = @agent.get(uri + "/structual_text.html")
|
16
16
|
|
17
17
|
@table_1996 = [
|
18
18
|
{ "title" => "The Perfect Insider",
|
@@ -126,51 +126,10 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[1]', "title"),
|
127
127
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
128
|
])
|
129
|
-
expected = @table_1996.map{|h|
|
129
|
+
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
130
|
actual = node.inject(@agent, @page, symbolize_names:true)
|
131
131
|
expect(actual).to match expected
|
132
132
|
end
|
133
133
|
|
134
134
|
end
|
135
|
-
|
136
|
-
describe '::StructNode::Links' do
|
137
|
-
before do
|
138
|
-
@agent = Mechanize.new
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
141
|
-
@table = [
|
142
|
-
{ "title" => "Child01,02",
|
143
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
144
|
-
|
145
|
-
{ "title" => "Child01,02,03",
|
146
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
147
|
-
]
|
148
|
-
end
|
149
|
-
|
150
|
-
it 'return child node in links inside struct' do
|
151
|
-
node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
|
152
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
-
Yasuri::LinksNode.new('./td/a', "child", [
|
154
|
-
Yasuri::TextNode.new('/html/body/p', "p"),
|
155
|
-
])
|
156
|
-
])
|
157
|
-
expected = @table
|
158
|
-
actual = node.inject(@agent, @page)
|
159
|
-
expect(actual).to match expected
|
160
|
-
end
|
161
|
-
end # descrive
|
162
|
-
|
163
|
-
describe '::StructNode::Pages' do
|
164
|
-
before do
|
165
|
-
@agent = Mechanize.new
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
-
end
|
168
|
-
|
169
|
-
it 'not supported' do
|
170
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
|
-
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
|
-
])
|
173
|
-
expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
|
-
end
|
175
|
-
end
|
176
135
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -174,8 +174,7 @@ files:
|
|
174
174
|
- spec/htdocs/pagination/page02.html
|
175
175
|
- spec/htdocs/pagination/page03.html
|
176
176
|
- spec/htdocs/pagination/page04.html
|
177
|
-
- spec/htdocs/
|
178
|
-
- spec/htdocs/struct/structual_text.html
|
177
|
+
- spec/htdocs/structual_text.html
|
179
178
|
- spec/servers/httpserver.rb
|
180
179
|
- spec/spec_helper.rb
|
181
180
|
- spec/yasuri_links_node_spec.rb
|
@@ -205,7 +204,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
204
|
version: '0'
|
206
205
|
requirements: []
|
207
206
|
rubyforge_project:
|
208
|
-
rubygems_version: 2.5
|
207
|
+
rubygems_version: 2.4.5
|
209
208
|
signing_key:
|
210
209
|
specification_version: 4
|
211
210
|
summary: Yasuri is easy scraping library.
|
@@ -221,8 +220,7 @@ test_files:
|
|
221
220
|
- spec/htdocs/pagination/page02.html
|
222
221
|
- spec/htdocs/pagination/page03.html
|
223
222
|
- spec/htdocs/pagination/page04.html
|
224
|
-
- spec/htdocs/
|
225
|
-
- spec/htdocs/struct/structual_text.html
|
223
|
+
- spec/htdocs/structual_text.html
|
226
224
|
- spec/servers/httpserver.rb
|
227
225
|
- spec/spec_helper.rb
|
228
226
|
- spec/yasuri_links_node_spec.rb
|
@@ -1,30 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<head>
|
3
|
-
<title>StructualLinksTest</title>
|
4
|
-
</head>
|
5
|
-
<body>
|
6
|
-
|
7
|
-
<table>
|
8
|
-
<thead>
|
9
|
-
<tr>
|
10
|
-
<th>Title</th>
|
11
|
-
<th>Links</th>
|
12
|
-
</tr>
|
13
|
-
</thead>
|
14
|
-
<tr>
|
15
|
-
<td>Child01,02</td>
|
16
|
-
<td><a href="../child01.html">Child01</a></td>
|
17
|
-
<td><a href="../child02.html">Child02</a></td>
|
18
|
-
<td>../child02.html</td>
|
19
|
-
</tr>
|
20
|
-
|
21
|
-
<tr>
|
22
|
-
<td>Child01,02,03</td>
|
23
|
-
<td><a href="../child01.html">Child01</a></td>
|
24
|
-
<td><a href="../child02.html">Child02</a></td>
|
25
|
-
<td><a href="../child03.html">Child03</a></td>
|
26
|
-
</tr>
|
27
|
-
</table>
|
28
|
-
|
29
|
-
</body>
|
30
|
-
</html>
|