yasuri 1.9.12 → 2.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/USAGE.ja.md +0 -35
- data/USAGE.md +0 -35
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +2 -3
- data/lib/yasuri/yasuri_links_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +2 -2
- data/lib/yasuri/yasuri_paginate_node.rb +4 -11
- data/lib/yasuri/yasuri_struct_node.rb +3 -3
- data/lib/yasuri/yasuri_text_node.rb +3 -6
- data/spec/htdocs/{struct/structual_text.html → structual_text.html} +0 -0
- data/spec/yasuri_paginate_node_spec.rb +0 -43
- data/spec/yasuri_spec.rb +3 -4
- data/spec/yasuri_struct_node_spec.rb +2 -43
- metadata +5 -7
- data/spec/htdocs/struct/structual_links.html +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
|
4
|
+
data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
|
7
|
+
data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
|
data/USAGE.ja.md
CHANGED
@@ -431,38 +431,3 @@ node.inject(agent, page)
|
|
431
431
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
432
432
|
```
|
433
433
|
この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
|
434
|
-
|
435
|
-
##### `flatten`
|
436
|
-
取得した各ページの結果を展開します.
|
437
|
-
|
438
|
-
```ruby
|
439
|
-
agent = Mechanize.new
|
440
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
441
|
-
|
442
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
443
|
-
text_title '/html/head/title'
|
444
|
-
text_content '/html/body/p'
|
445
|
-
end
|
446
|
-
node.inject(agent, page)
|
447
|
-
|
448
|
-
#=> [ {"title" => "Page01",
|
449
|
-
"content" => "Patination01"},
|
450
|
-
{"title" => "Page01",
|
451
|
-
"content" => "Patination02"},
|
452
|
-
{"title" => "Page01",
|
453
|
-
"content" => "Patination03"}]
|
454
|
-
|
455
|
-
|
456
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
457
|
-
text_title '/html/head/title'
|
458
|
-
text_content '/html/body/p'
|
459
|
-
end
|
460
|
-
node.inject(agent, page)
|
461
|
-
|
462
|
-
#=> [ "Page01",
|
463
|
-
"Patination01",
|
464
|
-
"Page02",
|
465
|
-
"Patination02",
|
466
|
-
"Page03",
|
467
|
-
"Patination03"]
|
468
|
-
```
|
data/USAGE.md
CHANGED
@@ -429,38 +429,3 @@ node.inject(agent, page)
|
|
429
429
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
430
430
|
```
|
431
431
|
PaginateNode opens at most the number of pages given by `limit` (2 in this example). In this situation the pagination has 4 pages, but the result Array contains only 2 entries because `limit:2` was given.
|
432
|
-
|
433
|
-
##### `flatten`
|
434
|
-
The `flatten` option expands each page's results.
|
435
|
-
|
436
|
-
```ruby
|
437
|
-
agent = Mechanize.new
|
438
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
439
|
-
|
440
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
441
|
-
text_title '/html/head/title'
|
442
|
-
text_content '/html/body/p'
|
443
|
-
end
|
444
|
-
node.inject(agent, page)
|
445
|
-
|
446
|
-
#=> [ {"title" => "Page01",
|
447
|
-
"content" => "Patination01"},
|
448
|
-
{"title" => "Page01",
|
449
|
-
"content" => "Patination02"},
|
450
|
-
{"title" => "Page01",
|
451
|
-
"content" => "Patination03"}]
|
452
|
-
|
453
|
-
|
454
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
455
|
-
text_title '/html/head/title'
|
456
|
-
text_content '/html/body/p'
|
457
|
-
end
|
458
|
-
node.inject(agent, page)
|
459
|
-
|
460
|
-
#=> [ "Page01",
|
461
|
-
"Patination01",
|
462
|
-
"Page02",
|
463
|
-
"Patination02",
|
464
|
-
"Page03",
|
465
|
-
"Patination03"]
|
466
|
-
```
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -37,7 +37,7 @@ module Yasuri
|
|
37
37
|
}
|
38
38
|
Node2Text = Text2Node.invert
|
39
39
|
|
40
|
-
ReservedKeys =
|
40
|
+
ReservedKeys = %i|node name path children|
|
41
41
|
def self.hash2node(node_h)
|
42
42
|
node, name, path, children = ReservedKeys.map do |key|
|
43
43
|
node_h[key]
|
@@ -78,8 +78,7 @@ module Yasuri
|
|
78
78
|
json
|
79
79
|
end
|
80
80
|
|
81
|
-
def self.NodeName(name,
|
82
|
-
symbolize_names = hash[:symbolize_names] || false
|
81
|
+
def self.NodeName(name, symbolize_names:false)
|
83
82
|
symbolize_names ? name.to_sym : name
|
84
83
|
end
|
85
84
|
|
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {}
|
9
|
+
def inject(agent, page, opt = {})
|
10
10
|
retry_count = opt[:retry_count] || 5
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = page.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,11 +7,11 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
-
def inject(agent, page, opt = {}
|
14
|
+
def inject(agent, page, opt = {})
|
15
15
|
fail "#{Kernel.__method__} is not implemented."
|
16
16
|
end
|
17
17
|
def opts
|
@@ -7,17 +7,14 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], limit: nil)
|
11
11
|
super(xpath, name, children)
|
12
|
-
@limit =
|
13
|
-
@flatten = hash[:flatten] || false
|
12
|
+
@limit = limit
|
14
13
|
end
|
15
14
|
|
16
|
-
def inject(agent, page, opt = {}
|
15
|
+
def inject(agent, page, opt = {})
|
17
16
|
retry_count = opt[:retry_count] || 5
|
18
17
|
|
19
|
-
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
|
-
|
21
18
|
child_results = []
|
22
19
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
20
|
while page
|
@@ -35,14 +32,10 @@ module Yasuri
|
|
35
32
|
break if (limit -= 1) <= 0
|
36
33
|
end
|
37
34
|
|
38
|
-
if @flatten == true
|
39
|
-
return child_results.map{|h| h.values}.flatten
|
40
|
-
end
|
41
|
-
|
42
35
|
child_results
|
43
36
|
end
|
44
37
|
def opts
|
45
|
-
{limit:@limit
|
38
|
+
{limit:@limit}
|
46
39
|
end
|
47
40
|
end
|
48
41
|
end
|
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {}
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {})
|
10
|
+
sub_tags = page.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
13
|
child_name = Yasuri.NodeName(child_node.name, opt)
|
14
|
-
[child_name, child_node.inject(agent,
|
14
|
+
[child_name, child_node.inject(agent, sub_tag, opt)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
@@ -7,12 +7,9 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], truncate: nil, proc:nil)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
-
truncate = hash[:truncate]
|
14
|
-
proc = hash[:proc]
|
15
|
-
|
16
13
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
17
14
|
@truncate = truncate
|
18
15
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
@@ -21,8 +18,8 @@ module Yasuri
|
|
21
18
|
|
22
19
|
end
|
23
20
|
|
24
|
-
def inject(agent, page, opt = {}
|
25
|
-
node =
|
21
|
+
def inject(agent, page, opt = {})
|
22
|
+
node = page.search(@xpath)
|
26
23
|
text = node.text.to_s
|
27
24
|
|
28
25
|
if @truncate
|
File without changes
|
@@ -30,49 +30,6 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
-
it "scrape each paginated pages with flatten" do
|
34
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
-
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
-
Yasuri::TextNode.new('./a', "text"),
|
38
|
-
]),
|
39
|
-
], flatten: true)
|
40
|
-
actual = root_node.inject(@agent, @page)
|
41
|
-
expected = [
|
42
|
-
"PaginationTest01",
|
43
|
-
{"text"=>""},
|
44
|
-
{"text"=>""},
|
45
|
-
{"text" => "2"},
|
46
|
-
{"text" => "3"},
|
47
|
-
{"text" => "4"},
|
48
|
-
{"text"=>"NextPage »"},
|
49
|
-
"PaginationTest02",
|
50
|
-
{"text"=>"« PreviousPage"},
|
51
|
-
{"text" => "1"},
|
52
|
-
{"text"=>""},
|
53
|
-
{"text" => "3"},
|
54
|
-
{"text" => "4"},
|
55
|
-
{"text"=>"NextPage »"},
|
56
|
-
"PaginationTest03",
|
57
|
-
{"text"=>"« PreviousPage"},
|
58
|
-
{"text" => "1"},
|
59
|
-
{"text" => "2"},
|
60
|
-
{"text"=>""},
|
61
|
-
{"text" => "4"},
|
62
|
-
{"text"=>"NextPage »"},
|
63
|
-
"PaginationTest04",
|
64
|
-
{"text"=>"« PreviousPage"},
|
65
|
-
{"text" => "1"},
|
66
|
-
{"text" => "2"},
|
67
|
-
{"text" => "3"},
|
68
|
-
{"text"=>""},
|
69
|
-
{"text"=>""},
|
70
|
-
]
|
71
|
-
|
72
|
-
expect(actual).to match expected
|
73
|
-
end
|
74
|
-
|
75
|
-
|
76
33
|
it "scrape each paginated pages limited" do
|
77
34
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
78
35
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -39,7 +39,7 @@ describe 'Yasuri' do
|
|
39
39
|
"truncate" : "^[^,]+"
|
40
40
|
}|
|
41
41
|
generated = Yasuri.json2tree(src)
|
42
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "content",
|
42
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
43
|
compare_generated_vs_original(generated, original, @index_page)
|
44
44
|
end
|
45
45
|
|
@@ -126,7 +126,7 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
127
|
])
|
128
128
|
])
|
129
|
-
page = @agent.get(@uri + "/
|
129
|
+
page = @agent.get(@uri + "/structual_text.html")
|
130
130
|
compare_generated_vs_original(generated, original, page)
|
131
131
|
end
|
132
132
|
end
|
@@ -153,7 +153,7 @@ describe 'Yasuri' do
|
|
153
153
|
end
|
154
154
|
|
155
155
|
it "return text node with truncate_regexp" do
|
156
|
-
node = Yasuri::TextNode.new("/html/head/title", "title",
|
156
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
157
|
json = Yasuri.tree2json(node)
|
158
158
|
expected_str = %q| { "node": "text",
|
159
159
|
"name": "title",
|
@@ -193,7 +193,6 @@ describe 'Yasuri' do
|
|
193
193
|
"name" : "root",
|
194
194
|
"path" : "/html/body/nav/span/a[@class='next']",
|
195
195
|
"limit" : 10,
|
196
|
-
"flatten" : false,
|
197
196
|
"children" : [ { "node" : "text",
|
198
197
|
"name" : "content",
|
199
198
|
"path" : "/html/body/p"
|
@@ -12,7 +12,7 @@ describe 'Yasuri' do
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
14
|
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/
|
15
|
+
@page = @agent.get(uri + "/structual_text.html")
|
16
16
|
|
17
17
|
@table_1996 = [
|
18
18
|
{ "title" => "The Perfect Insider",
|
@@ -126,51 +126,10 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[1]', "title"),
|
127
127
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
128
|
])
|
129
|
-
expected = @table_1996.map{|h|
|
129
|
+
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
130
|
actual = node.inject(@agent, @page, symbolize_names:true)
|
131
131
|
expect(actual).to match expected
|
132
132
|
end
|
133
133
|
|
134
134
|
end
|
135
|
-
|
136
|
-
describe '::StructNode::Links' do
|
137
|
-
before do
|
138
|
-
@agent = Mechanize.new
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
141
|
-
@table = [
|
142
|
-
{ "title" => "Child01,02",
|
143
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
144
|
-
|
145
|
-
{ "title" => "Child01,02,03",
|
146
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
147
|
-
]
|
148
|
-
end
|
149
|
-
|
150
|
-
it 'return child node in links inside struct' do
|
151
|
-
node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
|
152
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
-
Yasuri::LinksNode.new('./td/a', "child", [
|
154
|
-
Yasuri::TextNode.new('/html/body/p', "p"),
|
155
|
-
])
|
156
|
-
])
|
157
|
-
expected = @table
|
158
|
-
actual = node.inject(@agent, @page)
|
159
|
-
expect(actual).to match expected
|
160
|
-
end
|
161
|
-
end # describe
|
162
|
-
|
163
|
-
describe '::StructNode::Pages' do
|
164
|
-
before do
|
165
|
-
@agent = Mechanize.new
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
-
end
|
168
|
-
|
169
|
-
it 'not supported' do
|
170
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
|
-
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
|
-
])
|
173
|
-
expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
|
-
end
|
175
|
-
end
|
176
135
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -174,8 +174,7 @@ files:
|
|
174
174
|
- spec/htdocs/pagination/page02.html
|
175
175
|
- spec/htdocs/pagination/page03.html
|
176
176
|
- spec/htdocs/pagination/page04.html
|
177
|
-
- spec/htdocs/
|
178
|
-
- spec/htdocs/struct/structual_text.html
|
177
|
+
- spec/htdocs/structual_text.html
|
179
178
|
- spec/servers/httpserver.rb
|
180
179
|
- spec/spec_helper.rb
|
181
180
|
- spec/yasuri_links_node_spec.rb
|
@@ -205,7 +204,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
204
|
version: '0'
|
206
205
|
requirements: []
|
207
206
|
rubyforge_project:
|
208
|
-
rubygems_version: 2.5
|
207
|
+
rubygems_version: 2.4.5
|
209
208
|
signing_key:
|
210
209
|
specification_version: 4
|
211
210
|
summary: Yasuri is easy scraping library.
|
@@ -221,8 +220,7 @@ test_files:
|
|
221
220
|
- spec/htdocs/pagination/page02.html
|
222
221
|
- spec/htdocs/pagination/page03.html
|
223
222
|
- spec/htdocs/pagination/page04.html
|
224
|
-
- spec/htdocs/
|
225
|
-
- spec/htdocs/struct/structual_text.html
|
223
|
+
- spec/htdocs/structual_text.html
|
226
224
|
- spec/servers/httpserver.rb
|
227
225
|
- spec/spec_helper.rb
|
228
226
|
- spec/yasuri_links_node_spec.rb
|
@@ -1,30 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<head>
|
3
|
-
<title>StructualLinksTest</title>
|
4
|
-
</head>
|
5
|
-
<body>
|
6
|
-
|
7
|
-
<table>
|
8
|
-
<thead>
|
9
|
-
<tr>
|
10
|
-
<th>Title</th>
|
11
|
-
<th>Links</th>
|
12
|
-
</tr>
|
13
|
-
</thead>
|
14
|
-
<tr>
|
15
|
-
<td>Child01,02</td>
|
16
|
-
<td><a href="../child01.html">Child01</a></td>
|
17
|
-
<td><a href="../child02.html">Child02</a></td>
|
18
|
-
<td>../child02.html</td>
|
19
|
-
</tr>
|
20
|
-
|
21
|
-
<tr>
|
22
|
-
<td>Child01,02,03</td>
|
23
|
-
<td><a href="../child01.html">Child01</a></td>
|
24
|
-
<td><a href="../child02.html">Child02</a></td>
|
25
|
-
<td><a href="../child03.html">Child03</a></td>
|
26
|
-
</tr>
|
27
|
-
</table>
|
28
|
-
|
29
|
-
</body>
|
30
|
-
</html>
|