yasuri 2.0.11 → 2.0.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
4
- data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
3
+ metadata.gz: a4fed4a13bb125758515e3c0ced665b1ca3d20b6
4
+ data.tar.gz: e9dfb2ed6256a367db2e5b6a78d23fa097c422d7
5
5
  SHA512:
6
- metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
7
- data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
6
+ metadata.gz: 8b9d6345f3f49b1f7d9445ce18bca736b8cbeedc69979a45d541b59af4e09092d7c1d12886801a24296e9e3d73f39a7c2d53a7c2de12e1a0ff890623b47cfe84
7
+ data.tar.gz: 6d755f266062052dd5244599deefefea85f7570c827a898e48eee22c44510dde287b0554ed2cae85e3b94b44fe4eb6f74b512c44047e6cc1bb43fe27a93143b0
data/USAGE.ja.md CHANGED
@@ -431,3 +431,38 @@ node.inject(agent, page)
431
431
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
432
432
  ```
433
433
  この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
434
+
435
+ ##### `flatten`
436
+ 取得した各ページの結果を展開します.
437
+
438
+ ```ruby
439
+ agent = Mechanize.new
440
+ page = agent.get("http://yasuri.example.net/page01.html")
441
+
442
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:false do
443
+ text_title '/html/head/title'
444
+ text_content '/html/body/p'
445
+ end
446
+ node.inject(agent, page)
447
+
448
+ #=> [ {"title" => "Page01",
449
+ "content" => "Pagination01"},
450
+ {"title" => "Page02",
451
+ "content" => "Pagination02"},
452
+ {"title" => "Page03",
453
+ "content" => "Pagination03"}]
454
+
455
+
456
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
457
+ text_title '/html/head/title'
458
+ text_content '/html/body/p'
459
+ end
460
+ node.inject(agent, page)
461
+
462
+ #=> [ "Page01",
463
+ "Pagination01",
464
+ "Page02",
465
+ "Pagination02",
466
+ "Page03",
467
+ "Pagination03"]
468
+ ```
data/USAGE.md CHANGED
@@ -429,3 +429,38 @@ node.inject(agent, page)
429
429
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
430
430
  ```
431
431
  Paginate Node opens at most 2 pages, as given by `limit`. In this situation the pagination has 4 pages, but the result Array contains only 2 entries because `limit:2` was given.
432
+
433
+ ##### `flatten`
434
+ The `flatten` option expands the results of each page into a single flat array of values.
435
+
436
+ ```ruby
437
+ agent = Mechanize.new
438
+ page = agent.get("http://yasuri.example.net/page01.html")
439
+
440
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:false do
441
+ text_title '/html/head/title'
442
+ text_content '/html/body/p'
443
+ end
444
+ node.inject(agent, page)
445
+
446
+ #=> [ {"title" => "Page01",
447
+ "content" => "Pagination01"},
448
+ {"title" => "Page02",
449
+ "content" => "Pagination02"},
450
+ {"title" => "Page03",
451
+ "content" => "Pagination03"}]
452
+
453
+
454
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
455
+ text_title '/html/head/title'
456
+ text_content '/html/body/p'
457
+ end
458
+ node.inject(agent, page)
459
+
460
+ #=> [ "Page01",
461
+ "Pagination01",
462
+ "Page02",
463
+ "Pagination02",
464
+ "Page03",
465
+ "Pagination03"]
466
+ ```
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.11"
2
+ VERSION = "2.0.12"
3
3
  end
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
9
+ def inject(agent, page, opt = {}, element = page)
10
10
  retry_count = opt[:retry_count] || 5
11
11
 
12
- links = page.search(@xpath) || [] # links expected
12
+ links = element.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
@@ -11,7 +11,7 @@ module Yasuri
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {})
14
+ def inject(agent, page, opt = {}, element = page)
15
15
  fail "#{Kernel.__method__} is not implemented."
16
16
  end
17
17
  def opts
@@ -7,14 +7,17 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], limit: nil)
10
+ def initialize(xpath, name, children = [], limit: nil, flatten: false)
11
11
  super(xpath, name, children)
12
+ @flatten = flatten
12
13
  @limit = limit
13
14
  end
14
15
 
15
- def inject(agent, page, opt = {})
16
+ def inject(agent, page, opt = {}, element = page)
16
17
  retry_count = opt[:retry_count] || 5
17
18
 
19
+ raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
+
18
21
  child_results = []
19
22
  limit = @limit.nil? ? Float::MAX : @limit
20
23
  while page
@@ -32,10 +35,14 @@ module Yasuri
32
35
  break if (limit -= 1) <= 0
33
36
  end
34
37
 
38
+ if @flatten == true
39
+ return child_results.map{|h| h.values}.flatten
40
+ end
41
+
35
42
  child_results
36
43
  end
37
44
  def opts
38
- {limit:@limit}
45
+ {limit:@limit, flatten:@flatten}
39
46
  end
40
47
  end
41
48
  end
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class StructNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
10
- sub_tags = page.search(@xpath)
9
+ def inject(agent, page, opt = {}, element = page)
10
+ sub_tags = element.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
13
  child_name = Yasuri.NodeName(child_node.name, opt)
14
- [child_name, child_node.inject(agent, sub_tag, opt)]
14
+ [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
@@ -18,8 +18,8 @@ module Yasuri
18
18
 
19
19
  end
20
20
 
21
- def inject(agent, page, opt = {})
22
- node = page.search(@xpath)
21
+ def inject(agent, page, opt = {}, element = page)
22
+ node = element.search(@xpath)
23
23
  text = node.text.to_s
24
24
 
25
25
  if @truncate
@@ -0,0 +1,30 @@
1
+ <html>
2
+ <head>
3
+ <title>StructualLinksTest</title>
4
+ </head>
5
+ <body>
6
+
7
+ <table>
8
+ <thead>
9
+ <tr>
10
+ <th>Title</th>
11
+ <th>Links</th>
12
+ </tr>
13
+ </thead>
14
+ <tr>
15
+ <td>Child01,02</td>
16
+ <td><a href="../child01.html">Child01</a></td>
17
+ <td><a href="../child02.html">Child02</a></td>
18
+ <td>../child02.html</td>
19
+ </tr>
20
+
21
+ <tr>
22
+ <td>Child01,02,03</td>
23
+ <td><a href="../child01.html">Child01</a></td>
24
+ <td><a href="../child02.html">Child02</a></td>
25
+ <td><a href="../child03.html">Child03</a></td>
26
+ </tr>
27
+ </table>
28
+
29
+ </body>
30
+ </html>
@@ -30,6 +30,49 @@ describe 'Yasuri' do
30
30
  expect(actual).to match expected
31
31
  end
32
32
 
33
+ it "scrape each paginated pages with flatten" do
34
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
+ Yasuri::TextNode.new('/html/body/p', "content"),
36
+ Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
+ Yasuri::TextNode.new('./a', "text"),
38
+ ]),
39
+ ], flatten: true)
40
+ actual = root_node.inject(@agent, @page)
41
+ expected = [
42
+ "PaginationTest01",
43
+ {"text"=>""},
44
+ {"text"=>""},
45
+ {"text" => "2"},
46
+ {"text" => "3"},
47
+ {"text" => "4"},
48
+ {"text"=>"NextPage »"},
49
+ "PaginationTest02",
50
+ {"text"=>"« PreviousPage"},
51
+ {"text" => "1"},
52
+ {"text"=>""},
53
+ {"text" => "3"},
54
+ {"text" => "4"},
55
+ {"text"=>"NextPage »"},
56
+ "PaginationTest03",
57
+ {"text"=>"« PreviousPage"},
58
+ {"text" => "1"},
59
+ {"text" => "2"},
60
+ {"text"=>""},
61
+ {"text" => "4"},
62
+ {"text"=>"NextPage »"},
63
+ "PaginationTest04",
64
+ {"text"=>"« PreviousPage"},
65
+ {"text" => "1"},
66
+ {"text" => "2"},
67
+ {"text" => "3"},
68
+ {"text"=>""},
69
+ {"text"=>""},
70
+ ]
71
+
72
+ expect(actual).to match expected
73
+ end
74
+
75
+
33
76
  it "scrape each paginated pages limited" do
34
77
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
78
  Yasuri::TextNode.new('/html/body/p', "content"),
data/spec/yasuri_spec.rb CHANGED
@@ -126,7 +126,7 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
127
  ])
128
128
  ])
129
- page = @agent.get(@uri + "/structual_text.html")
129
+ page = @agent.get(@uri + "/struct/structual_text.html")
130
130
  compare_generated_vs_original(generated, original, page)
131
131
  end
132
132
  end
@@ -193,6 +193,7 @@ describe 'Yasuri' do
193
193
  "name" : "root",
194
194
  "path" : "/html/body/nav/span/a[@class='next']",
195
195
  "limit" : 10,
196
+ "flatten" : false,
196
197
  "children" : [ { "node" : "text",
197
198
  "name" : "content",
198
199
  "path" : "/html/body/p"
@@ -12,7 +12,7 @@ describe 'Yasuri' do
12
12
  describe '::StructNode' do
13
13
  before do
14
14
  @agent = Mechanize.new
15
- @page = @agent.get(uri + "/structual_text.html")
15
+ @page = @agent.get(uri + "/struct/structual_text.html")
16
16
 
17
17
  @table_1996 = [
18
18
  { "title" => "The Perfect Insider",
@@ -132,4 +132,45 @@ describe 'Yasuri' do
132
132
  end
133
133
 
134
134
  end
135
+
136
+ describe '::StructNode::Links' do
137
+ before do
138
+ @agent = Mechanize.new
139
+ @page = @agent.get(uri + "/struct/structual_links.html")
140
+
141
+ @table = [
142
+ { "title" => "Child01,02",
143
+ "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
144
+
145
+ { "title" => "Child01,02,03",
146
+ "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
147
+ ]
148
+ end
149
+
150
+ it 'return child node in links inside struct' do
151
+ node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
+ Yasuri::TextNode.new('./td[1]', "title"),
153
+ Yasuri::LinksNode.new('./td/a', "child", [
154
+ Yasuri::TextNode.new('/html/body/p', "p"),
155
+ ])
156
+ ])
157
+ expected = @table
158
+ actual = node.inject(@agent, @page)
159
+ expect(actual).to match expected
160
+ end
161
+ end # describe
162
+
163
+ describe '::StructNode::Pages' do
164
+ before do
165
+ @agent = Mechanize.new
166
+ @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
+ end
168
+
169
+ it 'not supported' do
170
+ node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
+ Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
+ ])
173
+ expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
+ end
175
+ end
135
176
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.11
4
+ version: 2.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-14 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -174,7 +174,8 @@ files:
174
174
  - spec/htdocs/pagination/page02.html
175
175
  - spec/htdocs/pagination/page03.html
176
176
  - spec/htdocs/pagination/page04.html
177
- - spec/htdocs/structual_text.html
177
+ - spec/htdocs/struct/structual_links.html
178
+ - spec/htdocs/struct/structual_text.html
178
179
  - spec/servers/httpserver.rb
179
180
  - spec/spec_helper.rb
180
181
  - spec/yasuri_links_node_spec.rb
@@ -204,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
205
  version: '0'
205
206
  requirements: []
206
207
  rubyforge_project:
207
- rubygems_version: 2.4.5
208
+ rubygems_version: 2.5.2
208
209
  signing_key:
209
210
  specification_version: 4
210
211
  summary: Yasuri is easy scraping library.
@@ -220,7 +221,8 @@ test_files:
220
221
  - spec/htdocs/pagination/page02.html
221
222
  - spec/htdocs/pagination/page03.html
222
223
  - spec/htdocs/pagination/page04.html
223
- - spec/htdocs/structual_text.html
224
+ - spec/htdocs/struct/structual_links.html
225
+ - spec/htdocs/struct/structual_text.html
224
226
  - spec/servers/httpserver.rb
225
227
  - spec/spec_helper.rb
226
228
  - spec/yasuri_links_node_spec.rb