yasuri 2.0.11 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
4
- data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
3
+ metadata.gz: a4fed4a13bb125758515e3c0ced665b1ca3d20b6
4
+ data.tar.gz: e9dfb2ed6256a367db2e5b6a78d23fa097c422d7
5
5
  SHA512:
6
- metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
7
- data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
6
+ metadata.gz: 8b9d6345f3f49b1f7d9445ce18bca736b8cbeedc69979a45d541b59af4e09092d7c1d12886801a24296e9e3d73f39a7c2d53a7c2de12e1a0ff890623b47cfe84
7
+ data.tar.gz: 6d755f266062052dd5244599deefefea85f7570c827a898e48eee22c44510dde287b0554ed2cae85e3b94b44fe4eb6f74b512c44047e6cc1bb43fe27a93143b0
data/USAGE.ja.md CHANGED
@@ -431,3 +431,38 @@ node.inject(agent, page)
431
431
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
432
432
  ```
433
433
  この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
434
+
435
+ ##### `flatten`
436
+ 取得した各ページの結果を展開します.
437
+
438
+ ```ruby
439
+ agent = Mechanize.new
440
+ page = agent.get("http://yasuri.example.net/page01.html")
441
+
442
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" do
443
+ text_title '/html/head/title'
444
+ text_content '/html/body/p'
445
+ end
446
+ node.inject(agent, page)
447
+
448
+ #=> [ {"title" => "Page01",
449
+ "content" => "Pagination01"},
450
+ {"title" => "Page02",
451
+ "content" => "Pagination02"},
452
+ {"title" => "Page03",
453
+ "content" => "Pagination03"}]
454
+
455
+
456
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
457
+ text_title '/html/head/title'
458
+ text_content '/html/body/p'
459
+ end
460
+ node.inject(agent, page)
461
+
462
+ #=> [ "Page01",
463
+ "Pagination01",
464
+ "Page02",
465
+ "Pagination02",
466
+ "Page03",
467
+ "Pagination03"]
468
+ ```
data/USAGE.md CHANGED
@@ -429,3 +429,38 @@ node.inject(agent, page)
429
429
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
430
430
  ```
431
431
  Paginate Node opens at most the number of pages given by `limit`. In this situation, the pagination has 4 pages, but the result Array has only 2 texts because `limit:2` was given.
432
+
433
+ ##### `flatten`
434
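+ The `flatten` option expands the results of each page into a single flat array of values. The first example below shows the result without `flatten`, and the second shows the result with `flatten:true`.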
435
+
436
+ ```ruby
437
+ agent = Mechanize.new
438
+ page = agent.get("http://yasuri.example.net/page01.html")
439
+
440
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" do
441
+ text_title '/html/head/title'
442
+ text_content '/html/body/p'
443
+ end
444
+ node.inject(agent, page)
445
+
446
+ #=> [ {"title" => "Page01",
447
+ "content" => "Pagination01"},
448
+ {"title" => "Page02",
449
+ "content" => "Pagination02"},
450
+ {"title" => "Page03",
451
+ "content" => "Pagination03"}]
452
+
453
+
454
+ node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
455
+ text_title '/html/head/title'
456
+ text_content '/html/body/p'
457
+ end
458
+ node.inject(agent, page)
459
+
460
+ #=> [ "Page01",
461
+ "Pagination01",
462
+ "Page02",
463
+ "Pagination02",
464
+ "Page03",
465
+ "Pagination03"]
466
+ ```
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.11"
2
+ VERSION = "2.0.12"
3
3
  end
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
9
+ def inject(agent, page, opt = {}, element = page)
10
10
  retry_count = opt[:retry_count] || 5
11
11
 
12
- links = page.search(@xpath) || [] # links expected
12
+ links = element.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
@@ -11,7 +11,7 @@ module Yasuri
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {})
14
+ def inject(agent, page, opt = {}, element = page)
15
15
  fail "#{Kernel.__method__} is not implemented."
16
16
  end
17
17
  def opts
@@ -7,14 +7,17 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], limit: nil)
10
+ def initialize(xpath, name, children = [], limit: nil, flatten: false)
11
11
  super(xpath, name, children)
12
+ @flatten = flatten
12
13
  @limit = limit
13
14
  end
14
15
 
15
- def inject(agent, page, opt = {})
16
+ def inject(agent, page, opt = {}, element = page)
16
17
  retry_count = opt[:retry_count] || 5
17
18
 
19
+ raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
+
18
21
  child_results = []
19
22
  limit = @limit.nil? ? Float::MAX : @limit
20
23
  while page
@@ -32,10 +35,14 @@ module Yasuri
32
35
  break if (limit -= 1) <= 0
33
36
  end
34
37
 
38
+ if @flatten == true
39
+ return child_results.map{|h| h.values}.flatten
40
+ end
41
+
35
42
  child_results
36
43
  end
37
44
  def opts
38
- {limit:@limit}
45
+ {limit:@limit, flatten:@flatten}
39
46
  end
40
47
  end
41
48
  end
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class StructNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
10
- sub_tags = page.search(@xpath)
9
+ def inject(agent, page, opt = {}, element = page)
10
+ sub_tags = element.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
13
  child_name = Yasuri.NodeName(child_node.name, opt)
14
- [child_name, child_node.inject(agent, sub_tag, opt)]
14
+ [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
@@ -18,8 +18,8 @@ module Yasuri
18
18
 
19
19
  end
20
20
 
21
- def inject(agent, page, opt = {})
22
- node = page.search(@xpath)
21
+ def inject(agent, page, opt = {}, element = page)
22
+ node = element.search(@xpath)
23
23
  text = node.text.to_s
24
24
 
25
25
  if @truncate
@@ -0,0 +1,30 @@
1
+ <html>
2
+ <head>
3
+ <title>StructualLinksTest</title>
4
+ </head>
5
+ <body>
6
+
7
+ <table>
8
+ <thead>
9
+ <tr>
10
+ <th>Title</th>
11
+ <th>Links</th>
12
+ </tr>
13
+ </thead>
14
+ <tr>
15
+ <td>Child01,02</td>
16
+ <td><a href="../child01.html">Child01</a></td>
17
+ <td><a href="../child02.html">Child02</a></td>
18
+ <td>../child02.html</td>
19
+ </tr>
20
+
21
+ <tr>
22
+ <td>Child01,02,03</td>
23
+ <td><a href="../child01.html">Child01</a></td>
24
+ <td><a href="../child02.html">Child02</a></td>
25
+ <td><a href="../child03.html">Child03</a></td>
26
+ </tr>
27
+ </table>
28
+
29
+ </body>
30
+ </html>
@@ -30,6 +30,49 @@ describe 'Yasuri' do
30
30
  expect(actual).to match expected
31
31
  end
32
32
 
33
+ it "scrape each paginated pages with flatten" do
34
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
+ Yasuri::TextNode.new('/html/body/p', "content"),
36
+ Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
+ Yasuri::TextNode.new('./a', "text"),
38
+ ]),
39
+ ], flatten: true)
40
+ actual = root_node.inject(@agent, @page)
41
+ expected = [
42
+ "PaginationTest01",
43
+ {"text"=>""},
44
+ {"text"=>""},
45
+ {"text" => "2"},
46
+ {"text" => "3"},
47
+ {"text" => "4"},
48
+ {"text"=>"NextPage »"},
49
+ "PaginationTest02",
50
+ {"text"=>"« PreviousPage"},
51
+ {"text" => "1"},
52
+ {"text"=>""},
53
+ {"text" => "3"},
54
+ {"text" => "4"},
55
+ {"text"=>"NextPage »"},
56
+ "PaginationTest03",
57
+ {"text"=>"« PreviousPage"},
58
+ {"text" => "1"},
59
+ {"text" => "2"},
60
+ {"text"=>""},
61
+ {"text" => "4"},
62
+ {"text"=>"NextPage »"},
63
+ "PaginationTest04",
64
+ {"text"=>"« PreviousPage"},
65
+ {"text" => "1"},
66
+ {"text" => "2"},
67
+ {"text" => "3"},
68
+ {"text"=>""},
69
+ {"text"=>""},
70
+ ]
71
+
72
+ expect(actual).to match expected
73
+ end
74
+
75
+
33
76
  it "scrape each paginated pages limited" do
34
77
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
78
  Yasuri::TextNode.new('/html/body/p', "content"),
data/spec/yasuri_spec.rb CHANGED
@@ -126,7 +126,7 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
127
  ])
128
128
  ])
129
- page = @agent.get(@uri + "/structual_text.html")
129
+ page = @agent.get(@uri + "/struct/structual_text.html")
130
130
  compare_generated_vs_original(generated, original, page)
131
131
  end
132
132
  end
@@ -193,6 +193,7 @@ describe 'Yasuri' do
193
193
  "name" : "root",
194
194
  "path" : "/html/body/nav/span/a[@class='next']",
195
195
  "limit" : 10,
196
+ "flatten" : false,
196
197
  "children" : [ { "node" : "text",
197
198
  "name" : "content",
198
199
  "path" : "/html/body/p"
@@ -12,7 +12,7 @@ describe 'Yasuri' do
12
12
  describe '::StructNode' do
13
13
  before do
14
14
  @agent = Mechanize.new
15
- @page = @agent.get(uri + "/structual_text.html")
15
+ @page = @agent.get(uri + "/struct/structual_text.html")
16
16
 
17
17
  @table_1996 = [
18
18
  { "title" => "The Perfect Insider",
@@ -132,4 +132,45 @@ describe 'Yasuri' do
132
132
  end
133
133
 
134
134
  end
135
+
136
+ describe '::StructNode::Links' do
137
+ before do
138
+ @agent = Mechanize.new
139
+ @page = @agent.get(uri + "/struct/structual_links.html")
140
+
141
+ @table = [
142
+ { "title" => "Child01,02",
143
+ "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
144
+
145
+ { "title" => "Child01,02,03",
146
+ "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
147
+ ]
148
+ end
149
+
150
+ it 'return child node in links inside struct' do
151
+ node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
+ Yasuri::TextNode.new('./td[1]', "title"),
153
+ Yasuri::LinksNode.new('./td/a', "child", [
154
+ Yasuri::TextNode.new('/html/body/p', "p"),
155
+ ])
156
+ ])
157
+ expected = @table
158
+ actual = node.inject(@agent, @page)
159
+ expect(actual).to match expected
160
+ end
161
+ end # descrive
162
+
163
+ describe '::StructNode::Pages' do
164
+ before do
165
+ @agent = Mechanize.new
166
+ @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
+ end
168
+
169
+ it 'not supported' do
170
+ node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
+ Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
+ ])
173
+ expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
+ end
175
+ end
135
176
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.11
4
+ version: 2.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-14 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -174,7 +174,8 @@ files:
174
174
  - spec/htdocs/pagination/page02.html
175
175
  - spec/htdocs/pagination/page03.html
176
176
  - spec/htdocs/pagination/page04.html
177
- - spec/htdocs/structual_text.html
177
+ - spec/htdocs/struct/structual_links.html
178
+ - spec/htdocs/struct/structual_text.html
178
179
  - spec/servers/httpserver.rb
179
180
  - spec/spec_helper.rb
180
181
  - spec/yasuri_links_node_spec.rb
@@ -204,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
205
  version: '0'
205
206
  requirements: []
206
207
  rubyforge_project:
207
- rubygems_version: 2.4.5
208
+ rubygems_version: 2.5.2
208
209
  signing_key:
209
210
  specification_version: 4
210
211
  summary: Yasuri is easy scraping library.
@@ -220,7 +221,8 @@ test_files:
220
221
  - spec/htdocs/pagination/page02.html
221
222
  - spec/htdocs/pagination/page03.html
222
223
  - spec/htdocs/pagination/page04.html
223
- - spec/htdocs/structual_text.html
224
+ - spec/htdocs/struct/structual_links.html
225
+ - spec/htdocs/struct/structual_text.html
224
226
  - spec/servers/httpserver.rb
225
227
  - spec/spec_helper.rb
226
228
  - spec/yasuri_links_node_spec.rb