yasuri 1.9.12 → 2.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 87406480e622911dca3649ba2a8e5b134ccffb36
4
- data.tar.gz: 2fde6a481bed02e569a5c08f6df96c09925abee0
3
+ metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
4
+ data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
5
5
  SHA512:
6
- metadata.gz: 7c274f2316495aea66d737f053119dd71c154f7411b9cd54b102c71ee2e7ac36602dd0e44d78d0d06895ee0e27f934a9ac0e2f45d7e52ce4629f60f9fd905cf3
7
- data.tar.gz: a5696ca1fac061c542c7f0586bfecdf623962443e98207f897ff76ff0204cc78e53505b070f18467d2e72c0ddb527383224fb75ff2d05b5ff6e5a5149caaa20a
6
+ metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
7
+ data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
data/USAGE.ja.md CHANGED
@@ -431,38 +431,3 @@ node.inject(agent, page)
431
431
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
432
432
  ```
433
433
  この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
434
-
435
- ##### `flatten`
436
- 取得した各ページの結果を展開します.
437
-
438
- ```ruby
439
- agent = Mechanize.new
440
- page = agent.get("http://yasuri.example.net/page01.html")
441
-
442
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
443
- text_title '/html/head/title'
444
- text_content '/html/body/p'
445
- end
446
- node.inject(agent, page)
447
-
448
- #=> [ {"title" => "Page01",
449
- "content" => "Patination01"},
450
- {"title" => "Page01",
451
- "content" => "Patination02"},
452
- {"title" => "Page01",
453
- "content" => "Patination03"}]
454
-
455
-
456
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
457
- text_title '/html/head/title'
458
- text_content '/html/body/p'
459
- end
460
- node.inject(agent, page)
461
-
462
- #=> [ "Page01",
463
- "Patination01",
464
- "Page02",
465
- "Patination02",
466
- "Page03",
467
- "Patination03"]
468
- ```
data/USAGE.md CHANGED
@@ -429,38 +429,3 @@ node.inject(agent, page)
429
429
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
430
430
  ```
431
431
  Paginate Node open upto 2 given by `limit`. In this situation, pagination has 4 pages, but result Array has 2 texts because given `limit:2`.
432
-
433
- ##### `flatten`
434
- `flatten` option expands each page results.
435
-
436
- ```ruby
437
- agent = Mechanize.new
438
- page = agent.get("http://yasuri.example.net/page01.html")
439
-
440
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
441
- text_title '/html/head/title'
442
- text_content '/html/body/p'
443
- end
444
- node.inject(agent, page)
445
-
446
- #=> [ {"title" => "Page01",
447
- "content" => "Patination01"},
448
- {"title" => "Page01",
449
- "content" => "Patination02"},
450
- {"title" => "Page01",
451
- "content" => "Patination03"}]
452
-
453
-
454
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
455
- text_title '/html/head/title'
456
- text_content '/html/body/p'
457
- end
458
- node.inject(agent, page)
459
-
460
- #=> [ "Page01",
461
- "Patination01",
462
- "Page02",
463
- "Patination02",
464
- "Page03",
465
- "Patination03"]
466
- ```
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "1.9.12"
2
+ VERSION = "2.0.11"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -37,7 +37,7 @@ module Yasuri
37
37
  }
38
38
  Node2Text = Text2Node.invert
39
39
 
40
- ReservedKeys = [:node, :name, :path, :children]
40
+ ReservedKeys = %i|node name path children|
41
41
  def self.hash2node(node_h)
42
42
  node, name, path, children = ReservedKeys.map do |key|
43
43
  node_h[key]
@@ -78,8 +78,7 @@ module Yasuri
78
78
  json
79
79
  end
80
80
 
81
- def self.NodeName(name, hash = {})
82
- symbolize_names = hash[:symbolize_names] || false
81
+ def self.NodeName(name, symbolize_names:false)
83
82
  symbolize_names ? name.to_sym : name
84
83
  end
85
84
 
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {}, element = page)
9
+ def inject(agent, page, opt = {})
10
10
  retry_count = opt[:retry_count] || 5
11
11
 
12
- links = element.search(@xpath) || [] # links expected
12
+ links = page.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
@@ -7,11 +7,11 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt = {})
10
+ def initialize(xpath, name, children = [], opt: {})
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {}, element = page)
14
+ def inject(agent, page, opt = {})
15
15
  fail "#{Kernel.__method__} is not implemented."
16
16
  end
17
17
  def opts
@@ -7,17 +7,14 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], limit: nil)
11
11
  super(xpath, name, children)
12
- @limit = hash[:limit]
13
- @flatten = hash[:flatten] || false
12
+ @limit = limit
14
13
  end
15
14
 
16
- def inject(agent, page, opt = {}, element = page)
15
+ def inject(agent, page, opt = {})
17
16
  retry_count = opt[:retry_count] || 5
18
17
 
19
- raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
-
21
18
  child_results = []
22
19
  limit = @limit.nil? ? Float::MAX : @limit
23
20
  while page
@@ -35,14 +32,10 @@ module Yasuri
35
32
  break if (limit -= 1) <= 0
36
33
  end
37
34
 
38
- if @flatten == true
39
- return child_results.map{|h| h.values}.flatten
40
- end
41
-
42
35
  child_results
43
36
  end
44
37
  def opts
45
- {limit:@limit, flatten:@flatten}
38
+ {limit:@limit}
46
39
  end
47
40
  end
48
41
  end
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class StructNode
8
8
  include Node
9
- def inject(agent, page, opt = {}, element = page)
10
- sub_tags = element.search(@xpath)
9
+ def inject(agent, page, opt = {})
10
+ sub_tags = page.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
13
  child_name = Yasuri.NodeName(child_node.name, opt)
14
- [child_name, child_node.inject(agent, page, opt, sub_tag)]
14
+ [child_name, child_node.inject(agent, sub_tag, opt)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
@@ -7,12 +7,9 @@ module Yasuri
7
7
  class TextNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], truncate: nil, proc:nil)
11
11
  super(xpath, name, children)
12
12
 
13
- truncate = hash[:truncate]
14
- proc = hash[:proc]
15
-
16
13
  truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
17
14
  @truncate = truncate
18
15
  @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
@@ -21,8 +18,8 @@ module Yasuri
21
18
 
22
19
  end
23
20
 
24
- def inject(agent, page, opt = {}, element = page)
25
- node = element.search(@xpath)
21
+ def inject(agent, page, opt = {})
22
+ node = page.search(@xpath)
26
23
  text = node.text.to_s
27
24
 
28
25
  if @truncate
@@ -30,49 +30,6 @@ describe 'Yasuri' do
30
30
  expect(actual).to match expected
31
31
  end
32
32
 
33
- it "scrape each paginated pages with flatten" do
34
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
- Yasuri::TextNode.new('/html/body/p', "content"),
36
- Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
- Yasuri::TextNode.new('./a', "text"),
38
- ]),
39
- ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
41
- expected = [
42
- "PaginationTest01",
43
- {"text"=>""},
44
- {"text"=>""},
45
- {"text" => "2"},
46
- {"text" => "3"},
47
- {"text" => "4"},
48
- {"text"=>"NextPage »"},
49
- "PaginationTest02",
50
- {"text"=>"« PreviousPage"},
51
- {"text" => "1"},
52
- {"text"=>""},
53
- {"text" => "3"},
54
- {"text" => "4"},
55
- {"text"=>"NextPage »"},
56
- "PaginationTest03",
57
- {"text"=>"« PreviousPage"},
58
- {"text" => "1"},
59
- {"text" => "2"},
60
- {"text"=>""},
61
- {"text" => "4"},
62
- {"text"=>"NextPage »"},
63
- "PaginationTest04",
64
- {"text"=>"« PreviousPage"},
65
- {"text" => "1"},
66
- {"text" => "2"},
67
- {"text" => "3"},
68
- {"text"=>""},
69
- {"text"=>""},
70
- ]
71
-
72
- expect(actual).to match expected
73
- end
74
-
75
-
76
33
  it "scrape each paginated pages limited" do
77
34
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
35
  Yasuri::TextNode.new('/html/body/p', "content"),
data/spec/yasuri_spec.rb CHANGED
@@ -39,7 +39,7 @@ describe 'Yasuri' do
39
39
  "truncate" : "^[^,]+"
40
40
  }|
41
41
  generated = Yasuri.json2tree(src)
42
- original = Yasuri::TextNode.new('/html/body/p[1]', "content", {}, truncate:/^[^,]+/)
42
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
43
43
  compare_generated_vs_original(generated, original, @index_page)
44
44
  end
45
45
 
@@ -126,7 +126,7 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
127
  ])
128
128
  ])
129
- page = @agent.get(@uri + "/struct/structual_text.html")
129
+ page = @agent.get(@uri + "/structual_text.html")
130
130
  compare_generated_vs_original(generated, original, page)
131
131
  end
132
132
  end
@@ -153,7 +153,7 @@ describe 'Yasuri' do
153
153
  end
154
154
 
155
155
  it "return text node with truncate_regexp" do
156
- node = Yasuri::TextNode.new("/html/head/title", "title", {}, truncate:/^[^,]+/)
156
+ node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
157
157
  json = Yasuri.tree2json(node)
158
158
  expected_str = %q| { "node": "text",
159
159
  "name": "title",
@@ -193,7 +193,6 @@ describe 'Yasuri' do
193
193
  "name" : "root",
194
194
  "path" : "/html/body/nav/span/a[@class='next']",
195
195
  "limit" : 10,
196
- "flatten" : false,
197
196
  "children" : [ { "node" : "text",
198
197
  "name" : "content",
199
198
  "path" : "/html/body/p"
@@ -12,7 +12,7 @@ describe 'Yasuri' do
12
12
  describe '::StructNode' do
13
13
  before do
14
14
  @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
15
+ @page = @agent.get(uri + "/structual_text.html")
16
16
 
17
17
  @table_1996 = [
18
18
  { "title" => "The Perfect Insider",
@@ -126,51 +126,10 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[1]', "title"),
127
127
  Yasuri::TextNode.new('./td[2]', "pub_date"),
128
128
  ])
129
- expected = @table_1996.map{|h| Hash[h.map{|k,v| [k.to_sym, v] }] }
129
+ expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
130
  actual = node.inject(@agent, @page, symbolize_names:true)
131
131
  expect(actual).to match expected
132
132
  end
133
133
 
134
134
  end
135
-
136
- describe '::StructNode::Links' do
137
- before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
141
- @table = [
142
- { "title" => "Child01,02",
143
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
144
-
145
- { "title" => "Child01,02,03",
146
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
147
- ]
148
- end
149
-
150
- it 'return child node in links inside struct' do
151
- node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
- Yasuri::TextNode.new('./td[1]', "title"),
153
- Yasuri::LinksNode.new('./td/a', "child", [
154
- Yasuri::TextNode.new('/html/body/p', "p"),
155
- ])
156
- ])
157
- expected = @table
158
- actual = node.inject(@agent, @page)
159
- expect(actual).to match expected
160
- end
161
- end # descrive
162
-
163
- describe '::StructNode::Pages' do
164
- before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
- end
168
-
169
- it 'not supported' do
170
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
- Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
- ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
- end
175
- end
176
135
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.12
4
+ version: 2.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-15 00:00:00.000000000 Z
11
+ date: 2016-11-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -174,8 +174,7 @@ files:
174
174
  - spec/htdocs/pagination/page02.html
175
175
  - spec/htdocs/pagination/page03.html
176
176
  - spec/htdocs/pagination/page04.html
177
- - spec/htdocs/struct/structual_links.html
178
- - spec/htdocs/struct/structual_text.html
177
+ - spec/htdocs/structual_text.html
179
178
  - spec/servers/httpserver.rb
180
179
  - spec/spec_helper.rb
181
180
  - spec/yasuri_links_node_spec.rb
@@ -205,7 +204,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
205
204
  version: '0'
206
205
  requirements: []
207
206
  rubyforge_project:
208
- rubygems_version: 2.5.2
207
+ rubygems_version: 2.4.5
209
208
  signing_key:
210
209
  specification_version: 4
211
210
  summary: Yasuri is easy scraping library.
@@ -221,8 +220,7 @@ test_files:
221
220
  - spec/htdocs/pagination/page02.html
222
221
  - spec/htdocs/pagination/page03.html
223
222
  - spec/htdocs/pagination/page04.html
224
- - spec/htdocs/struct/structual_links.html
225
- - spec/htdocs/struct/structual_text.html
223
+ - spec/htdocs/structual_text.html
226
224
  - spec/servers/httpserver.rb
227
225
  - spec/spec_helper.rb
228
226
  - spec/yasuri_links_node_spec.rb
@@ -1,30 +0,0 @@
1
- <html>
2
- <head>
3
- <title>StructualLinksTest</title>
4
- </head>
5
- <body>
6
-
7
- <table>
8
- <thead>
9
- <tr>
10
- <th>Title</th>
11
- <th>Links</th>
12
- </tr>
13
- </thead>
14
- <tr>
15
- <td>Child01,02</td>
16
- <td><a href="../child01.html">Child01</a></td>
17
- <td><a href="../child02.html">Child02</a></td>
18
- <td>../child02.html</td>
19
- </tr>
20
-
21
- <tr>
22
- <td>Child01,02,03</td>
23
- <td><a href="../child01.html">Child01</a></td>
24
- <td><a href="../child02.html">Child02</a></td>
25
- <td><a href="../child03.html">Child03</a></td>
26
- </tr>
27
- </table>
28
-
29
- </body>
30
- </html>