yasuri 1.9.12 → 2.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 87406480e622911dca3649ba2a8e5b134ccffb36
4
- data.tar.gz: 2fde6a481bed02e569a5c08f6df96c09925abee0
3
+ metadata.gz: d8d6bd8c37be444f0c5568bcf20604d7bca5c223
4
+ data.tar.gz: 8438eee300a7e4f73be7107cbd9417da18f5048d
5
5
  SHA512:
6
- metadata.gz: 7c274f2316495aea66d737f053119dd71c154f7411b9cd54b102c71ee2e7ac36602dd0e44d78d0d06895ee0e27f934a9ac0e2f45d7e52ce4629f60f9fd905cf3
7
- data.tar.gz: a5696ca1fac061c542c7f0586bfecdf623962443e98207f897ff76ff0204cc78e53505b070f18467d2e72c0ddb527383224fb75ff2d05b5ff6e5a5149caaa20a
6
+ metadata.gz: 107ddc8cd0310c646841e6fe6a2695313edb9692418a783e133a5d269d4a1ab39385975276ae167ac68863b9760794ebb2738832dccfc4f599686c5a9e50f244
7
+ data.tar.gz: b6d089de8cd866f137ca58dd779396cd4948e080d3225cc4384f8f9cdb54f5a778cd4be85b89628938ccacbf11dfefe74ea8bd248e835971470e7a64df597411
data/USAGE.ja.md CHANGED
@@ -431,38 +431,3 @@ node.inject(agent, page)
431
431
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
432
432
  ```
433
433
  この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
434
-
435
- ##### `flatten`
436
- 取得した各ページの結果を展開します.
437
-
438
- ```ruby
439
- agent = Mechanize.new
440
- page = agent.get("http://yasuri.example.net/page01.html")
441
-
442
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
443
- text_title '/html/head/title'
444
- text_content '/html/body/p'
445
- end
446
- node.inject(agent, page)
447
-
448
- #=> [ {"title" => "Page01",
449
- "content" => "Patination01"},
450
- {"title" => "Page01",
451
- "content" => "Patination02"},
452
- {"title" => "Page01",
453
- "content" => "Patination03"}]
454
-
455
-
456
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
457
- text_title '/html/head/title'
458
- text_content '/html/body/p'
459
- end
460
- node.inject(agent, page)
461
-
462
- #=> [ "Page01",
463
- "Patination01",
464
- "Page02",
465
- "Patination02",
466
- "Page03",
467
- "Patination03"]
468
- ```
data/USAGE.md CHANGED
@@ -429,38 +429,3 @@ node.inject(agent, page)
429
429
  #=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
430
430
  ```
431
431
  Paginate Node open upto 2 given by `limit`. In this situation, pagination has 4 pages, but result Array has 2 texts because given `limit:2`.
432
-
433
- ##### `flatten`
434
- `flatten` option expands each page results.
435
-
436
- ```ruby
437
- agent = Mechanize.new
438
- page = agent.get("http://yasuri.example.net/page01.html")
439
-
440
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
441
- text_title '/html/head/title'
442
- text_content '/html/body/p'
443
- end
444
- node.inject(agent, page)
445
-
446
- #=> [ {"title" => "Page01",
447
- "content" => "Patination01"},
448
- {"title" => "Page01",
449
- "content" => "Patination02"},
450
- {"title" => "Page01",
451
- "content" => "Patination03"}]
452
-
453
-
454
- node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
455
- text_title '/html/head/title'
456
- text_content '/html/body/p'
457
- end
458
- node.inject(agent, page)
459
-
460
- #=> [ "Page01",
461
- "Patination01",
462
- "Page02",
463
- "Patination02",
464
- "Page03",
465
- "Patination03"]
466
- ```
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "1.9.12"
2
+ VERSION = "2.0.11"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -37,7 +37,7 @@ module Yasuri
37
37
  }
38
38
  Node2Text = Text2Node.invert
39
39
 
40
- ReservedKeys = [:node, :name, :path, :children]
40
+ ReservedKeys = %i|node name path children|
41
41
  def self.hash2node(node_h)
42
42
  node, name, path, children = ReservedKeys.map do |key|
43
43
  node_h[key]
@@ -78,8 +78,7 @@ module Yasuri
78
78
  json
79
79
  end
80
80
 
81
- def self.NodeName(name, hash = {})
82
- symbolize_names = hash[:symbolize_names] || false
81
+ def self.NodeName(name, symbolize_names:false)
83
82
  symbolize_names ? name.to_sym : name
84
83
  end
85
84
 
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {}, element = page)
9
+ def inject(agent, page, opt = {})
10
10
  retry_count = opt[:retry_count] || 5
11
11
 
12
- links = element.search(@xpath) || [] # links expected
12
+ links = page.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
@@ -7,11 +7,11 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt = {})
10
+ def initialize(xpath, name, children = [], opt: {})
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {}, element = page)
14
+ def inject(agent, page, opt = {})
15
15
  fail "#{Kernel.__method__} is not implemented."
16
16
  end
17
17
  def opts
@@ -7,17 +7,14 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], limit: nil)
11
11
  super(xpath, name, children)
12
- @limit = hash[:limit]
13
- @flatten = hash[:flatten] || false
12
+ @limit = limit
14
13
  end
15
14
 
16
- def inject(agent, page, opt = {}, element = page)
15
+ def inject(agent, page, opt = {})
17
16
  retry_count = opt[:retry_count] || 5
18
17
 
19
- raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
-
21
18
  child_results = []
22
19
  limit = @limit.nil? ? Float::MAX : @limit
23
20
  while page
@@ -35,14 +32,10 @@ module Yasuri
35
32
  break if (limit -= 1) <= 0
36
33
  end
37
34
 
38
- if @flatten == true
39
- return child_results.map{|h| h.values}.flatten
40
- end
41
-
42
35
  child_results
43
36
  end
44
37
  def opts
45
- {limit:@limit, flatten:@flatten}
38
+ {limit:@limit}
46
39
  end
47
40
  end
48
41
  end
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class StructNode
8
8
  include Node
9
- def inject(agent, page, opt = {}, element = page)
10
- sub_tags = element.search(@xpath)
9
+ def inject(agent, page, opt = {})
10
+ sub_tags = page.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
13
  child_name = Yasuri.NodeName(child_node.name, opt)
14
- [child_name, child_node.inject(agent, page, opt, sub_tag)]
14
+ [child_name, child_node.inject(agent, sub_tag, opt)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
@@ -7,12 +7,9 @@ module Yasuri
7
7
  class TextNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], truncate: nil, proc:nil)
11
11
  super(xpath, name, children)
12
12
 
13
- truncate = hash[:truncate]
14
- proc = hash[:proc]
15
-
16
13
  truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
17
14
  @truncate = truncate
18
15
  @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
@@ -21,8 +18,8 @@ module Yasuri
21
18
 
22
19
  end
23
20
 
24
- def inject(agent, page, opt = {}, element = page)
25
- node = element.search(@xpath)
21
+ def inject(agent, page, opt = {})
22
+ node = page.search(@xpath)
26
23
  text = node.text.to_s
27
24
 
28
25
  if @truncate
@@ -30,49 +30,6 @@ describe 'Yasuri' do
30
30
  expect(actual).to match expected
31
31
  end
32
32
 
33
- it "scrape each paginated pages with flatten" do
34
- root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
35
- Yasuri::TextNode.new('/html/body/p', "content"),
36
- Yasuri::StructNode.new('/html/body/nav/span', "span", [
37
- Yasuri::TextNode.new('./a', "text"),
38
- ]),
39
- ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
41
- expected = [
42
- "PaginationTest01",
43
- {"text"=>""},
44
- {"text"=>""},
45
- {"text" => "2"},
46
- {"text" => "3"},
47
- {"text" => "4"},
48
- {"text"=>"NextPage »"},
49
- "PaginationTest02",
50
- {"text"=>"« PreviousPage"},
51
- {"text" => "1"},
52
- {"text"=>""},
53
- {"text" => "3"},
54
- {"text" => "4"},
55
- {"text"=>"NextPage »"},
56
- "PaginationTest03",
57
- {"text"=>"« PreviousPage"},
58
- {"text" => "1"},
59
- {"text" => "2"},
60
- {"text"=>""},
61
- {"text" => "4"},
62
- {"text"=>"NextPage »"},
63
- "PaginationTest04",
64
- {"text"=>"« PreviousPage"},
65
- {"text" => "1"},
66
- {"text" => "2"},
67
- {"text" => "3"},
68
- {"text"=>""},
69
- {"text"=>""},
70
- ]
71
-
72
- expect(actual).to match expected
73
- end
74
-
75
-
76
33
  it "scrape each paginated pages limited" do
77
34
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
35
  Yasuri::TextNode.new('/html/body/p', "content"),
data/spec/yasuri_spec.rb CHANGED
@@ -39,7 +39,7 @@ describe 'Yasuri' do
39
39
  "truncate" : "^[^,]+"
40
40
  }|
41
41
  generated = Yasuri.json2tree(src)
42
- original = Yasuri::TextNode.new('/html/body/p[1]', "content", {}, truncate:/^[^,]+/)
42
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
43
43
  compare_generated_vs_original(generated, original, @index_page)
44
44
  end
45
45
 
@@ -126,7 +126,7 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
127
127
  ])
128
128
  ])
129
- page = @agent.get(@uri + "/struct/structual_text.html")
129
+ page = @agent.get(@uri + "/structual_text.html")
130
130
  compare_generated_vs_original(generated, original, page)
131
131
  end
132
132
  end
@@ -153,7 +153,7 @@ describe 'Yasuri' do
153
153
  end
154
154
 
155
155
  it "return text node with truncate_regexp" do
156
- node = Yasuri::TextNode.new("/html/head/title", "title", {}, truncate:/^[^,]+/)
156
+ node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
157
157
  json = Yasuri.tree2json(node)
158
158
  expected_str = %q| { "node": "text",
159
159
  "name": "title",
@@ -193,7 +193,6 @@ describe 'Yasuri' do
193
193
  "name" : "root",
194
194
  "path" : "/html/body/nav/span/a[@class='next']",
195
195
  "limit" : 10,
196
- "flatten" : false,
197
196
  "children" : [ { "node" : "text",
198
197
  "name" : "content",
199
198
  "path" : "/html/body/p"
@@ -12,7 +12,7 @@ describe 'Yasuri' do
12
12
  describe '::StructNode' do
13
13
  before do
14
14
  @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
15
+ @page = @agent.get(uri + "/structual_text.html")
16
16
 
17
17
  @table_1996 = [
18
18
  { "title" => "The Perfect Insider",
@@ -126,51 +126,10 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[1]', "title"),
127
127
  Yasuri::TextNode.new('./td[2]', "pub_date"),
128
128
  ])
129
- expected = @table_1996.map{|h| Hash[h.map{|k,v| [k.to_sym, v] }] }
129
+ expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
130
  actual = node.inject(@agent, @page, symbolize_names:true)
131
131
  expect(actual).to match expected
132
132
  end
133
133
 
134
134
  end
135
-
136
- describe '::StructNode::Links' do
137
- before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
141
- @table = [
142
- { "title" => "Child01,02",
143
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
144
-
145
- { "title" => "Child01,02,03",
146
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
147
- ]
148
- end
149
-
150
- it 'return child node in links inside struct' do
151
- node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
- Yasuri::TextNode.new('./td[1]', "title"),
153
- Yasuri::LinksNode.new('./td/a', "child", [
154
- Yasuri::TextNode.new('/html/body/p', "p"),
155
- ])
156
- ])
157
- expected = @table
158
- actual = node.inject(@agent, @page)
159
- expect(actual).to match expected
160
- end
161
- end # descrive
162
-
163
- describe '::StructNode::Pages' do
164
- before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
- end
168
-
169
- it 'not supported' do
170
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
- Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
- ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
- end
175
- end
176
135
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.12
4
+ version: 2.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-15 00:00:00.000000000 Z
11
+ date: 2016-11-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -174,8 +174,7 @@ files:
174
174
  - spec/htdocs/pagination/page02.html
175
175
  - spec/htdocs/pagination/page03.html
176
176
  - spec/htdocs/pagination/page04.html
177
- - spec/htdocs/struct/structual_links.html
178
- - spec/htdocs/struct/structual_text.html
177
+ - spec/htdocs/structual_text.html
179
178
  - spec/servers/httpserver.rb
180
179
  - spec/spec_helper.rb
181
180
  - spec/yasuri_links_node_spec.rb
@@ -205,7 +204,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
205
204
  version: '0'
206
205
  requirements: []
207
206
  rubyforge_project:
208
- rubygems_version: 2.5.2
207
+ rubygems_version: 2.4.5
209
208
  signing_key:
210
209
  specification_version: 4
211
210
  summary: Yasuri is easy scraping library.
@@ -221,8 +220,7 @@ test_files:
221
220
  - spec/htdocs/pagination/page02.html
222
221
  - spec/htdocs/pagination/page03.html
223
222
  - spec/htdocs/pagination/page04.html
224
- - spec/htdocs/struct/structual_links.html
225
- - spec/htdocs/struct/structual_text.html
223
+ - spec/htdocs/structual_text.html
226
224
  - spec/servers/httpserver.rb
227
225
  - spec/spec_helper.rb
228
226
  - spec/yasuri_links_node_spec.rb
@@ -1,30 +0,0 @@
1
- <html>
2
- <head>
3
- <title>StructualLinksTest</title>
4
- </head>
5
- <body>
6
-
7
- <table>
8
- <thead>
9
- <tr>
10
- <th>Title</th>
11
- <th>Links</th>
12
- </tr>
13
- </thead>
14
- <tr>
15
- <td>Child01,02</td>
16
- <td><a href="../child01.html">Child01</a></td>
17
- <td><a href="../child02.html">Child02</a></td>
18
- <td>../child02.html</td>
19
- </tr>
20
-
21
- <tr>
22
- <td>Child01,02,03</td>
23
- <td><a href="../child01.html">Child01</a></td>
24
- <td><a href="../child02.html">Child02</a></td>
25
- <td><a href="../child03.html">Child03</a></td>
26
- </tr>
27
- </table>
28
-
29
- </body>
30
- </html>