yasuri 2.0.12 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,8 +11,7 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::StructNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
14
+ @uri = uri + "/struct/structual_text.html"
16
15
 
17
16
  @table_1996 = [
18
17
  { "title" => "The Perfect Insider",
@@ -53,7 +52,7 @@ describe 'Yasuri' do
53
52
  Yasuri::TextNode.new('./td[2]', "pub_date"),
54
53
  ])
55
54
  expected = @table_1996
56
- actual = node.inject(@agent, @page)
55
+ actual = node.scrape(@uri)
57
56
  expect(actual).to match expected
58
57
  end
59
58
 
@@ -63,7 +62,7 @@ describe 'Yasuri' do
63
62
  Yasuri::TextNode.new('./td[2]', "pub_date"),
64
63
  ])
65
64
  expected = @table_1996.first
66
- actual = node.inject(@agent, @page)
65
+ actual = node.scrape(@uri)
67
66
  expect(actual).to match expected
68
67
  end
69
68
 
@@ -72,7 +71,7 @@ describe 'Yasuri' do
72
71
  node = Yasuri::StructNode.new(no_match_xpath, "table", [
73
72
  Yasuri::TextNode.new('./td[1]', "title")
74
73
  ])
75
- actual = node.inject(@agent, @page)
74
+ actual = node.scrape(@uri)
76
75
  expect(actual).to be_empty
77
76
  end
78
77
 
@@ -81,7 +80,7 @@ describe 'Yasuri' do
81
80
  node = Yasuri::StructNode.new(invalid_xpath, "table", [
82
81
  Yasuri::TextNode.new('./td[1]', "title")
83
82
  ])
84
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
83
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
85
84
  end
86
85
 
87
86
  it 'fail with invalid xpath in children' do
@@ -90,7 +89,7 @@ describe 'Yasuri' do
90
89
  Yasuri::TextNode.new(invalid_xpath, "title"),
91
90
  Yasuri::TextNode.new('./td[2]', "pub_date"),
92
91
  ])
93
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
92
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
94
93
  end
95
94
 
96
95
  it 'scrape all tables' do
@@ -101,7 +100,7 @@ describe 'Yasuri' do
101
100
  ])
102
101
  ])
103
102
  expected = @all_tables
104
- actual = node.inject(@agent, @page)
103
+ actual = node.scrape(@uri)
105
104
  expect(actual).to match expected
106
105
  end
107
106
 
@@ -118,7 +117,7 @@ describe 'Yasuri' do
118
117
  Yasuri::TextNode.new('./td[2]', "pub_date"),
119
118
  ])
120
119
  ])
121
- compare_generated_vs_original(generated, original, @page)
120
+ compare_generated_vs_original(generated, original, @uri)
122
121
  end
123
122
 
124
123
  it 'return child node as symbol' do
@@ -127,7 +126,7 @@ describe 'Yasuri' do
127
126
  Yasuri::TextNode.new('./td[2]', "pub_date"),
128
127
  ])
129
128
  expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
- actual = node.inject(@agent, @page, symbolize_names:true)
129
+ actual = node.scrape(@uri, symbolize_names:true)
131
130
  expect(actual).to match expected
132
131
  end
133
132
 
@@ -135,9 +134,7 @@ describe 'Yasuri' do
135
134
 
136
135
  describe '::StructNode::Links' do
137
136
  before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
137
+ @uri = uri + "/struct/structual_links.html"
141
138
  @table = [
142
139
  { "title" => "Child01,02",
143
140
  "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
@@ -155,22 +152,21 @@ describe 'Yasuri' do
155
152
  ])
156
153
  ])
157
154
  expected = @table
158
- actual = node.inject(@agent, @page)
155
+ actual = node.scrape(@uri)
159
156
  expect(actual).to match expected
160
157
  end
161
158
  end # descrive
162
159
 
163
160
  describe '::StructNode::Pages' do
164
161
  before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
162
+ @uri = uri + "/struct/structual_text.html"
167
163
  end
168
164
 
169
165
  it 'not supported' do
170
166
  node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
167
  Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
168
  ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
169
+ expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
170
  end
175
171
  end
176
172
  end
@@ -10,69 +10,68 @@ describe 'Yasuri' do
10
10
  include_context 'httpserver'
11
11
 
12
12
  before do
13
- @agent = Mechanize.new
14
- @index_page = @agent.get(uri)
13
+ @uri = uri
15
14
  end
16
15
 
17
16
  describe '::TextNode' do
18
17
  before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
19
18
 
20
19
  it 'scrape text text <p>Hello,Yasuri</p>' do
21
- actual = @node.inject(@agent, @index_page)
20
+ actual = @node.scrape(@uri)
22
21
  expect(actual).to eq "Hello,Yasuri"
23
22
  end
24
23
 
25
24
  it 'return empty text if no match node' do
26
25
  no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
27
- actual = no_match_node.inject(@agent, @index_page)
26
+ actual = no_match_node.scrape(@uri)
28
27
  expect(actual).to be_empty
29
28
  end
30
29
 
31
30
  it 'fail with invalid xpath' do
32
31
  invalid_xpath = '/html/body/no_match_node['
33
32
  node = Yasuri::TextNode.new(invalid_xpath, "title")
34
- expect { node.inject(@agent, @index_page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
33
+ expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
35
34
  end
36
35
 
37
36
  it "can be defined by DSL, return single TextNode title" do
38
37
  generated = Yasuri.text_title '/html/body/p[1]'
39
38
  original = Yasuri::TextNode.new('/html/body/p[1]', "title")
40
- compare_generated_vs_original(generated, original, @index_page)
39
+ compare_generated_vs_original(generated, original, @uri)
41
40
  end
42
41
 
43
42
  it "can be truncated with regexp" do
44
43
  node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
45
- actual = node.inject(@agent, @index_page)
44
+ actual = node.scrape(@uri)
46
45
  expect(actual).to eq "Hello"
47
46
  end
48
47
 
49
48
  it "return first captured if matched given capture pattern" do
50
49
  node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
51
- actual = node.inject(@agent, @index_page)
50
+ actual = node.scrape(@uri)
52
51
  expect(actual).to eq "ello,Yasur"
53
52
  end
54
53
 
55
54
  it "can be truncated with regexp" do
56
55
  node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
57
- actual = node.inject(@agent, @index_page)
56
+ actual = node.scrape(@uri)
58
57
  expect(actual).to eq "Yasuri"
59
58
  end
60
59
 
61
60
  it "return empty string if truncated with no match to regexp" do
62
61
  node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
63
- actual = node.inject(@agent, @index_page)
62
+ actual = node.scrape(@uri)
64
63
  expect(actual).to be_empty
65
64
  end
66
65
 
67
66
  it "return symbol method applied string" do
68
67
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
69
- actual = node.inject(@agent, @index_page)
68
+ actual = node.scrape(@uri)
70
69
  expect(actual).to eq "HELLO,YASURI"
71
70
  end
72
71
 
73
72
  it "return apply multi arguments" do
74
73
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
75
- actual = node.inject(@agent, @index_page)
74
+ actual = node.scrape(@uri)
76
75
  expect(actual).to eq "ELLO,YASUR"
77
76
  end
78
77
  end
data/yasuri.gemspec CHANGED
@@ -14,12 +14,13 @@ Gem::Specification.new do |spec|
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files -z`.split("\x0")
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
18
+ spec.bindir = "exe"
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
20
  spec.require_paths = ["lib"]
20
21
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
22
- spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "bundler"
23
+ spec.add_development_dependency "rake"
23
24
  spec.add_development_dependency "rspec"
24
25
  spec.add_development_dependency "fuubar"
25
26
  spec.add_development_dependency "glint"
@@ -28,4 +29,5 @@ Gem::Specification.new do |spec|
28
29
  spec.add_development_dependency "codeclimate-test-reporter"
29
30
 
30
31
  spec.add_dependency "mechanize"
32
+ spec.add_dependency "thor"
31
33
  end
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.12
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
- autorequire:
9
- bindir: bin
8
+ autorequire:
9
+ bindir: exe
10
10
  cert_chain: []
11
- date: 2016-12-15 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -136,16 +136,33 @@ dependencies:
136
136
  - - ">="
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: thor
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
139
153
  description: Yasuri is an easy web-scraping library for supporting "Mechanize".
140
154
  email:
141
155
  - tac@tac42.net
142
- executables: []
156
+ executables:
157
+ - yasuri
143
158
  extensions: []
144
159
  extra_rdoc_files: []
145
160
  files:
146
161
  - ".coveralls.yml"
162
+ - ".github/workflows/ruby.yml"
147
163
  - ".gitignore"
148
164
  - ".rspec"
165
+ - ".ruby-version"
149
166
  - ".travis.yml"
150
167
  - Gemfile
151
168
  - LICENSE
@@ -153,16 +170,26 @@ files:
153
170
  - Rakefile
154
171
  - USAGE.ja.md
155
172
  - USAGE.md
156
- - app.rb
173
+ - examples/example.rb
174
+ - examples/github.yml
175
+ - examples/sample.json
176
+ - examples/sample.yml
177
+ - exe/yasuri
157
178
  - lib/yasuri.rb
158
179
  - lib/yasuri/version.rb
159
180
  - lib/yasuri/yasuri.rb
181
+ - lib/yasuri/yasuri_cli.rb
160
182
  - lib/yasuri/yasuri_links_node.rb
183
+ - lib/yasuri/yasuri_map_node.rb
161
184
  - lib/yasuri/yasuri_node.rb
162
185
  - lib/yasuri/yasuri_node_generator.rb
163
186
  - lib/yasuri/yasuri_paginate_node.rb
164
187
  - lib/yasuri/yasuri_struct_node.rb
165
188
  - lib/yasuri/yasuri_text_node.rb
189
+ - spec/cli_resources/tree.json
190
+ - spec/cli_resources/tree.yml
191
+ - spec/cli_resources/tree_wrong.json
192
+ - spec/cli_resources/tree_wrong.yml
166
193
  - spec/htdocs/child01.html
167
194
  - spec/htdocs/child01_sub.html
168
195
  - spec/htdocs/child02.html
@@ -178,7 +205,9 @@ files:
178
205
  - spec/htdocs/struct/structual_text.html
179
206
  - spec/servers/httpserver.rb
180
207
  - spec/spec_helper.rb
208
+ - spec/yasuri_cli_spec.rb
181
209
  - spec/yasuri_links_node_spec.rb
210
+ - spec/yasuri_map_spec.rb
182
211
  - spec/yasuri_node_spec.rb
183
212
  - spec/yasuri_paginate_node_spec.rb
184
213
  - spec/yasuri_spec.rb
@@ -189,7 +218,7 @@ homepage: https://github.com/tac0x2a/yasuri
189
218
  licenses:
190
219
  - MIT
191
220
  metadata: {}
192
- post_install_message:
221
+ post_install_message:
193
222
  rdoc_options: []
194
223
  require_paths:
195
224
  - lib
@@ -204,12 +233,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
233
  - !ruby/object:Gem::Version
205
234
  version: '0'
206
235
  requirements: []
207
- rubyforge_project:
208
- rubygems_version: 2.5.2
209
- signing_key:
236
+ rubygems_version: 3.2.3
237
+ signing_key:
210
238
  specification_version: 4
211
239
  summary: Yasuri is easy scraping library.
212
240
  test_files:
241
+ - spec/cli_resources/tree.json
242
+ - spec/cli_resources/tree.yml
243
+ - spec/cli_resources/tree_wrong.json
244
+ - spec/cli_resources/tree_wrong.yml
213
245
  - spec/htdocs/child01.html
214
246
  - spec/htdocs/child01_sub.html
215
247
  - spec/htdocs/child02.html
@@ -225,7 +257,9 @@ test_files:
225
257
  - spec/htdocs/struct/structual_text.html
226
258
  - spec/servers/httpserver.rb
227
259
  - spec/spec_helper.rb
260
+ - spec/yasuri_cli_spec.rb
228
261
  - spec/yasuri_links_node_spec.rb
262
+ - spec/yasuri_map_spec.rb
229
263
  - spec/yasuri_node_spec.rb
230
264
  - spec/yasuri_paginate_node_spec.rb
231
265
  - spec/yasuri_spec.rb
data/app.rb DELETED
@@ -1,52 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- # Author:: TAC (tac@tac42.net)
5
-
6
- require 'pp'
7
- require 'time'
8
- require 'mechanize'
9
-
10
- require_relative 'lib/yasuri/yasuri'
11
-
12
- agent = Mechanize.new
13
-
14
- uri = "http://www.asahi.com/"
15
-
16
- # Node tree constructing by DSL
17
- root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
18
- text_title '//*[@id="MainInner"]/div[1]/div/h1'
19
- text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
20
- end
21
-
22
- # Node tree constructing by JSON
23
- src = <<-EOJSON
24
- { "node" : "links",
25
- "name" : "root",
26
- "path" : "//*[@id='MainInner']/div[1]/ul/li/a",
27
- "children" : [
28
- { "node" : "text",
29
- "name" : "title",
30
- "path" : "//*[@id='MainInner']/div[1]/div/h1"
31
- },
32
- { "node" : "text",
33
- "name" : "article",
34
- "path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
35
- }
36
- ]
37
- }
38
- EOJSON
39
- root = Yasuri.json2tree(src)
40
-
41
- # Access to parsed resources
42
- page = agent.get(uri)
43
- contents = root.inject(agent, page)
44
-
45
- contents.each do |h|
46
- t = h['title']
47
- a = h['article']
48
-
49
- puts t
50
- puts a
51
- puts "=" * 100
52
- end