yasuri 2.0.13 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,107 +1,121 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- ##########
7
- # Struct #
8
- ##########
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::StructNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
16
-
17
- @table_1996 = [
18
- { "title" => "The Perfect Insider",
8
+ let(:uri_struct) { "#{uri}/struct/structual_text.html" }
9
+ let(:table1996) do
10
+ [
11
+ { "title" => "The Perfect Insider",
19
12
  "pub_date" => "1996/4/5" },
20
- { "title" => "Doctors in Isolated Room",
13
+ { "title" => "Doctors in Isolated Room",
21
14
  "pub_date" => "1996/7/5" },
22
- { "title" => "Mathematical Goodbye",
23
- "pub_date" => "1996/9/5" },
15
+ { "title" => "Mathematical Goodbye",
16
+ "pub_date" => "1996/9/5" }
24
17
  ]
25
- @table_1997 = [
26
- { "title" => "Jack the Poetical Private",
18
+ end
19
+ let(:table1997) do
20
+ [
21
+ { "title" => "Jack the Poetical Private",
27
22
  "pub_date" => "1997/1/5" },
28
- { "title" => "Who Inside",
23
+ { "title" => "Who Inside",
29
24
  "pub_date" => "1997/4/5" },
30
- { "title" => "Illusion Acts Like Magic",
31
- "pub_date" => "1997/10/5" },
25
+ { "title" => "Illusion Acts Like Magic",
26
+ "pub_date" => "1997/10/5" }
32
27
  ]
33
- @table_1998 = [
34
- { "title" => "Replaceable Summer",
28
+ end
29
+ let(:table1998) do
30
+ [
31
+ { "title" => "Replaceable Summer",
35
32
  "pub_date" => "1998/1/7" },
36
- { "title" => "Switch Back",
33
+ { "title" => "Switch Back",
37
34
  "pub_date" => "1998/4/5" },
38
- { "title" => "Numerical Models",
35
+ { "title" => "Numerical Models",
39
36
  "pub_date" => "1998/7/5" },
40
- { "title" => "The Perfect Outsider",
41
- "pub_date" => "1998/10/5" },
37
+ { "title" => "The Perfect Outsider",
38
+ "pub_date" => "1998/10/5" }
42
39
  ]
43
- @all_tables = [
44
- {"table" => @table_1996},
45
- {"table" => @table_1997},
46
- {"table" => @table_1998},
40
+ end
41
+
42
+ let(:all_tables) do
43
+ [
44
+ { "table" => table1996 },
45
+ { "table" => table1997 },
46
+ { "table" => table1998 }
47
47
  ]
48
48
  end
49
49
 
50
50
  it 'scrape single table contents' do
51
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
52
- Yasuri::TextNode.new('./td[1]', "title"),
53
- Yasuri::TextNode.new('./td[2]', "pub_date"),
54
- ])
55
- expected = @table_1996
56
- actual = node.inject(@agent, @page)
51
+ node = Yasuri::StructNode.new(
52
+ '/html/body/table[1]/tr', "table", [
53
+ Yasuri::TextNode.new('./td[1]', "title"),
54
+ Yasuri::TextNode.new('./td[2]', "pub_date")
55
+ ]
56
+ )
57
+ expected = table1996
58
+ actual = node.scrape(uri_struct)
57
59
  expect(actual).to match expected
58
60
  end
59
61
 
60
62
  it 'return single result without array' do
61
- node = Yasuri::StructNode.new('/html/body/table[1]/tr[1]', "table_first_tr", [
62
- Yasuri::TextNode.new('./td[1]', "title"),
63
- Yasuri::TextNode.new('./td[2]', "pub_date"),
64
- ])
65
- expected = @table_1996.first
66
- actual = node.inject(@agent, @page)
63
+ node = Yasuri::StructNode.new(
64
+ '/html/body/table[1]/tr[1]', "table_first_tr", [
65
+ Yasuri::TextNode.new('./td[1]', "title"),
66
+ Yasuri::TextNode.new('./td[2]', "pub_date")
67
+ ]
68
+ )
69
+ expected = table1996.first
70
+ actual = node.scrape(uri_struct)
67
71
  expect(actual).to match expected
68
72
  end
69
73
 
70
74
  it 'return empty text if no match node' do
71
75
  no_match_xpath = '/html/body/table[1]/t'
72
- node = Yasuri::StructNode.new(no_match_xpath, "table", [
73
- Yasuri::TextNode.new('./td[1]', "title")
74
- ])
75
- actual = node.inject(@agent, @page)
76
+ node = Yasuri::StructNode.new(
77
+ no_match_xpath, "table", [
78
+ Yasuri::TextNode.new('./td[1]', "title")
79
+ ]
80
+ )
81
+ actual = node.scrape(uri_struct)
76
82
  expect(actual).to be_empty
77
83
  end
78
84
 
79
85
  it 'fail with invalid xpath' do
80
86
  invalid_xpath = '/html/body/table[1]/table[1]/tr['
81
- node = Yasuri::StructNode.new(invalid_xpath, "table", [
82
- Yasuri::TextNode.new('./td[1]', "title")
83
- ])
84
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
87
+ node = Yasuri::StructNode.new(
88
+ invalid_xpath, "table", [
89
+ Yasuri::TextNode.new('./td[1]', "title")
90
+ ]
91
+ )
92
+ expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
85
93
  end
86
94
 
87
95
  it 'fail with invalid xpath in children' do
88
96
  invalid_xpath = './td[1]['
89
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
90
- Yasuri::TextNode.new(invalid_xpath, "title"),
91
- Yasuri::TextNode.new('./td[2]', "pub_date"),
92
- ])
93
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
97
+ node = Yasuri::StructNode.new(
98
+ '/html/body/table[1]/tr', "table", [
99
+ Yasuri::TextNode.new(invalid_xpath, "title"),
100
+ Yasuri::TextNode.new('./td[2]', "pub_date")
101
+ ]
102
+ )
103
+ expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
94
104
  end
95
105
 
96
106
  it 'scrape all tables' do
97
- node = Yasuri::StructNode.new('/html/body/table', "tables", [
98
- Yasuri::StructNode.new('./tr', "table", [
99
- Yasuri::TextNode.new('./td[1]', "title"),
100
- Yasuri::TextNode.new('./td[2]', "pub_date"),
101
- ])
102
- ])
103
- expected = @all_tables
104
- actual = node.inject(@agent, @page)
107
+ node = Yasuri::StructNode.new(
108
+ '/html/body/table', "tables", [
109
+ Yasuri::StructNode.new(
110
+ './tr', "table", [
111
+ Yasuri::TextNode.new('./td[1]', "title"),
112
+ Yasuri::TextNode.new('./td[2]', "pub_date")
113
+ ]
114
+ )
115
+ ]
116
+ )
117
+ expected = all_tables
118
+ actual = node.scrape(uri_struct)
105
119
  expect(actual).to match expected
106
120
  end
107
121
 
@@ -112,65 +126,71 @@ describe 'Yasuri' do
112
126
  text_pub_date './td[2]'
113
127
  end
114
128
  end
115
- original = Yasuri::StructNode.new('/html/body/table', "tables", [
116
- Yasuri::StructNode.new('./tr', "table", [
117
- Yasuri::TextNode.new('./td[1]', "title"),
118
- Yasuri::TextNode.new('./td[2]', "pub_date"),
119
- ])
120
- ])
121
- compare_generated_vs_original(generated, original, @page)
129
+ original = Yasuri::StructNode.new(
130
+ '/html/body/table', "tables", [
131
+ Yasuri::StructNode.new(
132
+ './tr', "table", [
133
+ Yasuri::TextNode.new('./td[1]', "title"),
134
+ Yasuri::TextNode.new('./td[2]', "pub_date")
135
+ ]
136
+ )
137
+ ]
138
+ )
139
+ compare_generated_vs_original(generated, original, uri_struct)
122
140
  end
123
141
 
124
142
  it 'return child node as symbol' do
125
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
126
- Yasuri::TextNode.new('./td[1]', "title"),
127
- Yasuri::TextNode.new('./td[2]', "pub_date"),
128
- ])
129
- expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
- actual = node.inject(@agent, @page, symbolize_names:true)
143
+ node = Yasuri::StructNode.new(
144
+ '/html/body/table[1]/tr', "table", [
145
+ Yasuri::TextNode.new('./td[1]', "title"),
146
+ Yasuri::TextNode.new('./td[2]', "pub_date")
147
+ ]
148
+ )
149
+ expected = table1996.map { |h| h.transform_keys(&:to_sym) }
150
+ actual = node.scrape(uri_struct, symbolize_names: true)
131
151
  expect(actual).to match expected
132
152
  end
133
-
134
153
  end
135
154
 
136
155
  describe '::StructNode::Links' do
137
- before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
141
- @table = [
156
+ let(:uri_struct) { "#{uri}/struct/structual_links.html" }
157
+ let(:table) do
158
+ [
142
159
  { "title" => "Child01,02",
143
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
160
+ "child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }] },
144
161
 
145
162
  { "title" => "Child01,02,03",
146
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
163
+ "child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }, { "p" => "Child 03 page." }] }
147
164
  ]
148
165
  end
149
166
 
150
167
  it 'return child node in links inside struct' do
151
- node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
- Yasuri::TextNode.new('./td[1]', "title"),
153
- Yasuri::LinksNode.new('./td/a', "child", [
154
- Yasuri::TextNode.new('/html/body/p', "p"),
155
- ])
156
- ])
157
- expected = @table
158
- actual = node.inject(@agent, @page)
168
+ node = Yasuri::StructNode.new(
169
+ '/html/body/table/tr', "table", [
170
+ Yasuri::TextNode.new('./td[1]', "title"),
171
+ Yasuri::LinksNode.new(
172
+ './td/a', "child", [
173
+ Yasuri::TextNode.new('/html/body/p', "p")
174
+ ]
175
+ )
176
+ ]
177
+ )
178
+ expected = table
179
+ actual = node.scrape(uri_struct)
159
180
  expect(actual).to match expected
160
181
  end
161
- end # descrive
182
+ end
162
183
 
163
184
  describe '::StructNode::Pages' do
164
- before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
- end
185
+ let(:uri_struct) { "#{uri}/struct/structual_text.html" }
168
186
 
169
187
  it 'not supported' do
170
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
- Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
- ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
188
+ node = Yasuri::StructNode.new(
189
+ '/html/body/table[1]/tr', "table", [
190
+ Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
191
+ ]
192
+ )
193
+ expect { node.scrape(uri_struct) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
194
  end
175
195
  end
176
196
  end
@@ -1,78 +1,68 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
2
 
4
3
  require_relative 'spec_helper'
5
4
 
6
- ########
7
- # Text #
8
- ########
9
5
  describe 'Yasuri' do
10
6
  include_context 'httpserver'
11
7
 
12
- before do
13
- @agent = Mechanize.new
14
- @index_page = @agent.get(uri)
15
- end
16
-
17
8
  describe '::TextNode' do
18
- before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
19
-
20
9
  it 'scrape text text <p>Hello,Yasuri</p>' do
21
- actual = @node.inject(@agent, @index_page)
10
+ node = Yasuri::TextNode.new('/html/body/p[1]', "title")
11
+ actual = node.scrape(uri)
22
12
  expect(actual).to eq "Hello,Yasuri"
23
13
  end
24
14
 
25
15
  it 'return empty text if no match node' do
26
16
  no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
27
- actual = no_match_node.inject(@agent, @index_page)
17
+ actual = no_match_node.scrape(uri)
28
18
  expect(actual).to be_empty
29
19
  end
30
20
 
31
21
  it 'fail with invalid xpath' do
32
22
  invalid_xpath = '/html/body/no_match_node['
33
23
  node = Yasuri::TextNode.new(invalid_xpath, "title")
34
- expect { node.inject(@agent, @index_page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
24
+ expect { node.scrape(uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
35
25
  end
36
26
 
37
27
  it "can be defined by DSL, return single TextNode title" do
38
28
  generated = Yasuri.text_title '/html/body/p[1]'
39
- original = Yasuri::TextNode.new('/html/body/p[1]', "title")
40
- compare_generated_vs_original(generated, original, @index_page)
29
+ original = Yasuri::TextNode.new('/html/body/p[1]', "title")
30
+ compare_generated_vs_original(generated, original, uri)
41
31
  end
42
32
 
43
- it "can be truncated with regexp" do
44
- node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
45
- actual = node.inject(@agent, @index_page)
33
+ it "can truncate head by regexp" do
34
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /^[^,]+/
35
+ actual = node.scrape(uri)
46
36
  expect(actual).to eq "Hello"
47
37
  end
48
38
 
49
- it "return first captured if matched given capture pattern" do
50
- node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
51
- actual = node.inject(@agent, @index_page)
52
- expect(actual).to eq "ello,Yasur"
39
+ it "can truncate tail by regexp" do
40
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /[^,]+$/
41
+ actual = node.scrape(uri)
42
+ expect(actual).to eq "Yasuri"
53
43
  end
54
44
 
55
- it "can be truncated with regexp" do
56
- node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
57
- actual = node.inject(@agent, @index_page)
58
- expect(actual).to eq "Yasuri"
45
+ it "return first captured if matched given capture pattern" do
46
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /H(.+)i/
47
+ actual = node.scrape(uri)
48
+ expect(actual).to eq "ello,Yasur"
59
49
  end
60
50
 
61
51
  it "return empty string if truncated with no match to regexp" do
62
- node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
63
- actual = node.inject(@agent, @index_page)
52
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /^hoge/
53
+ actual = node.scrape(uri)
64
54
  expect(actual).to be_empty
65
55
  end
66
56
 
67
57
  it "return symbol method applied string" do
68
58
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
69
- actual = node.inject(@agent, @index_page)
59
+ actual = node.scrape(uri)
70
60
  expect(actual).to eq "HELLO,YASURI"
71
61
  end
72
62
 
73
63
  it "return apply multi arguments" do
74
- node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
75
- actual = node.inject(@agent, @index_page)
64
+ node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate: /H(.+)i/
65
+ actual = node.scrape(uri)
76
66
  expect(actual).to eq "ELLO,YASUR"
77
67
  end
78
68
  end
data/yasuri.gemspec CHANGED
@@ -1,31 +1,38 @@
1
- # coding: utf-8
1
+
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'yasuri/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "yasuri"
8
- spec.version = Yasuri::VERSION
9
- spec.authors = ["TAC"]
10
- spec.email = ["tac@tac42.net"]
11
- spec.summary = %q{Yasuri is easy scraping library.}
12
- spec.description = %q{Yasuri is an easy web-scraping library for supporting "Mechanize".}
13
- spec.homepage = "https://github.com/tac0x2a/yasuri"
14
- spec.license = "MIT"
7
+ spec.name = 'yasuri'
8
+ spec.version = Yasuri::VERSION
9
+ spec.authors = ['TAC']
10
+ spec.email = ['tac@tac42.net']
11
+ spec.summary = %q{Yasuri is easy scraping library.}
12
+ spec.description = %q{Yasuri is an easy web-scraping library for supporting 'Mechanize'.}
13
+ spec.homepage = 'https://github.com/tac0x2a/yasuri'
14
+ spec.license = 'MIT'
15
15
 
16
- spec.files = `git ls-files -z`.split("\x0")
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
18
+ spec.bindir = 'exe'
19
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
+ spec.require_paths = ['lib']
21
+ spec.required_ruby_version = '>= 2.7.0'
20
22
 
21
- spec.add_development_dependency "bundler"
22
- spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rspec"
24
- spec.add_development_dependency "fuubar"
25
- spec.add_development_dependency "glint"
26
- spec.add_development_dependency "coveralls"
27
- spec.add_development_dependency "simplecov"
28
- spec.add_development_dependency "codeclimate-test-reporter"
23
+ spec.add_development_dependency 'bundler'
24
+ spec.add_development_dependency 'codeclimate-test-reporter'
25
+ spec.add_development_dependency 'coveralls'
26
+ spec.add_development_dependency 'fuubar'
27
+ spec.add_development_dependency 'glint'
28
+ spec.add_development_dependency 'rake'
29
+ spec.add_development_dependency 'rspec'
30
+ spec.add_development_dependency 'rubocop'
31
+ spec.add_development_dependency 'rubocop-performance'
32
+ spec.add_development_dependency 'rubocop-rspec'
33
+ spec.add_development_dependency 'rubocop-rubycw'
34
+ spec.add_development_dependency 'simplecov'
29
35
 
30
- spec.add_dependency "mechanize"
36
+ spec.add_dependency 'mechanize'
37
+ spec.add_dependency 'thor'
31
38
  end