yasuri 2.0.13 → 3.3.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,107 +1,121 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- ##########
7
- # Struct #
8
- ##########
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::StructNode' do
13
- before do
14
- @agent = Mechanize.new
15
- @page = @agent.get(uri + "/struct/structual_text.html")
16
-
17
- @table_1996 = [
18
- { "title" => "The Perfect Insider",
8
+ let(:uri_struct) { "#{uri}/struct/structual_text.html" }
9
+ let(:table1996) do
10
+ [
11
+ { "title" => "The Perfect Insider",
19
12
  "pub_date" => "1996/4/5" },
20
- { "title" => "Doctors in Isolated Room",
13
+ { "title" => "Doctors in Isolated Room",
21
14
  "pub_date" => "1996/7/5" },
22
- { "title" => "Mathematical Goodbye",
23
- "pub_date" => "1996/9/5" },
15
+ { "title" => "Mathematical Goodbye",
16
+ "pub_date" => "1996/9/5" }
24
17
  ]
25
- @table_1997 = [
26
- { "title" => "Jack the Poetical Private",
18
+ end
19
+ let(:table1997) do
20
+ [
21
+ { "title" => "Jack the Poetical Private",
27
22
  "pub_date" => "1997/1/5" },
28
- { "title" => "Who Inside",
23
+ { "title" => "Who Inside",
29
24
  "pub_date" => "1997/4/5" },
30
- { "title" => "Illusion Acts Like Magic",
31
- "pub_date" => "1997/10/5" },
25
+ { "title" => "Illusion Acts Like Magic",
26
+ "pub_date" => "1997/10/5" }
32
27
  ]
33
- @table_1998 = [
34
- { "title" => "Replaceable Summer",
28
+ end
29
+ let(:table1998) do
30
+ [
31
+ { "title" => "Replaceable Summer",
35
32
  "pub_date" => "1998/1/7" },
36
- { "title" => "Switch Back",
33
+ { "title" => "Switch Back",
37
34
  "pub_date" => "1998/4/5" },
38
- { "title" => "Numerical Models",
35
+ { "title" => "Numerical Models",
39
36
  "pub_date" => "1998/7/5" },
40
- { "title" => "The Perfect Outsider",
41
- "pub_date" => "1998/10/5" },
37
+ { "title" => "The Perfect Outsider",
38
+ "pub_date" => "1998/10/5" }
42
39
  ]
43
- @all_tables = [
44
- {"table" => @table_1996},
45
- {"table" => @table_1997},
46
- {"table" => @table_1998},
40
+ end
41
+
42
+ let(:all_tables) do
43
+ [
44
+ { "table" => table1996 },
45
+ { "table" => table1997 },
46
+ { "table" => table1998 }
47
47
  ]
48
48
  end
49
49
 
50
50
  it 'scrape single table contents' do
51
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
52
- Yasuri::TextNode.new('./td[1]', "title"),
53
- Yasuri::TextNode.new('./td[2]', "pub_date"),
54
- ])
55
- expected = @table_1996
56
- actual = node.inject(@agent, @page)
51
+ node = Yasuri::StructNode.new(
52
+ '/html/body/table[1]/tr', "table", [
53
+ Yasuri::TextNode.new('./td[1]', "title"),
54
+ Yasuri::TextNode.new('./td[2]', "pub_date")
55
+ ]
56
+ )
57
+ expected = table1996
58
+ actual = node.scrape(uri_struct)
57
59
  expect(actual).to match expected
58
60
  end
59
61
 
60
62
  it 'return single result without array' do
61
- node = Yasuri::StructNode.new('/html/body/table[1]/tr[1]', "table_first_tr", [
62
- Yasuri::TextNode.new('./td[1]', "title"),
63
- Yasuri::TextNode.new('./td[2]', "pub_date"),
64
- ])
65
- expected = @table_1996.first
66
- actual = node.inject(@agent, @page)
63
+ node = Yasuri::StructNode.new(
64
+ '/html/body/table[1]/tr[1]', "table_first_tr", [
65
+ Yasuri::TextNode.new('./td[1]', "title"),
66
+ Yasuri::TextNode.new('./td[2]', "pub_date")
67
+ ]
68
+ )
69
+ expected = table1996.first
70
+ actual = node.scrape(uri_struct)
67
71
  expect(actual).to match expected
68
72
  end
69
73
 
70
74
  it 'return empty text if no match node' do
71
75
  no_match_xpath = '/html/body/table[1]/t'
72
- node = Yasuri::StructNode.new(no_match_xpath, "table", [
73
- Yasuri::TextNode.new('./td[1]', "title")
74
- ])
75
- actual = node.inject(@agent, @page)
76
+ node = Yasuri::StructNode.new(
77
+ no_match_xpath, "table", [
78
+ Yasuri::TextNode.new('./td[1]', "title")
79
+ ]
80
+ )
81
+ actual = node.scrape(uri_struct)
76
82
  expect(actual).to be_empty
77
83
  end
78
84
 
79
85
  it 'fail with invalid xpath' do
80
86
  invalid_xpath = '/html/body/table[1]/table[1]/tr['
81
- node = Yasuri::StructNode.new(invalid_xpath, "table", [
82
- Yasuri::TextNode.new('./td[1]', "title")
83
- ])
84
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
87
+ node = Yasuri::StructNode.new(
88
+ invalid_xpath, "table", [
89
+ Yasuri::TextNode.new('./td[1]', "title")
90
+ ]
91
+ )
92
+ expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
85
93
  end
86
94
 
87
95
  it 'fail with invalid xpath in children' do
88
96
  invalid_xpath = './td[1]['
89
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
90
- Yasuri::TextNode.new(invalid_xpath, "title"),
91
- Yasuri::TextNode.new('./td[2]', "pub_date"),
92
- ])
93
- expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
97
+ node = Yasuri::StructNode.new(
98
+ '/html/body/table[1]/tr', "table", [
99
+ Yasuri::TextNode.new(invalid_xpath, "title"),
100
+ Yasuri::TextNode.new('./td[2]', "pub_date")
101
+ ]
102
+ )
103
+ expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
94
104
  end
95
105
 
96
106
  it 'scrape all tables' do
97
- node = Yasuri::StructNode.new('/html/body/table', "tables", [
98
- Yasuri::StructNode.new('./tr', "table", [
99
- Yasuri::TextNode.new('./td[1]', "title"),
100
- Yasuri::TextNode.new('./td[2]', "pub_date"),
101
- ])
102
- ])
103
- expected = @all_tables
104
- actual = node.inject(@agent, @page)
107
+ node = Yasuri::StructNode.new(
108
+ '/html/body/table', "tables", [
109
+ Yasuri::StructNode.new(
110
+ './tr', "table", [
111
+ Yasuri::TextNode.new('./td[1]', "title"),
112
+ Yasuri::TextNode.new('./td[2]', "pub_date")
113
+ ]
114
+ )
115
+ ]
116
+ )
117
+ expected = all_tables
118
+ actual = node.scrape(uri_struct)
105
119
  expect(actual).to match expected
106
120
  end
107
121
 
@@ -112,65 +126,71 @@ describe 'Yasuri' do
112
126
  text_pub_date './td[2]'
113
127
  end
114
128
  end
115
- original = Yasuri::StructNode.new('/html/body/table', "tables", [
116
- Yasuri::StructNode.new('./tr', "table", [
117
- Yasuri::TextNode.new('./td[1]', "title"),
118
- Yasuri::TextNode.new('./td[2]', "pub_date"),
119
- ])
120
- ])
121
- compare_generated_vs_original(generated, original, @page)
129
+ original = Yasuri::StructNode.new(
130
+ '/html/body/table', "tables", [
131
+ Yasuri::StructNode.new(
132
+ './tr', "table", [
133
+ Yasuri::TextNode.new('./td[1]', "title"),
134
+ Yasuri::TextNode.new('./td[2]', "pub_date")
135
+ ]
136
+ )
137
+ ]
138
+ )
139
+ compare_generated_vs_original(generated, original, uri_struct)
122
140
  end
123
141
 
124
142
  it 'return child node as symbol' do
125
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
126
- Yasuri::TextNode.new('./td[1]', "title"),
127
- Yasuri::TextNode.new('./td[2]', "pub_date"),
128
- ])
129
- expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
- actual = node.inject(@agent, @page, symbolize_names:true)
143
+ node = Yasuri::StructNode.new(
144
+ '/html/body/table[1]/tr', "table", [
145
+ Yasuri::TextNode.new('./td[1]', "title"),
146
+ Yasuri::TextNode.new('./td[2]', "pub_date")
147
+ ]
148
+ )
149
+ expected = table1996.map { |h| h.transform_keys(&:to_sym) }
150
+ actual = node.scrape(uri_struct, symbolize_names: true)
131
151
  expect(actual).to match expected
132
152
  end
133
-
134
153
  end
135
154
 
136
155
  describe '::StructNode::Links' do
137
- before do
138
- @agent = Mechanize.new
139
- @page = @agent.get(uri + "/struct/structual_links.html")
140
-
141
- @table = [
156
+ let(:uri_struct) { "#{uri}/struct/structual_links.html" }
157
+ let(:table) do
158
+ [
142
159
  { "title" => "Child01,02",
143
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
160
+ "child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }] },
144
161
 
145
162
  { "title" => "Child01,02,03",
146
- "child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
163
+ "child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }, { "p" => "Child 03 page." }] }
147
164
  ]
148
165
  end
149
166
 
150
167
  it 'return child node in links inside struct' do
151
- node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
152
- Yasuri::TextNode.new('./td[1]', "title"),
153
- Yasuri::LinksNode.new('./td/a', "child", [
154
- Yasuri::TextNode.new('/html/body/p', "p"),
155
- ])
156
- ])
157
- expected = @table
158
- actual = node.inject(@agent, @page)
168
+ node = Yasuri::StructNode.new(
169
+ '/html/body/table/tr', "table", [
170
+ Yasuri::TextNode.new('./td[1]', "title"),
171
+ Yasuri::LinksNode.new(
172
+ './td/a', "child", [
173
+ Yasuri::TextNode.new('/html/body/p', "p")
174
+ ]
175
+ )
176
+ ]
177
+ )
178
+ expected = table
179
+ actual = node.scrape(uri_struct)
159
180
  expect(actual).to match expected
160
181
  end
161
- end # descrive
182
+ end
162
183
 
163
184
  describe '::StructNode::Pages' do
164
- before do
165
- @agent = Mechanize.new
166
- @page = @agent.get(uri + "/struct/structual_text.html") #dummy
167
- end
185
+ let(:uri_struct) { "#{uri}/struct/structual_text.html" }
168
186
 
169
187
  it 'not supported' do
170
- node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
171
- Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
172
- ])
173
- expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
188
+ node = Yasuri::StructNode.new(
189
+ '/html/body/table[1]/tr', "table", [
190
+ Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
191
+ ]
192
+ )
193
+ expect { node.scrape(uri_struct) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
174
194
  end
175
195
  end
176
196
  end
@@ -1,78 +1,68 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
2
 
4
3
  require_relative 'spec_helper'
5
4
 
6
- ########
7
- # Text #
8
- ########
9
5
  describe 'Yasuri' do
10
6
  include_context 'httpserver'
11
7
 
12
- before do
13
- @agent = Mechanize.new
14
- @index_page = @agent.get(uri)
15
- end
16
-
17
8
  describe '::TextNode' do
18
- before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
19
-
20
9
  it 'scrape text text <p>Hello,Yasuri</p>' do
21
- actual = @node.inject(@agent, @index_page)
10
+ node = Yasuri::TextNode.new('/html/body/p[1]', "title")
11
+ actual = node.scrape(uri)
22
12
  expect(actual).to eq "Hello,Yasuri"
23
13
  end
24
14
 
25
15
  it 'return empty text if no match node' do
26
16
  no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
27
- actual = no_match_node.inject(@agent, @index_page)
17
+ actual = no_match_node.scrape(uri)
28
18
  expect(actual).to be_empty
29
19
  end
30
20
 
31
21
  it 'fail with invalid xpath' do
32
22
  invalid_xpath = '/html/body/no_match_node['
33
23
  node = Yasuri::TextNode.new(invalid_xpath, "title")
34
- expect { node.inject(@agent, @index_page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
24
+ expect { node.scrape(uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
35
25
  end
36
26
 
37
27
  it "can be defined by DSL, return single TextNode title" do
38
28
  generated = Yasuri.text_title '/html/body/p[1]'
39
- original = Yasuri::TextNode.new('/html/body/p[1]', "title")
40
- compare_generated_vs_original(generated, original, @index_page)
29
+ original = Yasuri::TextNode.new('/html/body/p[1]', "title")
30
+ compare_generated_vs_original(generated, original, uri)
41
31
  end
42
32
 
43
- it "can be truncated with regexp" do
44
- node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
45
- actual = node.inject(@agent, @index_page)
33
+ it "can truncate head by regexp" do
34
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /^[^,]+/
35
+ actual = node.scrape(uri)
46
36
  expect(actual).to eq "Hello"
47
37
  end
48
38
 
49
- it "return first captured if matched given capture pattern" do
50
- node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
51
- actual = node.inject(@agent, @index_page)
52
- expect(actual).to eq "ello,Yasur"
39
+ it "can truncate tail by regexp" do
40
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /[^,]+$/
41
+ actual = node.scrape(uri)
42
+ expect(actual).to eq "Yasuri"
53
43
  end
54
44
 
55
- it "can be truncated with regexp" do
56
- node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
57
- actual = node.inject(@agent, @index_page)
58
- expect(actual).to eq "Yasuri"
45
+ it "return first captured if matched given capture pattern" do
46
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /H(.+)i/
47
+ actual = node.scrape(uri)
48
+ expect(actual).to eq "ello,Yasur"
59
49
  end
60
50
 
61
51
  it "return empty string if truncated with no match to regexp" do
62
- node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
63
- actual = node.inject(@agent, @index_page)
52
+ node = Yasuri.text_title '/html/body/p[1]', truncate: /^hoge/
53
+ actual = node.scrape(uri)
64
54
  expect(actual).to be_empty
65
55
  end
66
56
 
67
57
  it "return symbol method applied string" do
68
58
  node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
69
- actual = node.inject(@agent, @index_page)
59
+ actual = node.scrape(uri)
70
60
  expect(actual).to eq "HELLO,YASURI"
71
61
  end
72
62
 
73
63
  it "return apply multi arguments" do
74
- node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
75
- actual = node.inject(@agent, @index_page)
64
+ node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate: /H(.+)i/
65
+ actual = node.scrape(uri)
76
66
  expect(actual).to eq "ELLO,YASUR"
77
67
  end
78
68
  end
data/yasuri.gemspec CHANGED
@@ -1,31 +1,38 @@
1
- # coding: utf-8
1
+
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'yasuri/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "yasuri"
8
- spec.version = Yasuri::VERSION
9
- spec.authors = ["TAC"]
10
- spec.email = ["tac@tac42.net"]
11
- spec.summary = %q{Yasuri is easy scraping library.}
12
- spec.description = %q{Yasuri is an easy web-scraping library for supporting "Mechanize".}
13
- spec.homepage = "https://github.com/tac0x2a/yasuri"
14
- spec.license = "MIT"
7
+ spec.name = 'yasuri'
8
+ spec.version = Yasuri::VERSION
9
+ spec.authors = ['TAC']
10
+ spec.email = ['tac@tac42.net']
11
+ spec.summary = %q{Yasuri is easy scraping library.}
12
+ spec.description = %q{Yasuri is an easy web-scraping library for supporting 'Mechanize'.}
13
+ spec.homepage = 'https://github.com/tac0x2a/yasuri'
14
+ spec.license = 'MIT'
15
15
 
16
- spec.files = `git ls-files -z`.split("\x0")
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
18
+ spec.bindir = 'exe'
19
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
+ spec.require_paths = ['lib']
21
+ spec.required_ruby_version = '>= 2.7.0'
20
22
 
21
- spec.add_development_dependency "bundler"
22
- spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rspec"
24
- spec.add_development_dependency "fuubar"
25
- spec.add_development_dependency "glint"
26
- spec.add_development_dependency "coveralls"
27
- spec.add_development_dependency "simplecov"
28
- spec.add_development_dependency "codeclimate-test-reporter"
23
+ spec.add_development_dependency 'bundler'
24
+ spec.add_development_dependency 'codeclimate-test-reporter'
25
+ spec.add_development_dependency 'coveralls'
26
+ spec.add_development_dependency 'fuubar'
27
+ spec.add_development_dependency 'glint'
28
+ spec.add_development_dependency 'rake'
29
+ spec.add_development_dependency 'rspec'
30
+ spec.add_development_dependency 'rubocop'
31
+ spec.add_development_dependency 'rubocop-performance'
32
+ spec.add_development_dependency 'rubocop-rspec'
33
+ spec.add_development_dependency 'rubocop-rubycw'
34
+ spec.add_development_dependency 'simplecov'
29
35
 
30
- spec.add_dependency "mechanize"
36
+ spec.add_dependency 'mechanize'
37
+ spec.add_dependency 'thor'
31
38
  end