yasuri 2.0.13 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/.ruby-version +1 -1
- data/README.md +82 -31
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -75
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +17 -14
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +13 -8
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -11
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +92 -60
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +108 -19
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
@@ -1,107 +1,121 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
##########
|
7
|
-
# Struct #
|
8
|
-
##########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::StructNode' do
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
@table_1996 = [
|
18
|
-
{ "title" => "The Perfect Insider",
|
8
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
9
|
+
let(:table1996) do
|
10
|
+
[
|
11
|
+
{ "title" => "The Perfect Insider",
|
19
12
|
"pub_date" => "1996/4/5" },
|
20
|
-
{ "title"
|
13
|
+
{ "title" => "Doctors in Isolated Room",
|
21
14
|
"pub_date" => "1996/7/5" },
|
22
|
-
{ "title"
|
23
|
-
"pub_date" => "1996/9/5" }
|
15
|
+
{ "title" => "Mathematical Goodbye",
|
16
|
+
"pub_date" => "1996/9/5" }
|
24
17
|
]
|
25
|
-
|
26
|
-
|
18
|
+
end
|
19
|
+
let(:table1997) do
|
20
|
+
[
|
21
|
+
{ "title" => "Jack the Poetical Private",
|
27
22
|
"pub_date" => "1997/1/5" },
|
28
|
-
{ "title"
|
23
|
+
{ "title" => "Who Inside",
|
29
24
|
"pub_date" => "1997/4/5" },
|
30
|
-
{ "title"
|
31
|
-
"pub_date" => "1997/10/5" }
|
25
|
+
{ "title" => "Illusion Acts Like Magic",
|
26
|
+
"pub_date" => "1997/10/5" }
|
32
27
|
]
|
33
|
-
|
34
|
-
|
28
|
+
end
|
29
|
+
let(:table1998) do
|
30
|
+
[
|
31
|
+
{ "title" => "Replaceable Summer",
|
35
32
|
"pub_date" => "1998/1/7" },
|
36
|
-
{ "title"
|
33
|
+
{ "title" => "Switch Back",
|
37
34
|
"pub_date" => "1998/4/5" },
|
38
|
-
{ "title"
|
35
|
+
{ "title" => "Numerical Models",
|
39
36
|
"pub_date" => "1998/7/5" },
|
40
|
-
{ "title"
|
41
|
-
"pub_date" => "1998/10/5" }
|
37
|
+
{ "title" => "The Perfect Outsider",
|
38
|
+
"pub_date" => "1998/10/5" }
|
42
39
|
]
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
end
|
41
|
+
|
42
|
+
let(:all_tables) do
|
43
|
+
[
|
44
|
+
{ "table" => table1996 },
|
45
|
+
{ "table" => table1997 },
|
46
|
+
{ "table" => table1998 }
|
47
47
|
]
|
48
48
|
end
|
49
49
|
|
50
50
|
it 'scrape single table contents' do
|
51
|
-
node = Yasuri::StructNode.new(
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
51
|
+
node = Yasuri::StructNode.new(
|
52
|
+
'/html/body/table[1]/tr', "table", [
|
53
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
54
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
55
|
+
]
|
56
|
+
)
|
57
|
+
expected = table1996
|
58
|
+
actual = node.scrape(uri_struct)
|
57
59
|
expect(actual).to match expected
|
58
60
|
end
|
59
61
|
|
60
62
|
it 'return single result without array' do
|
61
|
-
node = Yasuri::StructNode.new(
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
63
|
+
node = Yasuri::StructNode.new(
|
64
|
+
'/html/body/table[1]/tr[1]', "table_first_tr", [
|
65
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
66
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
67
|
+
]
|
68
|
+
)
|
69
|
+
expected = table1996.first
|
70
|
+
actual = node.scrape(uri_struct)
|
67
71
|
expect(actual).to match expected
|
68
72
|
end
|
69
73
|
|
70
74
|
it 'return empty text if no match node' do
|
71
75
|
no_match_xpath = '/html/body/table[1]/t'
|
72
|
-
node = Yasuri::StructNode.new(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
+
node = Yasuri::StructNode.new(
|
77
|
+
no_match_xpath, "table", [
|
78
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
79
|
+
]
|
80
|
+
)
|
81
|
+
actual = node.scrape(uri_struct)
|
76
82
|
expect(actual).to be_empty
|
77
83
|
end
|
78
84
|
|
79
85
|
it 'fail with invalid xpath' do
|
80
86
|
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
81
|
-
node = Yasuri::StructNode.new(
|
82
|
-
|
83
|
-
|
84
|
-
|
87
|
+
node = Yasuri::StructNode.new(
|
88
|
+
invalid_xpath, "table", [
|
89
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
90
|
+
]
|
91
|
+
)
|
92
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
85
93
|
end
|
86
94
|
|
87
95
|
it 'fail with invalid xpath in children' do
|
88
96
|
invalid_xpath = './td[1]['
|
89
|
-
node = Yasuri::StructNode.new(
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
97
|
+
node = Yasuri::StructNode.new(
|
98
|
+
'/html/body/table[1]/tr', "table", [
|
99
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
100
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
101
|
+
]
|
102
|
+
)
|
103
|
+
expect { node.scrape(uri_struct) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
94
104
|
end
|
95
105
|
|
96
106
|
it 'scrape all tables' do
|
97
|
-
node = Yasuri::StructNode.new(
|
98
|
-
|
99
|
-
Yasuri::
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
107
|
+
node = Yasuri::StructNode.new(
|
108
|
+
'/html/body/table', "tables", [
|
109
|
+
Yasuri::StructNode.new(
|
110
|
+
'./tr', "table", [
|
111
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
112
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
113
|
+
]
|
114
|
+
)
|
115
|
+
]
|
116
|
+
)
|
117
|
+
expected = all_tables
|
118
|
+
actual = node.scrape(uri_struct)
|
105
119
|
expect(actual).to match expected
|
106
120
|
end
|
107
121
|
|
@@ -112,65 +126,71 @@ describe 'Yasuri' do
|
|
112
126
|
text_pub_date './td[2]'
|
113
127
|
end
|
114
128
|
end
|
115
|
-
original = Yasuri::StructNode.new(
|
116
|
-
|
117
|
-
Yasuri::
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
129
|
+
original = Yasuri::StructNode.new(
|
130
|
+
'/html/body/table', "tables", [
|
131
|
+
Yasuri::StructNode.new(
|
132
|
+
'./tr', "table", [
|
133
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
134
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
135
|
+
]
|
136
|
+
)
|
137
|
+
]
|
138
|
+
)
|
139
|
+
compare_generated_vs_original(generated, original, uri_struct)
|
122
140
|
end
|
123
141
|
|
124
142
|
it 'return child node as symbol' do
|
125
|
-
node = Yasuri::StructNode.new(
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
143
|
+
node = Yasuri::StructNode.new(
|
144
|
+
'/html/body/table[1]/tr', "table", [
|
145
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
146
|
+
Yasuri::TextNode.new('./td[2]', "pub_date")
|
147
|
+
]
|
148
|
+
)
|
149
|
+
expected = table1996.map { |h| h.transform_keys(&:to_sym) }
|
150
|
+
actual = node.scrape(uri_struct, symbolize_names: true)
|
131
151
|
expect(actual).to match expected
|
132
152
|
end
|
133
|
-
|
134
153
|
end
|
135
154
|
|
136
155
|
describe '::StructNode::Links' do
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
@table = [
|
156
|
+
let(:uri_struct) { "#{uri}/struct/structual_links.html" }
|
157
|
+
let(:table) do
|
158
|
+
[
|
142
159
|
{ "title" => "Child01,02",
|
143
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
160
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }] },
|
144
161
|
|
145
162
|
{ "title" => "Child01,02,03",
|
146
|
-
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
163
|
+
"child" => [{ "p" => "Child 01 page." }, { "p" => "Child 02 page." }, { "p" => "Child 03 page." }] }
|
147
164
|
]
|
148
165
|
end
|
149
166
|
|
150
167
|
it 'return child node in links inside struct' do
|
151
|
-
node = Yasuri::StructNode.new(
|
152
|
-
|
153
|
-
|
154
|
-
Yasuri::
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
168
|
+
node = Yasuri::StructNode.new(
|
169
|
+
'/html/body/table/tr', "table", [
|
170
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
171
|
+
Yasuri::LinksNode.new(
|
172
|
+
'./td/a', "child", [
|
173
|
+
Yasuri::TextNode.new('/html/body/p', "p")
|
174
|
+
]
|
175
|
+
)
|
176
|
+
]
|
177
|
+
)
|
178
|
+
expected = table
|
179
|
+
actual = node.scrape(uri_struct)
|
159
180
|
expect(actual).to match expected
|
160
181
|
end
|
161
|
-
end
|
182
|
+
end
|
162
183
|
|
163
184
|
describe '::StructNode::Pages' do
|
164
|
-
|
165
|
-
@agent = Mechanize.new
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
-
end
|
185
|
+
let(:uri_struct) { "#{uri}/struct/structual_text.html" }
|
168
186
|
|
169
187
|
it 'not supported' do
|
170
|
-
node = Yasuri::StructNode.new(
|
171
|
-
|
172
|
-
|
173
|
-
|
188
|
+
node = Yasuri::StructNode.new(
|
189
|
+
'/html/body/table[1]/tr', "table", [
|
190
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
191
|
+
]
|
192
|
+
)
|
193
|
+
expect { node.scrape(uri_struct) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
194
|
end
|
175
195
|
end
|
176
196
|
end
|
@@ -1,78 +1,68 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
2
|
|
4
3
|
require_relative 'spec_helper'
|
5
4
|
|
6
|
-
########
|
7
|
-
# Text #
|
8
|
-
########
|
9
5
|
describe 'Yasuri' do
|
10
6
|
include_context 'httpserver'
|
11
7
|
|
12
|
-
before do
|
13
|
-
@agent = Mechanize.new
|
14
|
-
@index_page = @agent.get(uri)
|
15
|
-
end
|
16
|
-
|
17
8
|
describe '::TextNode' do
|
18
|
-
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
|
-
|
20
9
|
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
-
|
10
|
+
node = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
11
|
+
actual = node.scrape(uri)
|
22
12
|
expect(actual).to eq "Hello,Yasuri"
|
23
13
|
end
|
24
14
|
|
25
15
|
it 'return empty text if no match node' do
|
26
16
|
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
-
actual = no_match_node.
|
17
|
+
actual = no_match_node.scrape(uri)
|
28
18
|
expect(actual).to be_empty
|
29
19
|
end
|
30
20
|
|
31
21
|
it 'fail with invalid xpath' do
|
32
22
|
invalid_xpath = '/html/body/no_match_node['
|
33
23
|
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
-
expect { node.
|
24
|
+
expect { node.scrape(uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
35
25
|
end
|
36
26
|
|
37
27
|
it "can be defined by DSL, return single TextNode title" do
|
38
28
|
generated = Yasuri.text_title '/html/body/p[1]'
|
39
|
-
original
|
40
|
-
compare_generated_vs_original(generated, original,
|
29
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
30
|
+
compare_generated_vs_original(generated, original, uri)
|
41
31
|
end
|
42
32
|
|
43
|
-
it "can
|
44
|
-
node
|
45
|
-
actual = node.
|
33
|
+
it "can truncate head by regexp" do
|
34
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate: /^[^,]+/
|
35
|
+
actual = node.scrape(uri)
|
46
36
|
expect(actual).to eq "Hello"
|
47
37
|
end
|
48
38
|
|
49
|
-
it "
|
50
|
-
node
|
51
|
-
actual = node.
|
52
|
-
expect(actual).to eq "
|
39
|
+
it "can truncate tail by regexp" do
|
40
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate: /[^,]+$/
|
41
|
+
actual = node.scrape(uri)
|
42
|
+
expect(actual).to eq "Yasuri"
|
53
43
|
end
|
54
44
|
|
55
|
-
it "
|
56
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate
|
57
|
-
actual = node.
|
58
|
-
expect(actual).to eq "
|
45
|
+
it "return first captured if matched given capture pattern" do
|
46
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate: /H(.+)i/
|
47
|
+
actual = node.scrape(uri)
|
48
|
+
expect(actual).to eq "ello,Yasur"
|
59
49
|
end
|
60
50
|
|
61
51
|
it "return empty string if truncated with no match to regexp" do
|
62
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate
|
63
|
-
actual = node.
|
52
|
+
node = Yasuri.text_title '/html/body/p[1]', truncate: /^hoge/
|
53
|
+
actual = node.scrape(uri)
|
64
54
|
expect(actual).to be_empty
|
65
55
|
end
|
66
56
|
|
67
57
|
it "return symbol method applied string" do
|
68
58
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
69
|
-
actual = node.
|
59
|
+
actual = node.scrape(uri)
|
70
60
|
expect(actual).to eq "HELLO,YASURI"
|
71
61
|
end
|
72
62
|
|
73
63
|
it "return apply multi arguments" do
|
74
|
-
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate
|
75
|
-
actual = node.
|
64
|
+
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate: /H(.+)i/
|
65
|
+
actual = node.scrape(uri)
|
76
66
|
expect(actual).to eq "ELLO,YASUR"
|
77
67
|
end
|
78
68
|
end
|
data/yasuri.gemspec
CHANGED
@@ -1,31 +1,38 @@
|
|
1
|
-
|
1
|
+
|
2
2
|
lib = File.expand_path('../lib', __FILE__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'yasuri/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name
|
8
|
-
spec.version
|
9
|
-
spec.authors
|
10
|
-
spec.email
|
11
|
-
spec.summary
|
12
|
-
spec.description
|
13
|
-
spec.homepage
|
14
|
-
spec.license
|
7
|
+
spec.name = 'yasuri'
|
8
|
+
spec.version = Yasuri::VERSION
|
9
|
+
spec.authors = ['TAC']
|
10
|
+
spec.email = ['tac@tac42.net']
|
11
|
+
spec.summary = %q{Yasuri is easy scraping library.}
|
12
|
+
spec.description = %q{Yasuri is an easy web-scraping library for supporting 'Mechanize'.}
|
13
|
+
spec.homepage = 'https://github.com/tac0x2a/yasuri'
|
14
|
+
spec.license = 'MIT'
|
15
15
|
|
16
|
-
spec.files
|
17
|
-
spec.executables
|
18
|
-
spec.
|
19
|
-
spec.
|
16
|
+
spec.files = `git ls-files -z`.split("\x0")
|
17
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
18
|
+
spec.bindir = 'exe'
|
19
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
20
|
+
spec.require_paths = ['lib']
|
21
|
+
spec.required_ruby_version = '>= 2.7.0'
|
20
22
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
23
|
-
spec.add_development_dependency
|
24
|
-
spec.add_development_dependency
|
25
|
-
spec.add_development_dependency
|
26
|
-
spec.add_development_dependency
|
27
|
-
spec.add_development_dependency
|
28
|
-
spec.add_development_dependency
|
23
|
+
spec.add_development_dependency 'bundler'
|
24
|
+
spec.add_development_dependency 'codeclimate-test-reporter'
|
25
|
+
spec.add_development_dependency 'coveralls'
|
26
|
+
spec.add_development_dependency 'fuubar'
|
27
|
+
spec.add_development_dependency 'glint'
|
28
|
+
spec.add_development_dependency 'rake'
|
29
|
+
spec.add_development_dependency 'rspec'
|
30
|
+
spec.add_development_dependency 'rubocop'
|
31
|
+
spec.add_development_dependency 'rubocop-performance'
|
32
|
+
spec.add_development_dependency 'rubocop-rspec'
|
33
|
+
spec.add_development_dependency 'rubocop-rubycw'
|
34
|
+
spec.add_development_dependency 'simplecov'
|
29
35
|
|
30
|
-
spec.add_dependency
|
36
|
+
spec.add_dependency 'mechanize'
|
37
|
+
spec.add_dependency 'thor'
|
31
38
|
end
|