yasuri 2.0.12 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
@@ -11,8 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/struct/structual_text.html")
|
14
|
+
@uri = uri + "/struct/structual_text.html"
|
16
15
|
|
17
16
|
@table_1996 = [
|
18
17
|
{ "title" => "The Perfect Insider",
|
@@ -53,7 +52,7 @@ describe 'Yasuri' do
|
|
53
52
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
53
|
])
|
55
54
|
expected = @table_1996
|
56
|
-
actual = node.inject(@agent, @page)
|
55
|
+
actual = node.scrape(@uri)
|
57
56
|
expect(actual).to match expected
|
58
57
|
end
|
59
58
|
|
@@ -63,7 +62,7 @@ describe 'Yasuri' do
|
|
63
62
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
64
63
|
])
|
65
64
|
expected = @table_1996.first
|
66
|
-
actual = node.inject(@agent, @page)
|
65
|
+
actual = node.scrape(@uri)
|
67
66
|
expect(actual).to match expected
|
68
67
|
end
|
69
68
|
|
@@ -72,7 +71,7 @@ describe 'Yasuri' do
|
|
72
71
|
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
73
72
|
Yasuri::TextNode.new('./td[1]', "title")
|
74
73
|
])
|
75
|
-
actual = node.inject(@agent, @page)
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to be_empty
|
77
76
|
end
|
78
77
|
|
@@ -81,7 +80,7 @@ describe 'Yasuri' do
|
|
81
80
|
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
82
81
|
Yasuri::TextNode.new('./td[1]', "title")
|
83
82
|
])
|
84
|
-
expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
83
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
85
84
|
end
|
86
85
|
|
87
86
|
it 'fail with invalid xpath in children' do
|
@@ -90,7 +89,7 @@ describe 'Yasuri' do
|
|
90
89
|
Yasuri::TextNode.new(invalid_xpath, "title"),
|
91
90
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
92
91
|
])
|
93
|
-
expect { node.inject(@agent, @page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
92
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
94
93
|
end
|
95
94
|
|
96
95
|
it 'scrape all tables' do
|
@@ -101,7 +100,7 @@ describe 'Yasuri' do
|
|
101
100
|
])
|
102
101
|
])
|
103
102
|
expected = @all_tables
|
104
|
-
actual = node.inject(@agent, @page)
|
103
|
+
actual = node.scrape(@uri)
|
105
104
|
expect(actual).to match expected
|
106
105
|
end
|
107
106
|
|
@@ -118,7 +117,7 @@ describe 'Yasuri' do
|
|
118
117
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
119
118
|
])
|
120
119
|
])
|
121
|
-
compare_generated_vs_original(generated, original, @page)
|
120
|
+
compare_generated_vs_original(generated, original, @uri)
|
122
121
|
end
|
123
122
|
|
124
123
|
it 'return child node as symbol' do
|
@@ -127,7 +126,7 @@ describe 'Yasuri' do
|
|
127
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
127
|
])
|
129
128
|
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
|
-
actual = node.inject(@agent, @page, symbolize_names:true)
|
129
|
+
actual = node.scrape(@uri, symbolize_names:true)
|
131
130
|
expect(actual).to match expected
|
132
131
|
end
|
133
132
|
|
@@ -135,9 +134,7 @@ describe 'Yasuri' do
|
|
135
134
|
|
136
135
|
describe '::StructNode::Links' do
|
137
136
|
before do
|
138
|
-
@agent = Mechanize.new
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
137
|
+
@uri = uri + "/struct/structual_links.html"
|
141
138
|
@table = [
|
142
139
|
{ "title" => "Child01,02",
|
143
140
|
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
@@ -155,22 +152,21 @@ describe 'Yasuri' do
|
|
155
152
|
])
|
156
153
|
])
|
157
154
|
expected = @table
|
158
|
-
actual = node.inject(@agent, @page)
|
155
|
+
actual = node.scrape(@uri)
|
159
156
|
expect(actual).to match expected
|
160
157
|
end
|
161
158
|
end # descrive
|
162
159
|
|
163
160
|
describe '::StructNode::Pages' do
|
164
161
|
before do
|
165
|
-
@agent = Mechanize.new
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
162
|
+
@uri = uri + "/struct/structual_text.html"
|
167
163
|
end
|
168
164
|
|
169
165
|
it 'not supported' do
|
170
166
|
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
167
|
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
168
|
])
|
173
|
-
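expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")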
|
169
|
+
expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
170
|
end
|
175
171
|
end
|
176
172
|
end
|
@@ -10,69 +10,68 @@ describe 'Yasuri' do
|
|
10
10
|
include_context 'httpserver'
|
11
11
|
|
12
12
|
before do
|
13
|
-
@agent = Mechanize.new
|
14
|
-
@index_page = @agent.get(uri)
|
13
|
+
@uri = uri
|
15
14
|
end
|
16
15
|
|
17
16
|
describe '::TextNode' do
|
18
17
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
18
|
|
20
19
|
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
-
actual = @node.inject(@agent, @index_page)
|
20
|
+
actual = @node.scrape(@uri)
|
22
21
|
expect(actual).to eq "Hello,Yasuri"
|
23
22
|
end
|
24
23
|
|
25
24
|
it 'return empty text if no match node' do
|
26
25
|
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
-
actual = no_match_node.inject(@agent, @index_page)
|
26
|
+
actual = no_match_node.scrape(@uri)
|
28
27
|
expect(actual).to be_empty
|
29
28
|
end
|
30
29
|
|
31
30
|
it 'fail with invalid xpath' do
|
32
31
|
invalid_xpath = '/html/body/no_match_node['
|
33
32
|
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
-
expect { node.inject(@agent, @index_page) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
33
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
35
34
|
end
|
36
35
|
|
37
36
|
it "can be defined by DSL, return single TextNode title" do
|
38
37
|
generated = Yasuri.text_title '/html/body/p[1]'
|
39
38
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
-
compare_generated_vs_original(generated, original, @index_page)
|
39
|
+
compare_generated_vs_original(generated, original, @uri)
|
41
40
|
end
|
42
41
|
|
43
42
|
it "can be truncated with regexp" do
|
44
43
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
45
|
-
actual = node.inject(@agent, @index_page)
|
44
|
+
actual = node.scrape(@uri)
|
46
45
|
expect(actual).to eq "Hello"
|
47
46
|
end
|
48
47
|
|
49
48
|
it "return first captured if matched given capture pattern" do
|
50
49
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
|
51
|
-
actual = node.inject(@agent, @index_page)
|
50
|
+
actual = node.scrape(@uri)
|
52
51
|
expect(actual).to eq "ello,Yasur"
|
53
52
|
end
|
54
53
|
|
55
54
|
it "can be truncated with regexp" do
|
56
55
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
|
57
|
-
actual = node.inject(@agent, @index_page)
|
56
|
+
actual = node.scrape(@uri)
|
58
57
|
expect(actual).to eq "Yasuri"
|
59
58
|
end
|
60
59
|
|
61
60
|
it "return empty string if truncated with no match to regexp" do
|
62
61
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
|
63
|
-
actual = node.inject(@agent, @index_page)
|
62
|
+
actual = node.scrape(@uri)
|
64
63
|
expect(actual).to be_empty
|
65
64
|
end
|
66
65
|
|
67
66
|
it "return symbol method applied string" do
|
68
67
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
69
|
-
actual = node.inject(@agent, @index_page)
|
68
|
+
actual = node.scrape(@uri)
|
70
69
|
expect(actual).to eq "HELLO,YASURI"
|
71
70
|
end
|
72
71
|
|
73
72
|
it "return apply multi arguments" do
|
74
73
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
|
75
|
-
actual = node.inject(@agent, @index_page)
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to eq "ELLO,YASUR"
|
77
76
|
end
|
78
77
|
end
|
data/yasuri.gemspec
CHANGED
@@ -14,12 +14,13 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
17
|
-
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
18
|
+
spec.bindir = "exe"
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
20
|
spec.require_paths = ["lib"]
|
20
21
|
|
21
|
-
spec.add_development_dependency "bundler"
|
22
|
-
spec.add_development_dependency "rake"
|
22
|
+
spec.add_development_dependency "bundler"
|
23
|
+
spec.add_development_dependency "rake"
|
23
24
|
spec.add_development_dependency "rspec"
|
24
25
|
spec.add_development_dependency "fuubar"
|
25
26
|
spec.add_development_dependency "glint"
|
@@ -28,4 +29,5 @@ Gem::Specification.new do |spec|
|
|
28
29
|
spec.add_development_dependency "codeclimate-test-reporter"
|
29
30
|
|
30
31
|
spec.add_dependency "mechanize"
|
32
|
+
spec.add_dependency "thor"
|
31
33
|
end
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.12
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
|
-
autorequire:
|
9
|
-
bindir: bin
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,16 +136,33 @@ dependencies:
|
|
136
136
|
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: thor
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
139
153
|
description: Yasuri is an easy web-scraping library for supporting "Mechanize".
|
140
154
|
email:
|
141
155
|
- tac@tac42.net
|
142
|
-
executables:
|
156
|
+
executables:
|
157
|
+
- yasuri
|
143
158
|
extensions: []
|
144
159
|
extra_rdoc_files: []
|
145
160
|
files:
|
146
161
|
- ".coveralls.yml"
|
162
|
+
- ".github/workflows/ruby.yml"
|
147
163
|
- ".gitignore"
|
148
164
|
- ".rspec"
|
165
|
+
- ".ruby-version"
|
149
166
|
- ".travis.yml"
|
150
167
|
- Gemfile
|
151
168
|
- LICENSE
|
@@ -153,16 +170,26 @@ files:
|
|
153
170
|
- Rakefile
|
154
171
|
- USAGE.ja.md
|
155
172
|
- USAGE.md
|
156
|
-
- app.rb
|
173
|
+
- examples/example.rb
|
174
|
+
- examples/github.yml
|
175
|
+
- examples/sample.json
|
176
|
+
- examples/sample.yml
|
177
|
+
- exe/yasuri
|
157
178
|
- lib/yasuri.rb
|
158
179
|
- lib/yasuri/version.rb
|
159
180
|
- lib/yasuri/yasuri.rb
|
181
|
+
- lib/yasuri/yasuri_cli.rb
|
160
182
|
- lib/yasuri/yasuri_links_node.rb
|
183
|
+
- lib/yasuri/yasuri_map_node.rb
|
161
184
|
- lib/yasuri/yasuri_node.rb
|
162
185
|
- lib/yasuri/yasuri_node_generator.rb
|
163
186
|
- lib/yasuri/yasuri_paginate_node.rb
|
164
187
|
- lib/yasuri/yasuri_struct_node.rb
|
165
188
|
- lib/yasuri/yasuri_text_node.rb
|
189
|
+
- spec/cli_resources/tree.json
|
190
|
+
- spec/cli_resources/tree.yml
|
191
|
+
- spec/cli_resources/tree_wrong.json
|
192
|
+
- spec/cli_resources/tree_wrong.yml
|
166
193
|
- spec/htdocs/child01.html
|
167
194
|
- spec/htdocs/child01_sub.html
|
168
195
|
- spec/htdocs/child02.html
|
@@ -178,7 +205,9 @@ files:
|
|
178
205
|
- spec/htdocs/struct/structual_text.html
|
179
206
|
- spec/servers/httpserver.rb
|
180
207
|
- spec/spec_helper.rb
|
208
|
+
- spec/yasuri_cli_spec.rb
|
181
209
|
- spec/yasuri_links_node_spec.rb
|
210
|
+
- spec/yasuri_map_spec.rb
|
182
211
|
- spec/yasuri_node_spec.rb
|
183
212
|
- spec/yasuri_paginate_node_spec.rb
|
184
213
|
- spec/yasuri_spec.rb
|
@@ -189,7 +218,7 @@ homepage: https://github.com/tac0x2a/yasuri
|
|
189
218
|
licenses:
|
190
219
|
- MIT
|
191
220
|
metadata: {}
|
192
|
-
post_install_message:
|
221
|
+
post_install_message:
|
193
222
|
rdoc_options: []
|
194
223
|
require_paths:
|
195
224
|
- lib
|
@@ -204,12 +233,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
204
233
|
- !ruby/object:Gem::Version
|
205
234
|
version: '0'
|
206
235
|
requirements: []
|
207
|
-
|
208
|
-
|
209
|
-
signing_key:
|
236
|
+
rubygems_version: 3.2.3
|
237
|
+
signing_key:
|
210
238
|
specification_version: 4
|
211
239
|
summary: Yasuri is easy scraping library.
|
212
240
|
test_files:
|
241
|
+
- spec/cli_resources/tree.json
|
242
|
+
- spec/cli_resources/tree.yml
|
243
|
+
- spec/cli_resources/tree_wrong.json
|
244
|
+
- spec/cli_resources/tree_wrong.yml
|
213
245
|
- spec/htdocs/child01.html
|
214
246
|
- spec/htdocs/child01_sub.html
|
215
247
|
- spec/htdocs/child02.html
|
@@ -225,7 +257,9 @@ test_files:
|
|
225
257
|
- spec/htdocs/struct/structual_text.html
|
226
258
|
- spec/servers/httpserver.rb
|
227
259
|
- spec/spec_helper.rb
|
260
|
+
- spec/yasuri_cli_spec.rb
|
228
261
|
- spec/yasuri_links_node_spec.rb
|
262
|
+
- spec/yasuri_map_spec.rb
|
229
263
|
- spec/yasuri_node_spec.rb
|
230
264
|
- spec/yasuri_paginate_node_spec.rb
|
231
265
|
- spec/yasuri_spec.rb
|
data/app.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
# Author:: TAC (tac@tac42.net)
|
5
|
-
|
6
|
-
require 'pp'
|
7
|
-
require 'time'
|
8
|
-
require 'mechanize'
|
9
|
-
|
10
|
-
require_relative 'lib/yasuri/yasuri'
|
11
|
-
|
12
|
-
agent = Mechanize.new
|
13
|
-
|
14
|
-
uri = "http://www.asahi.com/"
|
15
|
-
|
16
|
-
# Node tree constructing by DSL
|
17
|
-
root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
|
18
|
-
text_title '//*[@id="MainInner"]/div[1]/div/h1'
|
19
|
-
text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
|
20
|
-
end
|
21
|
-
|
22
|
-
# Node tree constructing by JSON
|
23
|
-
src = <<-EOJSON
|
24
|
-
{ "node" : "links",
|
25
|
-
"name" : "root",
|
26
|
-
"path" : "//*[@id='MainInner']/div[1]/ul/li/a",
|
27
|
-
"children" : [
|
28
|
-
{ "node" : "text",
|
29
|
-
"name" : "title",
|
30
|
-
"path" : "//*[@id='MainInner']/div[1]/div/h1"
|
31
|
-
},
|
32
|
-
{ "node" : "text",
|
33
|
-
"name" : "article",
|
34
|
-
"path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
|
35
|
-
}
|
36
|
-
]
|
37
|
-
}
|
38
|
-
EOJSON
|
39
|
-
root = Yasuri.json2tree(src)
|
40
|
-
|
41
|
-
# Access to parsed resources
|
42
|
-
page = agent.get(uri)
|
43
|
-
contents = root.inject(agent, page)
|
44
|
-
|
45
|
-
contents.each do |h|
|
46
|
-
t = h['title']
|
47
|
-
a = h['article']
|
48
|
-
|
49
|
-
puts t
|
50
|
-
puts a
|
51
|
-
puts "=" * 100
|
52
|
-
end
|