yasuri 2.0.12 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
@@ -11,8 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
|
-
@
|
15
|
-
@page = @agent.get(uri + "/struct/structual_text.html")
|
14
|
+
@uri = uri + "/struct/structual_text.html"
|
16
15
|
|
17
16
|
@table_1996 = [
|
18
17
|
{ "title" => "The Perfect Insider",
|
@@ -53,7 +52,7 @@ describe 'Yasuri' do
|
|
53
52
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
53
|
])
|
55
54
|
expected = @table_1996
|
56
|
-
actual = node.
|
55
|
+
actual = node.scrape(@uri)
|
57
56
|
expect(actual).to match expected
|
58
57
|
end
|
59
58
|
|
@@ -63,7 +62,7 @@ describe 'Yasuri' do
|
|
63
62
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
64
63
|
])
|
65
64
|
expected = @table_1996.first
|
66
|
-
actual = node.
|
65
|
+
actual = node.scrape(@uri)
|
67
66
|
expect(actual).to match expected
|
68
67
|
end
|
69
68
|
|
@@ -72,7 +71,7 @@ describe 'Yasuri' do
|
|
72
71
|
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
73
72
|
Yasuri::TextNode.new('./td[1]', "title")
|
74
73
|
])
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to be_empty
|
77
76
|
end
|
78
77
|
|
@@ -81,7 +80,7 @@ describe 'Yasuri' do
|
|
81
80
|
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
82
81
|
Yasuri::TextNode.new('./td[1]', "title")
|
83
82
|
])
|
84
|
-
expect { node.
|
83
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
85
84
|
end
|
86
85
|
|
87
86
|
it 'fail with invalid xpath in children' do
|
@@ -90,7 +89,7 @@ describe 'Yasuri' do
|
|
90
89
|
Yasuri::TextNode.new(invalid_xpath, "title"),
|
91
90
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
92
91
|
])
|
93
|
-
expect { node.
|
92
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
94
93
|
end
|
95
94
|
|
96
95
|
it 'scrape all tables' do
|
@@ -101,7 +100,7 @@ describe 'Yasuri' do
|
|
101
100
|
])
|
102
101
|
])
|
103
102
|
expected = @all_tables
|
104
|
-
actual = node.
|
103
|
+
actual = node.scrape(@uri)
|
105
104
|
expect(actual).to match expected
|
106
105
|
end
|
107
106
|
|
@@ -118,7 +117,7 @@ describe 'Yasuri' do
|
|
118
117
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
119
118
|
])
|
120
119
|
])
|
121
|
-
compare_generated_vs_original(generated, original, @
|
120
|
+
compare_generated_vs_original(generated, original, @uri)
|
122
121
|
end
|
123
122
|
|
124
123
|
it 'return child node as symbol' do
|
@@ -127,7 +126,7 @@ describe 'Yasuri' do
|
|
127
126
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
127
|
])
|
129
128
|
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
|
-
actual = node.
|
129
|
+
actual = node.scrape(@uri, symbolize_names:true)
|
131
130
|
expect(actual).to match expected
|
132
131
|
end
|
133
132
|
|
@@ -135,9 +134,7 @@ describe 'Yasuri' do
|
|
135
134
|
|
136
135
|
describe '::StructNode::Links' do
|
137
136
|
before do
|
138
|
-
@
|
139
|
-
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
-
|
137
|
+
@uri = uri + "/struct/structual_links.html"
|
141
138
|
@table = [
|
142
139
|
{ "title" => "Child01,02",
|
143
140
|
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
@@ -155,22 +152,21 @@ describe 'Yasuri' do
|
|
155
152
|
])
|
156
153
|
])
|
157
154
|
expected = @table
|
158
|
-
actual = node.
|
155
|
+
actual = node.scrape(@uri)
|
159
156
|
expect(actual).to match expected
|
160
157
|
end
|
161
158
|
end # descrive
|
162
159
|
|
163
160
|
describe '::StructNode::Pages' do
|
164
161
|
before do
|
165
|
-
@
|
166
|
-
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
162
|
+
@uri = uri + "/struct/structual_text.html"
|
167
163
|
end
|
168
164
|
|
169
165
|
it 'not supported' do
|
170
166
|
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
167
|
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
168
|
])
|
173
|
-
expect{ node.
|
169
|
+
expect{ node.scrape(@uri) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
170
|
end
|
175
171
|
end
|
176
172
|
end
|
@@ -10,69 +10,68 @@ describe 'Yasuri' do
|
|
10
10
|
include_context 'httpserver'
|
11
11
|
|
12
12
|
before do
|
13
|
-
@
|
14
|
-
@index_page = @agent.get(uri)
|
13
|
+
@uri = uri
|
15
14
|
end
|
16
15
|
|
17
16
|
describe '::TextNode' do
|
18
17
|
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
18
|
|
20
19
|
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
-
actual = @node.
|
20
|
+
actual = @node.scrape(@uri)
|
22
21
|
expect(actual).to eq "Hello,Yasuri"
|
23
22
|
end
|
24
23
|
|
25
24
|
it 'return empty text if no match node' do
|
26
25
|
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
-
actual = no_match_node.
|
26
|
+
actual = no_match_node.scrape(@uri)
|
28
27
|
expect(actual).to be_empty
|
29
28
|
end
|
30
29
|
|
31
30
|
it 'fail with invalid xpath' do
|
32
31
|
invalid_xpath = '/html/body/no_match_node['
|
33
32
|
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
-
expect { node.
|
33
|
+
expect { node.scrape(@uri) }.to raise_error(Nokogiri::XML::XPath::SyntaxError)
|
35
34
|
end
|
36
35
|
|
37
36
|
it "can be defined by DSL, return single TextNode title" do
|
38
37
|
generated = Yasuri.text_title '/html/body/p[1]'
|
39
38
|
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
-
compare_generated_vs_original(generated, original, @
|
39
|
+
compare_generated_vs_original(generated, original, @uri)
|
41
40
|
end
|
42
41
|
|
43
42
|
it "can be truncated with regexp" do
|
44
43
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
45
|
-
actual = node.
|
44
|
+
actual = node.scrape(@uri)
|
46
45
|
expect(actual).to eq "Hello"
|
47
46
|
end
|
48
47
|
|
49
48
|
it "return first captured if matched given capture pattern" do
|
50
49
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/H(.+)i/
|
51
|
-
actual = node.
|
50
|
+
actual = node.scrape(@uri)
|
52
51
|
expect(actual).to eq "ello,Yasur"
|
53
52
|
end
|
54
53
|
|
55
54
|
it "can be truncated with regexp" do
|
56
55
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/[^,]+$/
|
57
|
-
actual = node.
|
56
|
+
actual = node.scrape(@uri)
|
58
57
|
expect(actual).to eq "Yasuri"
|
59
58
|
end
|
60
59
|
|
61
60
|
it "return empty string if truncated with no match to regexp" do
|
62
61
|
node = Yasuri.text_title '/html/body/p[1]', truncate:/^hoge/
|
63
|
-
actual = node.
|
62
|
+
actual = node.scrape(@uri)
|
64
63
|
expect(actual).to be_empty
|
65
64
|
end
|
66
65
|
|
67
66
|
it "return symbol method applied string" do
|
68
67
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
69
|
-
actual = node.
|
68
|
+
actual = node.scrape(@uri)
|
70
69
|
expect(actual).to eq "HELLO,YASURI"
|
71
70
|
end
|
72
71
|
|
73
72
|
it "return apply multi arguments" do
|
74
73
|
node = Yasuri.text_title '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
|
75
|
-
actual = node.
|
74
|
+
actual = node.scrape(@uri)
|
76
75
|
expect(actual).to eq "ELLO,YASUR"
|
77
76
|
end
|
78
77
|
end
|
data/yasuri.gemspec
CHANGED
@@ -14,12 +14,13 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
17
|
-
spec.executables = spec.files.grep(%r{^
|
17
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
18
|
+
spec.bindir = "exe"
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
20
|
spec.require_paths = ["lib"]
|
20
21
|
|
21
|
-
spec.add_development_dependency "bundler"
|
22
|
-
spec.add_development_dependency "rake"
|
22
|
+
spec.add_development_dependency "bundler"
|
23
|
+
spec.add_development_dependency "rake"
|
23
24
|
spec.add_development_dependency "rspec"
|
24
25
|
spec.add_development_dependency "fuubar"
|
25
26
|
spec.add_development_dependency "glint"
|
@@ -28,4 +29,5 @@ Gem::Specification.new do |spec|
|
|
28
29
|
spec.add_development_dependency "codeclimate-test-reporter"
|
29
30
|
|
30
31
|
spec.add_dependency "mechanize"
|
32
|
+
spec.add_dependency "thor"
|
31
33
|
end
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
|
-
autorequire:
|
9
|
-
bindir:
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,16 +136,33 @@ dependencies:
|
|
136
136
|
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: thor
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
139
153
|
description: Yasuri is an easy web-scraping library for supporting "Mechanize".
|
140
154
|
email:
|
141
155
|
- tac@tac42.net
|
142
|
-
executables:
|
156
|
+
executables:
|
157
|
+
- yasuri
|
143
158
|
extensions: []
|
144
159
|
extra_rdoc_files: []
|
145
160
|
files:
|
146
161
|
- ".coveralls.yml"
|
162
|
+
- ".github/workflows/ruby.yml"
|
147
163
|
- ".gitignore"
|
148
164
|
- ".rspec"
|
165
|
+
- ".ruby-version"
|
149
166
|
- ".travis.yml"
|
150
167
|
- Gemfile
|
151
168
|
- LICENSE
|
@@ -153,16 +170,26 @@ files:
|
|
153
170
|
- Rakefile
|
154
171
|
- USAGE.ja.md
|
155
172
|
- USAGE.md
|
156
|
-
-
|
173
|
+
- examples/example.rb
|
174
|
+
- examples/github.yml
|
175
|
+
- examples/sample.json
|
176
|
+
- examples/sample.yml
|
177
|
+
- exe/yasuri
|
157
178
|
- lib/yasuri.rb
|
158
179
|
- lib/yasuri/version.rb
|
159
180
|
- lib/yasuri/yasuri.rb
|
181
|
+
- lib/yasuri/yasuri_cli.rb
|
160
182
|
- lib/yasuri/yasuri_links_node.rb
|
183
|
+
- lib/yasuri/yasuri_map_node.rb
|
161
184
|
- lib/yasuri/yasuri_node.rb
|
162
185
|
- lib/yasuri/yasuri_node_generator.rb
|
163
186
|
- lib/yasuri/yasuri_paginate_node.rb
|
164
187
|
- lib/yasuri/yasuri_struct_node.rb
|
165
188
|
- lib/yasuri/yasuri_text_node.rb
|
189
|
+
- spec/cli_resources/tree.json
|
190
|
+
- spec/cli_resources/tree.yml
|
191
|
+
- spec/cli_resources/tree_wrong.json
|
192
|
+
- spec/cli_resources/tree_wrong.yml
|
166
193
|
- spec/htdocs/child01.html
|
167
194
|
- spec/htdocs/child01_sub.html
|
168
195
|
- spec/htdocs/child02.html
|
@@ -178,7 +205,9 @@ files:
|
|
178
205
|
- spec/htdocs/struct/structual_text.html
|
179
206
|
- spec/servers/httpserver.rb
|
180
207
|
- spec/spec_helper.rb
|
208
|
+
- spec/yasuri_cli_spec.rb
|
181
209
|
- spec/yasuri_links_node_spec.rb
|
210
|
+
- spec/yasuri_map_spec.rb
|
182
211
|
- spec/yasuri_node_spec.rb
|
183
212
|
- spec/yasuri_paginate_node_spec.rb
|
184
213
|
- spec/yasuri_spec.rb
|
@@ -189,7 +218,7 @@ homepage: https://github.com/tac0x2a/yasuri
|
|
189
218
|
licenses:
|
190
219
|
- MIT
|
191
220
|
metadata: {}
|
192
|
-
post_install_message:
|
221
|
+
post_install_message:
|
193
222
|
rdoc_options: []
|
194
223
|
require_paths:
|
195
224
|
- lib
|
@@ -204,12 +233,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
204
233
|
- !ruby/object:Gem::Version
|
205
234
|
version: '0'
|
206
235
|
requirements: []
|
207
|
-
|
208
|
-
|
209
|
-
signing_key:
|
236
|
+
rubygems_version: 3.2.3
|
237
|
+
signing_key:
|
210
238
|
specification_version: 4
|
211
239
|
summary: Yasuri is easy scraping library.
|
212
240
|
test_files:
|
241
|
+
- spec/cli_resources/tree.json
|
242
|
+
- spec/cli_resources/tree.yml
|
243
|
+
- spec/cli_resources/tree_wrong.json
|
244
|
+
- spec/cli_resources/tree_wrong.yml
|
213
245
|
- spec/htdocs/child01.html
|
214
246
|
- spec/htdocs/child01_sub.html
|
215
247
|
- spec/htdocs/child02.html
|
@@ -225,7 +257,9 @@ test_files:
|
|
225
257
|
- spec/htdocs/struct/structual_text.html
|
226
258
|
- spec/servers/httpserver.rb
|
227
259
|
- spec/spec_helper.rb
|
260
|
+
- spec/yasuri_cli_spec.rb
|
228
261
|
- spec/yasuri_links_node_spec.rb
|
262
|
+
- spec/yasuri_map_spec.rb
|
229
263
|
- spec/yasuri_node_spec.rb
|
230
264
|
- spec/yasuri_paginate_node_spec.rb
|
231
265
|
- spec/yasuri_spec.rb
|
data/app.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
# Author:: TAC (tac@tac42.net)
|
5
|
-
|
6
|
-
require 'pp'
|
7
|
-
require 'time'
|
8
|
-
require 'mechanize'
|
9
|
-
|
10
|
-
require_relative 'lib/yasuri/yasuri'
|
11
|
-
|
12
|
-
agent = Mechanize.new
|
13
|
-
|
14
|
-
uri = "http://www.asahi.com/"
|
15
|
-
|
16
|
-
# Node tree constructing by DSL
|
17
|
-
root = Yasuri.links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
|
18
|
-
text_title '//*[@id="MainInner"]/div[1]/div/h1'
|
19
|
-
text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
|
20
|
-
end
|
21
|
-
|
22
|
-
# Node tree constructing by JSON
|
23
|
-
src = <<-EOJSON
|
24
|
-
{ "node" : "links",
|
25
|
-
"name" : "root",
|
26
|
-
"path" : "//*[@id='MainInner']/div[1]/ul/li/a",
|
27
|
-
"children" : [
|
28
|
-
{ "node" : "text",
|
29
|
-
"name" : "title",
|
30
|
-
"path" : "//*[@id='MainInner']/div[1]/div/h1"
|
31
|
-
},
|
32
|
-
{ "node" : "text",
|
33
|
-
"name" : "article",
|
34
|
-
"path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
|
35
|
-
}
|
36
|
-
]
|
37
|
-
}
|
38
|
-
EOJSON
|
39
|
-
root = Yasuri.json2tree(src)
|
40
|
-
|
41
|
-
# Access to parsed resources
|
42
|
-
page = agent.get(uri)
|
43
|
-
contents = root.inject(agent, page)
|
44
|
-
|
45
|
-
contents.each do |h|
|
46
|
-
t = h['title']
|
47
|
-
a = h['article']
|
48
|
-
|
49
|
-
puts t
|
50
|
-
puts a
|
51
|
-
puts "=" * 100
|
52
|
-
end
|