yasuri 2.0.12 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,15 +7,50 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
+
def scrape(uri, opt = {})
|
15
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
16
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
17
|
+
|
18
|
+
agent = Mechanize.new
|
19
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
20
|
+
scrape_with_agent(uri, agent, opt)
|
21
|
+
end
|
22
|
+
|
23
|
+
def scrape_with_agent(uri, agent, opt = {})
|
24
|
+
page = agent.get(uri)
|
25
|
+
inject(agent, page, opt)
|
26
|
+
end
|
27
|
+
|
14
28
|
def inject(agent, page, opt = {}, element = page)
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
29
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
30
|
+
end
|
31
|
+
|
32
|
+
def to_h
|
33
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
34
|
+
|
35
|
+
node_hash = {}
|
36
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
37
|
+
|
38
|
+
node_hash[:path] = @xpath if @xpath
|
39
|
+
|
40
|
+
children.each do |child|
|
41
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
42
|
+
node_hash[child_node_name] = child.to_h
|
43
|
+
end
|
44
|
+
|
45
|
+
node_hash
|
16
46
|
end
|
47
|
+
|
17
48
|
def opts
|
18
49
|
{}
|
19
50
|
end
|
51
|
+
|
52
|
+
def node_type_str
|
53
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
54
|
+
end
|
20
55
|
end
|
21
56
|
end
|
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
|
|
6
6
|
require_relative 'yasuri_struct_node'
|
7
7
|
require_relative 'yasuri_links_node'
|
8
8
|
require_relative 'yasuri_paginate_node'
|
9
|
+
require_relative 'yasuri_map_node'
|
9
10
|
|
10
11
|
module Yasuri
|
11
12
|
class NodeGenerator
|
@@ -15,29 +16,33 @@ module Yasuri
|
|
15
16
|
@nodes
|
16
17
|
end
|
17
18
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
19
|
+
def method_missing(name, pattern=nil, **args, &block)
|
20
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
21
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
22
|
@nodes << node
|
22
23
|
end
|
23
24
|
|
24
|
-
def self.gen(
|
25
|
-
xpath, opt = *args
|
26
|
-
opt = [opt].flatten.compact
|
25
|
+
def self.gen(method_name, xpath, **opt, &block)
|
27
26
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
27
|
|
29
|
-
case
|
28
|
+
case method_name
|
30
29
|
when /^text_(.+)$/
|
31
|
-
|
30
|
+
# Todo raise error xpath is not valid
|
31
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
32
32
|
when /^struct_(.+)$/
|
33
|
-
|
33
|
+
# Todo raise error xpath is not valid
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
34
35
|
when /^links_(.+)$/
|
35
|
-
|
36
|
+
# Todo raise error xpath is not valid
|
37
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
36
38
|
when /^pages_(.+)$/
|
37
|
-
|
39
|
+
# Todo raise error xpath is not valid
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
41
|
+
when /^map_(.+)$/
|
42
|
+
Yasuri::MapNode.new($1, children, **opt)
|
38
43
|
else
|
39
44
|
nil
|
40
45
|
end
|
41
|
-
end # of self.gen(
|
46
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
42
47
|
end # of class NodeGenerator
|
43
48
|
end
|
@@ -14,7 +14,8 @@ module Yasuri
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] ||
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
18
19
|
|
19
20
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
21
|
|
@@ -22,16 +23,16 @@ module Yasuri
|
|
22
23
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
24
|
while page
|
24
25
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
26
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
27
|
[child_name, child_node.inject(agent, page, opt)]
|
27
28
|
end
|
28
29
|
child_results << Hash[child_results_kv]
|
29
30
|
|
30
|
-
link = page.search(@xpath).first
|
31
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
31
32
|
break if link == nil
|
32
33
|
|
33
34
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
34
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
35
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
35
36
|
break if (limit -= 1) <= 0
|
36
37
|
end
|
37
38
|
|
@@ -41,8 +42,13 @@ module Yasuri
|
|
41
42
|
|
42
43
|
child_results
|
43
44
|
end
|
45
|
+
|
44
46
|
def opts
|
45
47
|
{limit:@limit, flatten:@flatten}
|
46
48
|
end
|
49
|
+
|
50
|
+
def node_type_str
|
51
|
+
"pages".freeze
|
52
|
+
end
|
47
53
|
end
|
48
54
|
end
|
@@ -10,12 +10,16 @@ module Yasuri
|
|
10
10
|
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
14
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -7,15 +7,17 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
+
truncate = opt[:truncate]
|
14
|
+
proc = opt[:proc]
|
15
|
+
|
13
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
14
17
|
@truncate = truncate
|
15
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
16
19
|
|
17
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
18
|
-
|
19
21
|
end
|
20
22
|
|
21
23
|
def inject(agent, page, opt = {}, element = page)
|
@@ -28,11 +30,16 @@ module Yasuri
|
|
28
30
|
end
|
29
31
|
|
30
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
33
|
+
|
31
34
|
text
|
32
35
|
end
|
33
36
|
|
34
37
|
def opts
|
35
38
|
{truncate:@truncate, proc:@proc}
|
36
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
37
44
|
end
|
38
45
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
|
|
12
12
|
}
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
15
|
require 'simplecov'
|
21
16
|
require 'coveralls'
|
22
17
|
Coveralls.wear!
|
23
18
|
|
24
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
25
20
|
SimpleCov::Formatter::HTMLFormatter,
|
26
21
|
Coveralls::SimpleCov::Formatter
|
27
22
|
]
|
@@ -31,8 +26,8 @@ SimpleCov.start
|
|
31
26
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
32
27
|
require 'yasuri'
|
33
28
|
|
34
|
-
def compare_generated_vs_original(generated, original,
|
35
|
-
expected = original.
|
36
|
-
actual = generated.
|
29
|
+
def compare_generated_vs_original(generated, original, uri)
|
30
|
+
expected = original.scrape(uri)
|
31
|
+
actual = generated.scrape(uri)
|
37
32
|
expect(actual).to match expected
|
38
33
|
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
|
10
|
+
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'cli scrape' do
|
14
|
+
it "require --file or --json option" do
|
15
|
+
expect {
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it "only one of --file or --json option" do
|
21
|
+
expect {
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
|
23
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it "require --file option is not empty string" do
|
27
|
+
expect {
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
|
29
|
+
}.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it "require --json option is not empty string" do
|
33
|
+
expect {
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
|
35
|
+
}.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
it "display text node as simple string" do
|
40
|
+
expect {
|
41
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
|
42
|
+
}.to output("Yasuri Test\n").to_stdout
|
43
|
+
end
|
44
|
+
|
45
|
+
it "display texts in single json" do
|
46
|
+
expect {
|
47
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
+
}.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
it "display text node as simple string via json file" do
|
53
|
+
expect {
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
56
|
+
end
|
57
|
+
it "display text node as simple string via yaml file" do
|
58
|
+
expect {
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
61
|
+
end
|
62
|
+
|
63
|
+
it "interval option is effect for each request" do
|
64
|
+
allow(Kernel).to receive(:sleep)
|
65
|
+
|
66
|
+
Yasuri::CLI.new.invoke(
|
67
|
+
:scrape,
|
68
|
+
[uri+"/pagination/page01.html"],
|
69
|
+
{file: "#{@res_dir}/tree.yml", interval: 500}
|
70
|
+
)
|
71
|
+
|
72
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
73
|
+
expect(interval_sec).to match 0.5
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
it "display ERROR when json string is wrong" do
|
78
|
+
wrong_json = '{,,}'
|
79
|
+
expect {
|
80
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
81
|
+
}.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
|
82
|
+
end
|
83
|
+
it "display ERROR when json file contains is wrong" do
|
84
|
+
file_path = "#{@res_dir}/tree_wrong.json"
|
85
|
+
expect {
|
86
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
87
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
|
88
|
+
end
|
89
|
+
it "display ERROR when yaml file contains is wrong" do
|
90
|
+
file_path = "#{@res_dir}/tree_wrong.yml"
|
91
|
+
expect {
|
92
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
93
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -11,9 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::LinksNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
14
|
@uri = uri
|
16
|
-
@index_page = @agent.get(@uri)
|
17
15
|
end
|
18
16
|
|
19
17
|
it 'scrape links' do
|
@@ -21,7 +19,7 @@ describe 'Yasuri' do
|
|
21
19
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
20
|
])
|
23
21
|
|
24
|
-
actual = root_node.
|
22
|
+
actual = root_node.scrape(@uri)
|
25
23
|
expected = [
|
26
24
|
{"content" => "Child 01 page."},
|
27
25
|
{"content" => "Child 02 page."},
|
@@ -36,7 +34,7 @@ describe 'Yasuri' do
|
|
36
34
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
37
35
|
])
|
38
36
|
|
39
|
-
actual = root_node.
|
37
|
+
actual = root_node.scrape(@uri)
|
40
38
|
expect(actual).to be_empty
|
41
39
|
end
|
42
40
|
|
@@ -47,7 +45,7 @@ describe 'Yasuri' do
|
|
47
45
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
48
46
|
]),
|
49
47
|
])
|
50
|
-
actual = root_node.
|
48
|
+
actual = root_node.scrape(@uri)
|
51
49
|
expected = [
|
52
50
|
{"content" => "Child 01 page.",
|
53
51
|
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
@@ -59,10 +57,18 @@ describe 'Yasuri' do
|
|
59
57
|
]
|
60
58
|
expect(actual).to match expected
|
61
59
|
end
|
62
|
-
it 'can be defined by DSL, return
|
63
|
-
|
64
|
-
|
65
|
-
|
60
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
61
|
+
root_node = Yasuri.links_title '/html/body/a'
|
62
|
+
actual = root_node.scrape(@uri)
|
63
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
64
|
+
expect(actual).to match expected
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'can be defined return no contains if no child node' do
|
68
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
69
|
+
actual = root_node.scrape(@uri)
|
70
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
71
|
+
expect(actual).to match expected
|
66
72
|
end
|
67
73
|
it 'can be defined by DSL, return nested contents under link' do
|
68
74
|
generated = Yasuri.links_title '/html/body/a' do
|
@@ -71,7 +77,7 @@ describe 'Yasuri' do
|
|
71
77
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
72
78
|
Yasuri::TextNode.new('/html/body/p', "name"),
|
73
79
|
])
|
74
|
-
compare_generated_vs_original(generated, original, @
|
80
|
+
compare_generated_vs_original(generated, original, @uri)
|
75
81
|
end
|
76
82
|
|
77
83
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -88,7 +94,7 @@ describe 'Yasuri' do
|
|
88
94
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
89
95
|
]),
|
90
96
|
])
|
91
|
-
compare_generated_vs_original(generated, original, @
|
97
|
+
compare_generated_vs_original(generated, original, @uri)
|
92
98
|
end
|
93
99
|
|
94
100
|
it 'return child node as symbol' do
|
@@ -96,7 +102,7 @@ describe 'Yasuri' do
|
|
96
102
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
97
103
|
])
|
98
104
|
|
99
|
-
actual = root_node.
|
105
|
+
actual = root_node.scrape(@uri, symbolize_names: true )
|
100
106
|
expected = [
|
101
107
|
{:content => "Child 01 page."},
|
102
108
|
{:content => "Child 02 page."},
|
@@ -104,5 +110,21 @@ describe 'Yasuri' do
|
|
104
110
|
]
|
105
111
|
expect(actual).to match expected
|
106
112
|
end
|
113
|
+
|
114
|
+
it 'scrape with interval for each request' do
|
115
|
+
allow(Kernel).to receive(:sleep)
|
116
|
+
|
117
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
118
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
119
|
+
])
|
120
|
+
actual = root_node.scrape(@uri, interval_ms: 100)
|
121
|
+
|
122
|
+
expect(actual.size).to match 3
|
123
|
+
|
124
|
+
# request will be run 4(1+3) times because root page will be requested
|
125
|
+
expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
|
126
|
+
expect(interval_sec).to match 0.1
|
127
|
+
end
|
128
|
+
end
|
107
129
|
end
|
108
130
|
end
|