yasuri 3.0.0 → 3.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +70 -27
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -76
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +14 -9
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +9 -7
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -6
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +82 -58
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +105 -15
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
module Yasuri
|
3
|
+
class MapNode
|
4
|
+
include Node
|
5
|
+
attr_reader :name, :children
|
6
|
+
|
7
|
+
def initialize(name, children, **opt)
|
8
|
+
@name = name
|
9
|
+
@children = children
|
10
|
+
@opt = opt
|
11
|
+
end
|
12
|
+
|
13
|
+
def inject(agent, page, opt = {}, _element = page)
|
14
|
+
child_results_kv = @children.map do |node|
|
15
|
+
[node.name, node.inject(agent, page, opt)]
|
16
|
+
end
|
17
|
+
Hash[child_results_kv]
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h
|
21
|
+
node_hash = {}
|
22
|
+
self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
|
23
|
+
|
24
|
+
children.each do |child|
|
25
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
26
|
+
node_hash[child_node_name] = child.to_h
|
27
|
+
end
|
28
|
+
|
29
|
+
node_hash
|
30
|
+
end
|
31
|
+
|
32
|
+
def opts
|
33
|
+
{}
|
34
|
+
end
|
35
|
+
|
36
|
+
def node_type_str
|
37
|
+
"map".freeze
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -1,21 +1,53 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
module Node
|
8
6
|
attr_reader :url, :xpath, :name, :children
|
9
7
|
|
10
|
-
def initialize(xpath, name, children = [],
|
8
|
+
def initialize(xpath, name, children = [], **_opt)
|
11
9
|
@xpath, @name, @children = xpath, name, children
|
12
10
|
end
|
13
11
|
|
12
|
+
def scrape(uri, opt = {})
|
13
|
+
agent = Mechanize.new
|
14
|
+
scrape_with_agent(uri, agent, opt)
|
15
|
+
end
|
16
|
+
|
17
|
+
def scrape_with_agent(uri, agent, opt = {})
|
18
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
19
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
20
|
+
|
21
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
22
|
+
inject(agent, page, opt)
|
23
|
+
end
|
24
|
+
|
14
25
|
def inject(agent, page, opt = {}, element = page)
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
26
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
16
27
|
end
|
28
|
+
|
29
|
+
def to_h
|
30
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
31
|
+
|
32
|
+
node_hash = {}
|
33
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
34
|
+
|
35
|
+
node_hash[:path] = @xpath if @xpath
|
36
|
+
|
37
|
+
children.each do |child|
|
38
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
39
|
+
node_hash[child_node_name] = child.to_h
|
40
|
+
end
|
41
|
+
|
42
|
+
node_hash
|
43
|
+
end
|
44
|
+
|
17
45
|
def opts
|
18
46
|
{}
|
19
47
|
end
|
48
|
+
|
49
|
+
def node_type_str
|
50
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
51
|
+
end
|
20
52
|
end
|
21
53
|
end
|
@@ -1,11 +1,10 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
require_relative 'yasuri_text_node'
|
6
4
|
require_relative 'yasuri_struct_node'
|
7
5
|
require_relative 'yasuri_links_node'
|
8
6
|
require_relative 'yasuri_paginate_node'
|
7
|
+
require_relative 'yasuri_map_node'
|
9
8
|
|
10
9
|
module Yasuri
|
11
10
|
class NodeGenerator
|
@@ -15,27 +14,33 @@ module Yasuri
|
|
15
14
|
@nodes
|
16
15
|
end
|
17
16
|
|
18
|
-
def method_missing(name, pattern, **args, &block)
|
17
|
+
def method_missing(name, pattern=nil, **args, &block)
|
19
18
|
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
19
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
20
|
@nodes << node
|
22
21
|
end
|
23
22
|
|
24
|
-
def self.gen(
|
25
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if
|
23
|
+
def self.gen(method_name, xpath, **opt, &block)
|
24
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
|
26
25
|
|
27
|
-
case
|
26
|
+
case method_name
|
28
27
|
when /^text_(.+)$/
|
29
|
-
|
28
|
+
# Todo raise error xpath is not valid
|
29
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
30
30
|
when /^struct_(.+)$/
|
31
|
+
# Todo raise error xpath is not valid
|
31
32
|
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
32
33
|
when /^links_(.+)$/
|
33
|
-
|
34
|
+
# Todo raise error xpath is not valid
|
35
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
34
36
|
when /^pages_(.+)$/
|
37
|
+
# Todo raise error xpath is not valid
|
35
38
|
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
39
|
+
when /^map_(.+)$/
|
40
|
+
Yasuri::MapNode.new($1, children, **opt)
|
36
41
|
else
|
37
42
|
nil
|
38
43
|
end
|
39
|
-
end # of self.gen(
|
44
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
40
45
|
end # of class NodeGenerator
|
41
46
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -14,35 +12,47 @@ module Yasuri
|
|
14
12
|
end
|
15
13
|
|
16
14
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] || 5
|
18
|
-
|
19
15
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
16
|
|
21
|
-
child_results = []
|
22
17
|
limit = @limit.nil? ? Float::MAX : @limit
|
18
|
+
child_results = inject_child(agent, page, limit, opt)
|
19
|
+
|
20
|
+
return child_results.map(&:values).flatten if @flatten == true
|
21
|
+
|
22
|
+
child_results
|
23
|
+
end
|
24
|
+
|
25
|
+
def opts
|
26
|
+
{ limit: @limit, flatten: @flatten }
|
27
|
+
end
|
28
|
+
|
29
|
+
def node_type_str
|
30
|
+
"pages".freeze
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def inject_child(agent, page, limit, opt)
|
36
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
37
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
38
|
+
|
39
|
+
child_results = []
|
23
40
|
while page
|
24
41
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
42
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
43
|
[child_name, child_node.inject(agent, page, opt)]
|
27
44
|
end
|
28
45
|
child_results << Hash[child_results_kv]
|
29
46
|
|
30
|
-
link = page.search(@xpath).first
|
31
|
-
break if link
|
47
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
48
|
+
break if link.nil?
|
32
49
|
|
33
50
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
34
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
51
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
35
52
|
break if (limit -= 1) <= 0
|
36
53
|
end
|
37
54
|
|
38
|
-
if @flatten == true
|
39
|
-
return child_results.map{|h| h.values}.flatten
|
40
|
-
end
|
41
|
-
|
42
55
|
child_results
|
43
56
|
end
|
44
|
-
def opts
|
45
|
-
{limit:@limit, flatten:@flatten}
|
46
|
-
end
|
47
57
|
end
|
48
58
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -10,12 +8,16 @@ module Yasuri
|
|
10
8
|
sub_tags = element.search(@xpath)
|
11
9
|
tree = sub_tags.map do |sub_tag|
|
12
10
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
11
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
12
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
13
|
end
|
16
14
|
Hash[child_results_kv]
|
17
15
|
end
|
18
16
|
tree.size == 1 ? tree.first : tree
|
19
|
-
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def node_type_str
|
20
|
+
"struct".freeze
|
21
|
+
end
|
20
22
|
end
|
21
23
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -13,15 +11,14 @@ module Yasuri
|
|
13
11
|
truncate = opt[:truncate]
|
14
12
|
proc = opt[:proc]
|
15
13
|
|
16
|
-
truncate = Regexp.new(truncate)
|
14
|
+
truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
|
17
15
|
@truncate = truncate
|
18
|
-
@truncate = Regexp.new(@truncate.to_s)
|
16
|
+
@truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
|
19
17
|
|
20
18
|
@proc = proc.nil? ? nil : proc.to_sym
|
21
|
-
|
22
19
|
end
|
23
20
|
|
24
|
-
def inject(
|
21
|
+
def inject(_agent, page, _opt = {}, element = page)
|
25
22
|
node = element.search(@xpath)
|
26
23
|
text = node.text.to_s
|
27
24
|
|
@@ -31,11 +28,16 @@ module Yasuri
|
|
31
28
|
end
|
32
29
|
|
33
30
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
31
|
+
|
34
32
|
text
|
35
33
|
end
|
36
34
|
|
37
35
|
def opts
|
38
|
-
{truncate
|
36
|
+
{ truncate: @truncate, proc: @proc }
|
37
|
+
end
|
38
|
+
|
39
|
+
def node_type_str
|
40
|
+
"text".freeze
|
39
41
|
end
|
40
42
|
end
|
41
43
|
end
|
data/spec/servers/httpserver.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
1
|
|
4
2
|
require 'glint'
|
5
3
|
Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
|
@@ -16,7 +14,7 @@ require 'simplecov'
|
|
16
14
|
require 'coveralls'
|
17
15
|
Coveralls.wear!
|
18
16
|
|
19
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
17
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
20
18
|
SimpleCov::Formatter::HTMLFormatter,
|
21
19
|
Coveralls::SimpleCov::Formatter
|
22
20
|
]
|
@@ -26,8 +24,8 @@ SimpleCov.start
|
|
26
24
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
27
25
|
require 'yasuri'
|
28
26
|
|
29
|
-
def compare_generated_vs_original(generated, original,
|
30
|
-
expected = original.
|
31
|
-
actual = generated.
|
27
|
+
def compare_generated_vs_original(generated, original, uri)
|
28
|
+
expected = original.scrape(uri)
|
29
|
+
actual = generated.scrape(uri)
|
32
30
|
expect(actual).to match expected
|
33
31
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
let(:res_dir) { File.expand_path('cli_resources', __dir__) }
|
6
|
+
|
7
|
+
describe 'cli scrape' do
|
8
|
+
it 'require --file or --json option' do
|
9
|
+
expect do
|
10
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
11
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'only one of --file or --json option' do
|
15
|
+
expect do
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
|
17
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'require --file option is not empty string' do
|
21
|
+
expect do
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
|
23
|
+
end.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'require --json option is not empty string' do
|
27
|
+
expect do
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
|
29
|
+
end.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'display text node as simple string' do
|
33
|
+
expect do
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
|
35
|
+
end.to output("Yasuri Test\n").to_stdout
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'display texts in single json' do
|
39
|
+
expect do
|
40
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
|
41
|
+
end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'display text node as simple string via json file' do
|
45
|
+
expect do
|
46
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
|
47
|
+
end.to output(
|
48
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
49
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
50
|
+
"\n"
|
51
|
+
).to_stdout
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'display text node as simple string via yaml file' do
|
55
|
+
expect do
|
56
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
|
57
|
+
end.to output(
|
58
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
59
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
60
|
+
"\n"
|
61
|
+
).to_stdout
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'interval option is effect for each request' do
|
65
|
+
allow(Kernel).to receive(:sleep)
|
66
|
+
|
67
|
+
expect do
|
68
|
+
Yasuri::CLI.new.invoke(
|
69
|
+
:scrape,
|
70
|
+
["#{uri}/pagination/page01.html"],
|
71
|
+
{ file: "#{res_dir}/tree.yml", interval: 500 }
|
72
|
+
)
|
73
|
+
end.to output(
|
74
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
75
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
76
|
+
"\n"
|
77
|
+
).to_stdout
|
78
|
+
|
79
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
80
|
+
expect(interval_sec).to match 0.5
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'display ERROR when json string is wrong' do
|
85
|
+
wrong_json = '{,,}'
|
86
|
+
expect do
|
87
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
|
88
|
+
end.to output(
|
89
|
+
'ERROR: Failed to convert json to yasuri tree. ' \
|
90
|
+
"809: unexpected token at '#{wrong_json}'\n"
|
91
|
+
).to_stderr
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'display ERROR when json file contains is wrong' do
|
95
|
+
file_path = "#{res_dir}/tree_wrong.json"
|
96
|
+
expect do
|
97
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
98
|
+
end.to output(
|
99
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
100
|
+
"(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
|
101
|
+
).to_stderr
|
102
|
+
end
|
103
|
+
|
104
|
+
it 'display ERROR when yaml file contains is wrong' do
|
105
|
+
file_path = "#{res_dir}/tree_wrong.yml"
|
106
|
+
expect do
|
107
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
108
|
+
end.to output(
|
109
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
110
|
+
"(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
|
111
|
+
).to_stderr
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|