yasuri 2.0.13 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/.ruby-version +1 -1
- data/README.md +82 -31
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -75
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +17 -14
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +13 -8
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -11
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +92 -60
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +108 -19
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
module Yasuri
|
3
|
+
class MapNode
|
4
|
+
include Node
|
5
|
+
attr_reader :name, :children
|
6
|
+
|
7
|
+
def initialize(name, children, **opt)
|
8
|
+
@name = name
|
9
|
+
@children = children
|
10
|
+
@opt = opt
|
11
|
+
end
|
12
|
+
|
13
|
+
def inject(agent, page, opt = {}, _element = page)
|
14
|
+
child_results_kv = @children.map do |node|
|
15
|
+
[node.name, node.inject(agent, page, opt)]
|
16
|
+
end
|
17
|
+
Hash[child_results_kv]
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h
|
21
|
+
node_hash = {}
|
22
|
+
self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
|
23
|
+
|
24
|
+
children.each do |child|
|
25
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
26
|
+
node_hash[child_node_name] = child.to_h
|
27
|
+
end
|
28
|
+
|
29
|
+
node_hash
|
30
|
+
end
|
31
|
+
|
32
|
+
def opts
|
33
|
+
{}
|
34
|
+
end
|
35
|
+
|
36
|
+
def node_type_str
|
37
|
+
"map".freeze
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -1,21 +1,53 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
module Node
|
8
6
|
attr_reader :url, :xpath, :name, :children
|
9
7
|
|
10
|
-
def initialize(xpath, name, children = [],
|
8
|
+
def initialize(xpath, name, children = [], **_opt)
|
11
9
|
@xpath, @name, @children = xpath, name, children
|
12
10
|
end
|
13
11
|
|
12
|
+
def scrape(uri, opt = {})
|
13
|
+
agent = Mechanize.new
|
14
|
+
scrape_with_agent(uri, agent, opt)
|
15
|
+
end
|
16
|
+
|
17
|
+
def scrape_with_agent(uri, agent, opt = {})
|
18
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
19
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
20
|
+
|
21
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
22
|
+
inject(agent, page, opt)
|
23
|
+
end
|
24
|
+
|
14
25
|
def inject(agent, page, opt = {}, element = page)
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
26
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
16
27
|
end
|
28
|
+
|
29
|
+
def to_h
|
30
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
31
|
+
|
32
|
+
node_hash = {}
|
33
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
34
|
+
|
35
|
+
node_hash[:path] = @xpath if @xpath
|
36
|
+
|
37
|
+
children.each do |child|
|
38
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
39
|
+
node_hash[child_node_name] = child.to_h
|
40
|
+
end
|
41
|
+
|
42
|
+
node_hash
|
43
|
+
end
|
44
|
+
|
17
45
|
def opts
|
18
46
|
{}
|
19
47
|
end
|
48
|
+
|
49
|
+
def node_type_str
|
50
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
51
|
+
end
|
20
52
|
end
|
21
53
|
end
|
@@ -1,11 +1,10 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
require_relative 'yasuri_text_node'
|
6
4
|
require_relative 'yasuri_struct_node'
|
7
5
|
require_relative 'yasuri_links_node'
|
8
6
|
require_relative 'yasuri_paginate_node'
|
7
|
+
require_relative 'yasuri_map_node'
|
9
8
|
|
10
9
|
module Yasuri
|
11
10
|
class NodeGenerator
|
@@ -15,29 +14,33 @@ module Yasuri
|
|
15
14
|
@nodes
|
16
15
|
end
|
17
16
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
17
|
+
def method_missing(name, pattern=nil, **args, &block)
|
18
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
19
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
20
|
@nodes << node
|
22
21
|
end
|
23
22
|
|
24
|
-
def self.gen(
|
25
|
-
|
26
|
-
opt = [opt].flatten.compact
|
27
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
23
|
+
def self.gen(method_name, xpath, **opt, &block)
|
24
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
|
28
25
|
|
29
|
-
case
|
26
|
+
case method_name
|
30
27
|
when /^text_(.+)$/
|
31
|
-
|
28
|
+
# Todo raise error xpath is not valid
|
29
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
32
30
|
when /^struct_(.+)$/
|
33
|
-
|
31
|
+
# Todo raise error xpath is not valid
|
32
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
34
33
|
when /^links_(.+)$/
|
35
|
-
|
34
|
+
# Todo raise error xpath is not valid
|
35
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
36
36
|
when /^pages_(.+)$/
|
37
|
-
|
37
|
+
# Todo raise error xpath is not valid
|
38
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
39
|
+
when /^map_(.+)$/
|
40
|
+
Yasuri::MapNode.new($1, children, **opt)
|
38
41
|
else
|
39
42
|
nil
|
40
43
|
end
|
41
|
-
end # of self.gen(
|
44
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
42
45
|
end # of class NodeGenerator
|
43
46
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -14,35 +12,47 @@ module Yasuri
|
|
14
12
|
end
|
15
13
|
|
16
14
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] || 5
|
18
|
-
|
19
15
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
16
|
|
21
|
-
child_results = []
|
22
17
|
limit = @limit.nil? ? Float::MAX : @limit
|
18
|
+
child_results = inject_child(agent, page, limit, opt)
|
19
|
+
|
20
|
+
return child_results.map(&:values).flatten if @flatten == true
|
21
|
+
|
22
|
+
child_results
|
23
|
+
end
|
24
|
+
|
25
|
+
def opts
|
26
|
+
{ limit: @limit, flatten: @flatten }
|
27
|
+
end
|
28
|
+
|
29
|
+
def node_type_str
|
30
|
+
"pages".freeze
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def inject_child(agent, page, limit, opt)
|
36
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
37
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
38
|
+
|
39
|
+
child_results = []
|
23
40
|
while page
|
24
41
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
42
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
43
|
[child_name, child_node.inject(agent, page, opt)]
|
27
44
|
end
|
28
45
|
child_results << Hash[child_results_kv]
|
29
46
|
|
30
|
-
link = page.search(@xpath).first
|
31
|
-
break if link
|
47
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
48
|
+
break if link.nil?
|
32
49
|
|
33
50
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
34
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
51
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
35
52
|
break if (limit -= 1) <= 0
|
36
53
|
end
|
37
54
|
|
38
|
-
if @flatten == true
|
39
|
-
return child_results.map{|h| h.values}.flatten
|
40
|
-
end
|
41
|
-
|
42
55
|
child_results
|
43
56
|
end
|
44
|
-
def opts
|
45
|
-
{limit:@limit, flatten:@flatten}
|
46
|
-
end
|
47
57
|
end
|
48
58
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -10,12 +8,16 @@ module Yasuri
|
|
10
8
|
sub_tags = element.search(@xpath)
|
11
9
|
tree = sub_tags.map do |sub_tag|
|
12
10
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
11
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
12
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
13
|
end
|
16
14
|
Hash[child_results_kv]
|
17
15
|
end
|
18
16
|
tree.size == 1 ? tree.first : tree
|
19
|
-
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def node_type_str
|
20
|
+
"struct".freeze
|
21
|
+
end
|
20
22
|
end
|
21
23
|
end
|
@@ -1,24 +1,24 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
class TextNode
|
8
6
|
include Node
|
9
7
|
|
10
|
-
def initialize(xpath, name, children = [],
|
8
|
+
def initialize(xpath, name, children = [], **opt)
|
11
9
|
super(xpath, name, children)
|
12
10
|
|
13
|
-
truncate =
|
11
|
+
truncate = opt[:truncate]
|
12
|
+
proc = opt[:proc]
|
13
|
+
|
14
|
+
truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
|
14
15
|
@truncate = truncate
|
15
|
-
@truncate = Regexp.new(@truncate.to_s)
|
16
|
+
@truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
|
16
17
|
|
17
18
|
@proc = proc.nil? ? nil : proc.to_sym
|
18
|
-
|
19
19
|
end
|
20
20
|
|
21
|
-
def inject(
|
21
|
+
def inject(_agent, page, _opt = {}, element = page)
|
22
22
|
node = element.search(@xpath)
|
23
23
|
text = node.text.to_s
|
24
24
|
|
@@ -28,11 +28,16 @@ module Yasuri
|
|
28
28
|
end
|
29
29
|
|
30
30
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
31
|
+
|
31
32
|
text
|
32
33
|
end
|
33
34
|
|
34
35
|
def opts
|
35
|
-
{truncate
|
36
|
+
{ truncate: @truncate, proc: @proc }
|
37
|
+
end
|
38
|
+
|
39
|
+
def node_type_str
|
40
|
+
"text".freeze
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
data/spec/servers/httpserver.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
1
|
|
4
2
|
require 'glint'
|
5
3
|
Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
|
@@ -12,16 +10,11 @@ shared_context 'httpserver' do
|
|
12
10
|
}
|
13
11
|
end
|
14
12
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
13
|
require 'simplecov'
|
21
14
|
require 'coveralls'
|
22
15
|
Coveralls.wear!
|
23
16
|
|
24
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
17
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
25
18
|
SimpleCov::Formatter::HTMLFormatter,
|
26
19
|
Coveralls::SimpleCov::Formatter
|
27
20
|
]
|
@@ -31,8 +24,8 @@ SimpleCov.start
|
|
31
24
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
32
25
|
require 'yasuri'
|
33
26
|
|
34
|
-
def compare_generated_vs_original(generated, original,
|
35
|
-
expected = original.
|
36
|
-
actual = generated.
|
27
|
+
def compare_generated_vs_original(generated, original, uri)
|
28
|
+
expected = original.scrape(uri)
|
29
|
+
actual = generated.scrape(uri)
|
37
30
|
expect(actual).to match expected
|
38
31
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
let(:res_dir) { File.expand_path('cli_resources', __dir__) }
|
6
|
+
|
7
|
+
describe 'cli scrape' do
|
8
|
+
it 'require --file or --json option' do
|
9
|
+
expect do
|
10
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
11
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'only one of --file or --json option' do
|
15
|
+
expect do
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
|
17
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'require --file option is not empty string' do
|
21
|
+
expect do
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
|
23
|
+
end.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'require --json option is not empty string' do
|
27
|
+
expect do
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
|
29
|
+
end.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'display text node as simple string' do
|
33
|
+
expect do
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
|
35
|
+
end.to output("Yasuri Test\n").to_stdout
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'display texts in single json' do
|
39
|
+
expect do
|
40
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
|
41
|
+
end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'display text node as simple string via json file' do
|
45
|
+
expect do
|
46
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
|
47
|
+
end.to output(
|
48
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
49
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
50
|
+
"\n"
|
51
|
+
).to_stdout
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'display text node as simple string via yaml file' do
|
55
|
+
expect do
|
56
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
|
57
|
+
end.to output(
|
58
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
59
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
60
|
+
"\n"
|
61
|
+
).to_stdout
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'interval option is effect for each request' do
|
65
|
+
allow(Kernel).to receive(:sleep)
|
66
|
+
|
67
|
+
expect do
|
68
|
+
Yasuri::CLI.new.invoke(
|
69
|
+
:scrape,
|
70
|
+
["#{uri}/pagination/page01.html"],
|
71
|
+
{ file: "#{res_dir}/tree.yml", interval: 500 }
|
72
|
+
)
|
73
|
+
end.to output(
|
74
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
75
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
76
|
+
"\n"
|
77
|
+
).to_stdout
|
78
|
+
|
79
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
80
|
+
expect(interval_sec).to match 0.5
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'display ERROR when json string is wrong' do
|
85
|
+
wrong_json = '{,,}'
|
86
|
+
expect do
|
87
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
|
88
|
+
end.to output(
|
89
|
+
'ERROR: Failed to convert json to yasuri tree. ' \
|
90
|
+
"809: unexpected token at '#{wrong_json}'\n"
|
91
|
+
).to_stderr
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'display ERROR when json file contains is wrong' do
|
95
|
+
file_path = "#{res_dir}/tree_wrong.json"
|
96
|
+
expect do
|
97
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
98
|
+
end.to output(
|
99
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
100
|
+
"(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
|
101
|
+
).to_stderr
|
102
|
+
end
|
103
|
+
|
104
|
+
it 'display ERROR when yaml file contains is wrong' do
|
105
|
+
file_path = "#{res_dir}/tree_wrong.yml"
|
106
|
+
expect do
|
107
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
108
|
+
end.to output(
|
109
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
110
|
+
"(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
|
111
|
+
).to_stderr
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|