yasuri 2.0.12 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,15 +7,50 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
+
def scrape(uri, opt = {})
|
15
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
16
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
17
|
+
|
18
|
+
agent = Mechanize.new
|
19
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
20
|
+
scrape_with_agent(uri, agent, opt)
|
21
|
+
end
|
22
|
+
|
23
|
+
def scrape_with_agent(uri, agent, opt = {})
|
24
|
+
page = agent.get(uri)
|
25
|
+
inject(agent, page, opt)
|
26
|
+
end
|
27
|
+
|
14
28
|
def inject(agent, page, opt = {}, element = page)
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
29
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
30
|
+
end
|
31
|
+
|
32
|
+
def to_h
|
33
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
34
|
+
|
35
|
+
node_hash = {}
|
36
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
37
|
+
|
38
|
+
node_hash[:path] = @xpath if @xpath
|
39
|
+
|
40
|
+
children.each do |child|
|
41
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
42
|
+
node_hash[child_node_name] = child.to_h
|
43
|
+
end
|
44
|
+
|
45
|
+
node_hash
|
16
46
|
end
|
47
|
+
|
17
48
|
def opts
|
18
49
|
{}
|
19
50
|
end
|
51
|
+
|
52
|
+
def node_type_str
|
53
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
54
|
+
end
|
20
55
|
end
|
21
56
|
end
|
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
|
|
6
6
|
require_relative 'yasuri_struct_node'
|
7
7
|
require_relative 'yasuri_links_node'
|
8
8
|
require_relative 'yasuri_paginate_node'
|
9
|
+
require_relative 'yasuri_map_node'
|
9
10
|
|
10
11
|
module Yasuri
|
11
12
|
class NodeGenerator
|
@@ -15,29 +16,33 @@ module Yasuri
|
|
15
16
|
@nodes
|
16
17
|
end
|
17
18
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
19
|
+
def method_missing(name, pattern=nil, **args, &block)
|
20
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
21
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
22
|
@nodes << node
|
22
23
|
end
|
23
24
|
|
24
|
-
def self.gen(
|
25
|
-
xpath, opt = *args
|
26
|
-
opt = [opt].flatten.compact
|
25
|
+
def self.gen(method_name, xpath, **opt, &block)
|
27
26
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
27
|
|
29
|
-
case
|
28
|
+
case method_name
|
30
29
|
when /^text_(.+)$/
|
31
|
-
|
30
|
+
# Todo raise error xpath is not valid
|
31
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
32
32
|
when /^struct_(.+)$/
|
33
|
-
|
33
|
+
# Todo raise error xpath is not valid
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
34
35
|
when /^links_(.+)$/
|
35
|
-
|
36
|
+
# Todo raise error xpath is not valid
|
37
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
36
38
|
when /^pages_(.+)$/
|
37
|
-
|
39
|
+
# Todo raise error xpath is not valid
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
41
|
+
when /^map_(.+)$/
|
42
|
+
Yasuri::MapNode.new($1, children, **opt)
|
38
43
|
else
|
39
44
|
nil
|
40
45
|
end
|
41
|
-
end # of self.gen(
|
46
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
42
47
|
end # of class NodeGenerator
|
43
48
|
end
|
@@ -14,7 +14,8 @@ module Yasuri
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] ||
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
18
19
|
|
19
20
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
21
|
|
@@ -22,16 +23,16 @@ module Yasuri
|
|
22
23
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
24
|
while page
|
24
25
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
26
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
27
|
[child_name, child_node.inject(agent, page, opt)]
|
27
28
|
end
|
28
29
|
child_results << Hash[child_results_kv]
|
29
30
|
|
30
|
-
link = page.search(@xpath).first
|
31
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
31
32
|
break if link == nil
|
32
33
|
|
33
34
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
34
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
35
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
35
36
|
break if (limit -= 1) <= 0
|
36
37
|
end
|
37
38
|
|
@@ -41,8 +42,13 @@ module Yasuri
|
|
41
42
|
|
42
43
|
child_results
|
43
44
|
end
|
45
|
+
|
44
46
|
def opts
|
45
47
|
{limit:@limit, flatten:@flatten}
|
46
48
|
end
|
49
|
+
|
50
|
+
def node_type_str
|
51
|
+
"pages".freeze
|
52
|
+
end
|
47
53
|
end
|
48
54
|
end
|
@@ -10,12 +10,16 @@ module Yasuri
|
|
10
10
|
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
14
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -7,15 +7,17 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
+
truncate = opt[:truncate]
|
14
|
+
proc = opt[:proc]
|
15
|
+
|
13
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
14
17
|
@truncate = truncate
|
15
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
16
19
|
|
17
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
18
|
-
|
19
21
|
end
|
20
22
|
|
21
23
|
def inject(agent, page, opt = {}, element = page)
|
@@ -28,11 +30,16 @@ module Yasuri
|
|
28
30
|
end
|
29
31
|
|
30
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
33
|
+
|
31
34
|
text
|
32
35
|
end
|
33
36
|
|
34
37
|
def opts
|
35
38
|
{truncate:@truncate, proc:@proc}
|
36
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
37
44
|
end
|
38
45
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
|
|
12
12
|
}
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
15
|
require 'simplecov'
|
21
16
|
require 'coveralls'
|
22
17
|
Coveralls.wear!
|
23
18
|
|
24
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
25
20
|
SimpleCov::Formatter::HTMLFormatter,
|
26
21
|
Coveralls::SimpleCov::Formatter
|
27
22
|
]
|
@@ -31,8 +26,8 @@ SimpleCov.start
|
|
31
26
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
32
27
|
require 'yasuri'
|
33
28
|
|
34
|
-
def compare_generated_vs_original(generated, original,
|
35
|
-
expected = original.
|
36
|
-
actual = generated.
|
29
|
+
def compare_generated_vs_original(generated, original, uri)
|
30
|
+
expected = original.scrape(uri)
|
31
|
+
actual = generated.scrape(uri)
|
37
32
|
expect(actual).to match expected
|
38
33
|
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
|
10
|
+
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'cli scrape' do
|
14
|
+
it "require --file or --json option" do
|
15
|
+
expect {
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it "only one of --file or --json option" do
|
21
|
+
expect {
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
|
23
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it "require --file option is not empty string" do
|
27
|
+
expect {
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
|
29
|
+
}.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it "require --json option is not empty string" do
|
33
|
+
expect {
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
|
35
|
+
}.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
it "display text node as simple string" do
|
40
|
+
expect {
|
41
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
|
42
|
+
}.to output("Yasuri Test\n").to_stdout
|
43
|
+
end
|
44
|
+
|
45
|
+
it "display texts in single json" do
|
46
|
+
expect {
|
47
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
+
}.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
it "display text node as simple string via json file" do
|
53
|
+
expect {
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
56
|
+
end
|
57
|
+
it "display text node as simple string via yaml file" do
|
58
|
+
expect {
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
61
|
+
end
|
62
|
+
|
63
|
+
it "interval option is effect for each request" do
|
64
|
+
allow(Kernel).to receive(:sleep)
|
65
|
+
|
66
|
+
Yasuri::CLI.new.invoke(
|
67
|
+
:scrape,
|
68
|
+
[uri+"/pagination/page01.html"],
|
69
|
+
{file: "#{@res_dir}/tree.yml", interval: 500}
|
70
|
+
)
|
71
|
+
|
72
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
73
|
+
expect(interval_sec).to match 0.5
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
it "display ERROR when json string is wrong" do
|
78
|
+
wrong_json = '{,,}'
|
79
|
+
expect {
|
80
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
81
|
+
}.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
|
82
|
+
end
|
83
|
+
it "display ERROR when json file contains is wrong" do
|
84
|
+
file_path = "#{@res_dir}/tree_wrong.json"
|
85
|
+
expect {
|
86
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
87
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
|
88
|
+
end
|
89
|
+
it "display ERROR when yaml file contains is wrong" do
|
90
|
+
file_path = "#{@res_dir}/tree_wrong.yml"
|
91
|
+
expect {
|
92
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
93
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -11,9 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::LinksNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
14
|
@uri = uri
|
16
|
-
@index_page = @agent.get(@uri)
|
17
15
|
end
|
18
16
|
|
19
17
|
it 'scrape links' do
|
@@ -21,7 +19,7 @@ describe 'Yasuri' do
|
|
21
19
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
20
|
])
|
23
21
|
|
24
|
-
actual = root_node.
|
22
|
+
actual = root_node.scrape(@uri)
|
25
23
|
expected = [
|
26
24
|
{"content" => "Child 01 page."},
|
27
25
|
{"content" => "Child 02 page."},
|
@@ -36,7 +34,7 @@ describe 'Yasuri' do
|
|
36
34
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
37
35
|
])
|
38
36
|
|
39
|
-
actual = root_node.
|
37
|
+
actual = root_node.scrape(@uri)
|
40
38
|
expect(actual).to be_empty
|
41
39
|
end
|
42
40
|
|
@@ -47,7 +45,7 @@ describe 'Yasuri' do
|
|
47
45
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
48
46
|
]),
|
49
47
|
])
|
50
|
-
actual = root_node.
|
48
|
+
actual = root_node.scrape(@uri)
|
51
49
|
expected = [
|
52
50
|
{"content" => "Child 01 page.",
|
53
51
|
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
@@ -59,10 +57,18 @@ describe 'Yasuri' do
|
|
59
57
|
]
|
60
58
|
expect(actual).to match expected
|
61
59
|
end
|
62
|
-
it 'can be defined by DSL, return
|
63
|
-
|
64
|
-
|
65
|
-
|
60
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
61
|
+
root_node = Yasuri.links_title '/html/body/a'
|
62
|
+
actual = root_node.scrape(@uri)
|
63
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
64
|
+
expect(actual).to match expected
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'can be defined return no contains if no child node' do
|
68
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
69
|
+
actual = root_node.scrape(@uri)
|
70
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
71
|
+
expect(actual).to match expected
|
66
72
|
end
|
67
73
|
it 'can be defined by DSL, return nested contents under link' do
|
68
74
|
generated = Yasuri.links_title '/html/body/a' do
|
@@ -71,7 +77,7 @@ describe 'Yasuri' do
|
|
71
77
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
72
78
|
Yasuri::TextNode.new('/html/body/p', "name"),
|
73
79
|
])
|
74
|
-
compare_generated_vs_original(generated, original, @
|
80
|
+
compare_generated_vs_original(generated, original, @uri)
|
75
81
|
end
|
76
82
|
|
77
83
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -88,7 +94,7 @@ describe 'Yasuri' do
|
|
88
94
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
89
95
|
]),
|
90
96
|
])
|
91
|
-
compare_generated_vs_original(generated, original, @
|
97
|
+
compare_generated_vs_original(generated, original, @uri)
|
92
98
|
end
|
93
99
|
|
94
100
|
it 'return child node as symbol' do
|
@@ -96,7 +102,7 @@ describe 'Yasuri' do
|
|
96
102
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
97
103
|
])
|
98
104
|
|
99
|
-
actual = root_node.
|
105
|
+
actual = root_node.scrape(@uri, symbolize_names: true )
|
100
106
|
expected = [
|
101
107
|
{:content => "Child 01 page."},
|
102
108
|
{:content => "Child 02 page."},
|
@@ -104,5 +110,21 @@ describe 'Yasuri' do
|
|
104
110
|
]
|
105
111
|
expect(actual).to match expected
|
106
112
|
end
|
113
|
+
|
114
|
+
it 'scrape with interval for each request' do
|
115
|
+
allow(Kernel).to receive(:sleep)
|
116
|
+
|
117
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
118
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
119
|
+
])
|
120
|
+
actual = root_node.scrape(@uri, interval_ms: 100)
|
121
|
+
|
122
|
+
expect(actual.size).to match 3
|
123
|
+
|
124
|
+
# request will be run 4(1+3) times because root page will be requested
|
125
|
+
expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
|
126
|
+
expect(interval_sec).to match 0.1
|
127
|
+
end
|
128
|
+
end
|
107
129
|
end
|
108
130
|
end
|