yasuri 3.3.0 → 3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
require_relative 'yasuri_text_node'
|
6
4
|
require_relative 'yasuri_struct_node'
|
@@ -23,7 +21,7 @@ module Yasuri
|
|
23
21
|
end
|
24
22
|
|
25
23
|
def self.gen(method_name, xpath, **opt, &block)
|
26
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if
|
24
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
|
27
25
|
|
28
26
|
case method_name
|
29
27
|
when /^text_(.+)$/
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -14,13 +12,31 @@ module Yasuri
|
|
14
12
|
end
|
15
13
|
|
16
14
|
def inject(agent, page, opt = {}, element = page)
|
15
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
16
|
+
|
17
|
+
limit = @limit.nil? ? Float::MAX : @limit
|
18
|
+
child_results = inject_child(agent, page, limit, opt)
|
19
|
+
|
20
|
+
return child_results.map(&:values).flatten if @flatten == true
|
21
|
+
|
22
|
+
child_results
|
23
|
+
end
|
24
|
+
|
25
|
+
def opts
|
26
|
+
{ limit: @limit, flatten: @flatten }
|
27
|
+
end
|
28
|
+
|
29
|
+
def node_type_str
|
30
|
+
"pages".freeze
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def inject_child(agent, page, limit, opt)
|
17
36
|
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
37
|
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
19
38
|
|
20
|
-
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
21
|
-
|
22
39
|
child_results = []
|
23
|
-
limit = @limit.nil? ? Float::MAX : @limit
|
24
40
|
while page
|
25
41
|
child_results_kv = @children.map do |child_node|
|
26
42
|
child_name = Yasuri.node_name(child_node.name, opt)
|
@@ -29,26 +45,14 @@ module Yasuri
|
|
29
45
|
child_results << Hash[child_results_kv]
|
30
46
|
|
31
47
|
link = page.search(@xpath).first # Todo raise: link is not found
|
32
|
-
break if link
|
48
|
+
break if link.nil?
|
33
49
|
|
34
50
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
35
51
|
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
36
52
|
break if (limit -= 1) <= 0
|
37
53
|
end
|
38
54
|
|
39
|
-
if @flatten == true
|
40
|
-
return child_results.map{|h| h.values}.flatten
|
41
|
-
end
|
42
|
-
|
43
55
|
child_results
|
44
56
|
end
|
45
|
-
|
46
|
-
def opts
|
47
|
-
{limit:@limit, flatten:@flatten}
|
48
|
-
end
|
49
|
-
|
50
|
-
def node_type_str
|
51
|
-
"pages".freeze
|
52
|
-
end
|
53
57
|
end
|
54
58
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -16,7 +14,7 @@ module Yasuri
|
|
16
14
|
Hash[child_results_kv]
|
17
15
|
end
|
18
16
|
tree.size == 1 ? tree.first : tree
|
19
|
-
end
|
17
|
+
end
|
20
18
|
|
21
19
|
def node_type_str
|
22
20
|
"struct".freeze
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -13,14 +11,14 @@ module Yasuri
|
|
13
11
|
truncate = opt[:truncate]
|
14
12
|
proc = opt[:proc]
|
15
13
|
|
16
|
-
truncate = Regexp.new(truncate)
|
14
|
+
truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
|
17
15
|
@truncate = truncate
|
18
|
-
@truncate = Regexp.new(@truncate.to_s)
|
16
|
+
@truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
|
19
17
|
|
20
18
|
@proc = proc.nil? ? nil : proc.to_sym
|
21
19
|
end
|
22
20
|
|
23
|
-
def inject(
|
21
|
+
def inject(_agent, page, _opt = {}, element = page)
|
24
22
|
node = element.search(@xpath)
|
25
23
|
text = node.text.to_s
|
26
24
|
|
@@ -35,7 +33,7 @@ module Yasuri
|
|
35
33
|
end
|
36
34
|
|
37
35
|
def opts
|
38
|
-
{truncate
|
36
|
+
{ truncate: @truncate, proc: @proc }
|
39
37
|
end
|
40
38
|
|
41
39
|
def node_type_str
|
data/spec/servers/httpserver.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
data/spec/yasuri_cli_spec.rb
CHANGED
@@ -2,95 +2,113 @@ require_relative 'spec_helper'
|
|
2
2
|
|
3
3
|
describe 'Yasuri' do
|
4
4
|
include_context 'httpserver'
|
5
|
-
|
6
|
-
before do
|
7
|
-
@agent = Mechanize.new
|
8
|
-
@index_page = @agent.get(uri)
|
9
|
-
|
10
|
-
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
-
end
|
5
|
+
let(:res_dir) { File.expand_path('cli_resources', __dir__) }
|
12
6
|
|
13
7
|
describe 'cli scrape' do
|
14
|
-
it
|
15
|
-
expect
|
8
|
+
it 'require --file or --json option' do
|
9
|
+
expect do
|
16
10
|
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
-
|
11
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
12
|
end
|
19
13
|
|
20
|
-
it
|
21
|
-
expect
|
22
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file:
|
23
|
-
|
14
|
+
it 'only one of --file or --json option' do
|
15
|
+
expect do
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
|
17
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
18
|
end
|
25
19
|
|
26
|
-
it
|
27
|
-
expect
|
28
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file:
|
29
|
-
|
20
|
+
it 'require --file option is not empty string' do
|
21
|
+
expect do
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
|
23
|
+
end.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
24
|
end
|
31
25
|
|
32
|
-
it
|
33
|
-
expect
|
34
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json:
|
35
|
-
|
26
|
+
it 'require --json option is not empty string' do
|
27
|
+
expect do
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
|
29
|
+
end.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
30
|
end
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
}.to output("Yasuri Test\n").to_stdout
|
32
|
+
it 'display text node as simple string' do
|
33
|
+
expect do
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
|
35
|
+
end.to output("Yasuri Test\n").to_stdout
|
43
36
|
end
|
44
37
|
|
45
|
-
it
|
46
|
-
expect
|
47
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
-
|
38
|
+
it 'display texts in single json' do
|
39
|
+
expect do
|
40
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
|
41
|
+
end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
|
49
42
|
end
|
50
43
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
44
|
+
it 'display text node as simple string via json file' do
|
45
|
+
expect do
|
46
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
|
47
|
+
end.to output(
|
48
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
49
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
50
|
+
"\n"
|
51
|
+
).to_stdout
|
56
52
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
53
|
+
|
54
|
+
it 'display text node as simple string via yaml file' do
|
55
|
+
expect do
|
56
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
|
57
|
+
end.to output(
|
58
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
59
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
60
|
+
"\n"
|
61
|
+
).to_stdout
|
61
62
|
end
|
62
63
|
|
63
|
-
it
|
64
|
+
it 'interval option is effect for each request' do
|
64
65
|
allow(Kernel).to receive(:sleep)
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
67
|
+
expect do
|
68
|
+
Yasuri::CLI.new.invoke(
|
69
|
+
:scrape,
|
70
|
+
["#{uri}/pagination/page01.html"],
|
71
|
+
{ file: "#{res_dir}/tree.yml", interval: 500 }
|
72
|
+
)
|
73
|
+
end.to output(
|
74
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
75
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
76
|
+
"\n"
|
77
|
+
).to_stdout
|
71
78
|
|
72
79
|
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
73
80
|
expect(interval_sec).to match 0.5
|
74
81
|
end
|
75
82
|
end
|
76
83
|
|
77
|
-
it
|
84
|
+
it 'display ERROR when json string is wrong' do
|
78
85
|
wrong_json = '{,,}'
|
79
|
-
expect
|
80
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
81
|
-
|
86
|
+
expect do
|
87
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
|
88
|
+
end.to output(
|
89
|
+
'ERROR: Failed to convert json to yasuri tree. ' \
|
90
|
+
"809: unexpected token at '#{wrong_json}'\n"
|
91
|
+
).to_stderr
|
82
92
|
end
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
93
|
+
|
94
|
+
it 'display ERROR when json file contains is wrong' do
|
95
|
+
file_path = "#{res_dir}/tree_wrong.json"
|
96
|
+
expect do
|
97
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
98
|
+
end.to output(
|
99
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
100
|
+
"(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
|
101
|
+
).to_stderr
|
88
102
|
end
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
103
|
+
|
104
|
+
it 'display ERROR when yaml file contains is wrong' do
|
105
|
+
file_path = "#{res_dir}/tree_wrong.yml"
|
106
|
+
expect do
|
107
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
108
|
+
end.to output(
|
109
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
110
|
+
"(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
|
111
|
+
).to_stderr
|
94
112
|
end
|
95
113
|
end
|
96
|
-
end
|
114
|
+
end
|
@@ -1,83 +1,86 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
#########
|
7
|
-
# Links #
|
8
|
-
#########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::LinksNode' do
|
13
|
-
before do
|
14
|
-
@uri = uri
|
15
|
-
end
|
16
|
-
|
17
8
|
it 'scrape links' do
|
18
|
-
root_node = Yasuri::LinksNode.new(
|
19
|
-
|
20
|
-
|
9
|
+
root_node = Yasuri::LinksNode.new(
|
10
|
+
'/html/body/a', "root", [
|
11
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
12
|
+
]
|
13
|
+
)
|
21
14
|
|
22
|
-
actual = root_node.scrape(
|
15
|
+
actual = root_node.scrape(uri)
|
23
16
|
expected = [
|
24
|
-
{"content" => "Child 01 page."},
|
25
|
-
{"content" => "Child 02 page."},
|
26
|
-
{"content" => "Child 03 page."}
|
17
|
+
{ "content" => "Child 01 page." },
|
18
|
+
{ "content" => "Child 02 page." },
|
19
|
+
{ "content" => "Child 03 page." }
|
27
20
|
]
|
28
21
|
expect(actual).to match expected
|
29
22
|
end
|
30
23
|
|
31
24
|
it 'return empty set if no match node' do
|
32
25
|
missing_xpath = '/html/body/b'
|
33
|
-
root_node = Yasuri::LinksNode.new(
|
34
|
-
|
35
|
-
|
26
|
+
root_node = Yasuri::LinksNode.new(
|
27
|
+
missing_xpath, "root", [
|
28
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
29
|
+
]
|
30
|
+
)
|
36
31
|
|
37
|
-
actual = root_node.scrape(
|
32
|
+
actual = root_node.scrape(uri)
|
38
33
|
expect(actual).to be_empty
|
39
34
|
end
|
40
35
|
|
41
36
|
it 'scrape links, recursive' do
|
42
|
-
root_node = Yasuri::LinksNode.new(
|
43
|
-
|
44
|
-
|
45
|
-
Yasuri::
|
46
|
-
|
47
|
-
|
48
|
-
|
37
|
+
root_node = Yasuri::LinksNode.new(
|
38
|
+
'/html/body/a', "root", [
|
39
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
40
|
+
Yasuri::LinksNode.new(
|
41
|
+
'/html/body/ul/li/a', "sub_link", [
|
42
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
43
|
+
]
|
44
|
+
)
|
45
|
+
]
|
46
|
+
)
|
47
|
+
actual = root_node.scrape(uri)
|
49
48
|
expected = [
|
50
|
-
{"content"
|
51
|
-
|
52
|
-
|
53
|
-
{"content" => "Child 02 page.",
|
54
|
-
|
55
|
-
{"content" => "Child 03 page.",
|
56
|
-
|
49
|
+
{ "content" => "Child 01 page.",
|
50
|
+
"sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
|
51
|
+
{ "sub_page_title" => "Child 02 SubPage Test" }] },
|
52
|
+
{ "content" => "Child 02 page.",
|
53
|
+
"sub_link" => [] },
|
54
|
+
{ "content" => "Child 03 page.",
|
55
|
+
"sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
|
57
56
|
]
|
58
57
|
expect(actual).to match expected
|
59
58
|
end
|
59
|
+
|
60
60
|
it 'can be defined by DSL, return no contains if no child node' do
|
61
61
|
root_node = Yasuri.links_title '/html/body/a'
|
62
|
-
actual = root_node.scrape(
|
62
|
+
actual = root_node.scrape(uri)
|
63
63
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
64
64
|
expect(actual).to match expected
|
65
65
|
end
|
66
66
|
|
67
67
|
it 'can be defined return no contains if no child node' do
|
68
68
|
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
69
|
-
actual = root_node.scrape(
|
69
|
+
actual = root_node.scrape(uri)
|
70
70
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
71
71
|
expect(actual).to match expected
|
72
72
|
end
|
73
|
+
|
73
74
|
it 'can be defined by DSL, return nested contents under link' do
|
74
75
|
generated = Yasuri.links_title '/html/body/a' do
|
75
|
-
|
76
|
-
|
77
|
-
original = Yasuri::LinksNode.new(
|
78
|
-
|
79
|
-
|
80
|
-
|
76
|
+
text_name '/html/body/p'
|
77
|
+
end
|
78
|
+
original = Yasuri::LinksNode.new(
|
79
|
+
'/html/body/a', "root", [
|
80
|
+
Yasuri::TextNode.new('/html/body/p', "name")
|
81
|
+
]
|
82
|
+
)
|
83
|
+
compare_generated_vs_original(generated, original, uri)
|
81
84
|
end
|
82
85
|
|
83
86
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -88,25 +91,31 @@ describe 'Yasuri' do
|
|
88
91
|
end
|
89
92
|
end
|
90
93
|
|
91
|
-
original = Yasuri::LinksNode.new(
|
92
|
-
|
93
|
-
|
94
|
-
Yasuri::
|
95
|
-
|
96
|
-
|
97
|
-
|
94
|
+
original = Yasuri::LinksNode.new(
|
95
|
+
'/html/body/a', "root", [
|
96
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
97
|
+
Yasuri::LinksNode.new(
|
98
|
+
'/html/body/ul/li/a', "sub_link", [
|
99
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
100
|
+
]
|
101
|
+
)
|
102
|
+
]
|
103
|
+
)
|
104
|
+
compare_generated_vs_original(generated, original, uri)
|
98
105
|
end
|
99
106
|
|
100
107
|
it 'return child node as symbol' do
|
101
|
-
root_node = Yasuri::LinksNode.new(
|
102
|
-
|
103
|
-
|
108
|
+
root_node = Yasuri::LinksNode.new(
|
109
|
+
'/html/body/a', "root", [
|
110
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
111
|
+
]
|
112
|
+
)
|
104
113
|
|
105
|
-
actual = root_node.scrape(
|
114
|
+
actual = root_node.scrape(uri, symbolize_names: true)
|
106
115
|
expected = [
|
107
|
-
{:
|
108
|
-
{:
|
109
|
-
{:
|
116
|
+
{ content: "Child 01 page." },
|
117
|
+
{ content: "Child 02 page." },
|
118
|
+
{ content: "Child 03 page." }
|
110
119
|
]
|
111
120
|
expect(actual).to match expected
|
112
121
|
end
|
@@ -114,15 +123,16 @@ describe 'Yasuri' do
|
|
114
123
|
it 'scrape with interval for each request' do
|
115
124
|
allow(Kernel).to receive(:sleep)
|
116
125
|
|
117
|
-
root_node = Yasuri::LinksNode.new(
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
126
|
+
root_node = Yasuri::LinksNode.new(
|
127
|
+
'/html/body/a', "root", [
|
128
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
129
|
+
]
|
130
|
+
)
|
131
|
+
actual = root_node.scrape(uri, interval_ms: 100)
|
122
132
|
expect(actual.size).to match 3
|
123
133
|
|
124
134
|
# request will be run 4(1+3) times because root page will be requested
|
125
|
-
expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
|
135
|
+
expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
|
126
136
|
expect(interval_sec).to match 0.1
|
127
137
|
end
|
128
138
|
end
|