yasuri 3.3.0 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
require_relative 'yasuri_text_node'
|
6
4
|
require_relative 'yasuri_struct_node'
|
@@ -23,7 +21,7 @@ module Yasuri
|
|
23
21
|
end
|
24
22
|
|
25
23
|
def self.gen(method_name, xpath, **opt, &block)
|
26
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if
|
24
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
|
27
25
|
|
28
26
|
case method_name
|
29
27
|
when /^text_(.+)$/
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -14,13 +12,31 @@ module Yasuri
|
|
14
12
|
end
|
15
13
|
|
16
14
|
def inject(agent, page, opt = {}, element = page)
|
15
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
16
|
+
|
17
|
+
limit = @limit.nil? ? Float::MAX : @limit
|
18
|
+
child_results = inject_child(agent, page, limit, opt)
|
19
|
+
|
20
|
+
return child_results.map(&:values).flatten if @flatten == true
|
21
|
+
|
22
|
+
child_results
|
23
|
+
end
|
24
|
+
|
25
|
+
def opts
|
26
|
+
{ limit: @limit, flatten: @flatten }
|
27
|
+
end
|
28
|
+
|
29
|
+
def node_type_str
|
30
|
+
"pages".freeze
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def inject_child(agent, page, limit, opt)
|
17
36
|
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
37
|
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
19
38
|
|
20
|
-
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
21
|
-
|
22
39
|
child_results = []
|
23
|
-
limit = @limit.nil? ? Float::MAX : @limit
|
24
40
|
while page
|
25
41
|
child_results_kv = @children.map do |child_node|
|
26
42
|
child_name = Yasuri.node_name(child_node.name, opt)
|
@@ -29,26 +45,14 @@ module Yasuri
|
|
29
45
|
child_results << Hash[child_results_kv]
|
30
46
|
|
31
47
|
link = page.search(@xpath).first # Todo raise: link is not found
|
32
|
-
break if link
|
48
|
+
break if link.nil?
|
33
49
|
|
34
50
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
35
51
|
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
36
52
|
break if (limit -= 1) <= 0
|
37
53
|
end
|
38
54
|
|
39
|
-
if @flatten == true
|
40
|
-
return child_results.map{|h| h.values}.flatten
|
41
|
-
end
|
42
|
-
|
43
55
|
child_results
|
44
56
|
end
|
45
|
-
|
46
|
-
def opts
|
47
|
-
{limit:@limit, flatten:@flatten}
|
48
|
-
end
|
49
|
-
|
50
|
-
def node_type_str
|
51
|
-
"pages".freeze
|
52
|
-
end
|
53
57
|
end
|
54
58
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -16,7 +14,7 @@ module Yasuri
|
|
16
14
|
Hash[child_results_kv]
|
17
15
|
end
|
18
16
|
tree.size == 1 ? tree.first : tree
|
19
|
-
end
|
17
|
+
end
|
20
18
|
|
21
19
|
def node_type_str
|
22
20
|
"struct".freeze
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -13,14 +11,14 @@ module Yasuri
|
|
13
11
|
truncate = opt[:truncate]
|
14
12
|
proc = opt[:proc]
|
15
13
|
|
16
|
-
truncate = Regexp.new(truncate)
|
14
|
+
truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
|
17
15
|
@truncate = truncate
|
18
|
-
@truncate = Regexp.new(@truncate.to_s)
|
16
|
+
@truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
|
19
17
|
|
20
18
|
@proc = proc.nil? ? nil : proc.to_sym
|
21
19
|
end
|
22
20
|
|
23
|
-
def inject(
|
21
|
+
def inject(_agent, page, _opt = {}, element = page)
|
24
22
|
node = element.search(@xpath)
|
25
23
|
text = node.text.to_s
|
26
24
|
|
@@ -35,7 +33,7 @@ module Yasuri
|
|
35
33
|
end
|
36
34
|
|
37
35
|
def opts
|
38
|
-
{truncate
|
36
|
+
{ truncate: @truncate, proc: @proc }
|
39
37
|
end
|
40
38
|
|
41
39
|
def node_type_str
|
data/spec/servers/httpserver.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
data/spec/yasuri_cli_spec.rb
CHANGED
@@ -2,95 +2,113 @@ require_relative 'spec_helper'
|
|
2
2
|
|
3
3
|
describe 'Yasuri' do
|
4
4
|
include_context 'httpserver'
|
5
|
-
|
6
|
-
before do
|
7
|
-
@agent = Mechanize.new
|
8
|
-
@index_page = @agent.get(uri)
|
9
|
-
|
10
|
-
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
-
end
|
5
|
+
let(:res_dir) { File.expand_path('cli_resources', __dir__) }
|
12
6
|
|
13
7
|
describe 'cli scrape' do
|
14
|
-
it
|
15
|
-
expect
|
8
|
+
it 'require --file or --json option' do
|
9
|
+
expect do
|
16
10
|
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
-
|
11
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
12
|
end
|
19
13
|
|
20
|
-
it
|
21
|
-
expect
|
22
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file:
|
23
|
-
|
14
|
+
it 'only one of --file or --json option' do
|
15
|
+
expect do
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
|
17
|
+
end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
18
|
end
|
25
19
|
|
26
|
-
it
|
27
|
-
expect
|
28
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file:
|
29
|
-
|
20
|
+
it 'require --file option is not empty string' do
|
21
|
+
expect do
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
|
23
|
+
end.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
24
|
end
|
31
25
|
|
32
|
-
it
|
33
|
-
expect
|
34
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json:
|
35
|
-
|
26
|
+
it 'require --json option is not empty string' do
|
27
|
+
expect do
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
|
29
|
+
end.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
30
|
end
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
}.to output("Yasuri Test\n").to_stdout
|
32
|
+
it 'display text node as simple string' do
|
33
|
+
expect do
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
|
35
|
+
end.to output("Yasuri Test\n").to_stdout
|
43
36
|
end
|
44
37
|
|
45
|
-
it
|
46
|
-
expect
|
47
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
-
|
38
|
+
it 'display texts in single json' do
|
39
|
+
expect do
|
40
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
|
41
|
+
end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
|
49
42
|
end
|
50
43
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
44
|
+
it 'display text node as simple string via json file' do
|
45
|
+
expect do
|
46
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
|
47
|
+
end.to output(
|
48
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
49
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
50
|
+
"\n"
|
51
|
+
).to_stdout
|
56
52
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
53
|
+
|
54
|
+
it 'display text node as simple string via yaml file' do
|
55
|
+
expect do
|
56
|
+
Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
|
57
|
+
end.to output(
|
58
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
59
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
60
|
+
"\n"
|
61
|
+
).to_stdout
|
61
62
|
end
|
62
63
|
|
63
|
-
it
|
64
|
+
it 'interval option is effect for each request' do
|
64
65
|
allow(Kernel).to receive(:sleep)
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
67
|
+
expect do
|
68
|
+
Yasuri::CLI.new.invoke(
|
69
|
+
:scrape,
|
70
|
+
["#{uri}/pagination/page01.html"],
|
71
|
+
{ file: "#{res_dir}/tree.yml", interval: 500 }
|
72
|
+
)
|
73
|
+
end.to output(
|
74
|
+
'[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
|
75
|
+
'{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
|
76
|
+
"\n"
|
77
|
+
).to_stdout
|
71
78
|
|
72
79
|
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
73
80
|
expect(interval_sec).to match 0.5
|
74
81
|
end
|
75
82
|
end
|
76
83
|
|
77
|
-
it
|
84
|
+
it 'display ERROR when json string is wrong' do
|
78
85
|
wrong_json = '{,,}'
|
79
|
-
expect
|
80
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
81
|
-
|
86
|
+
expect do
|
87
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
|
88
|
+
end.to output(
|
89
|
+
'ERROR: Failed to convert json to yasuri tree. ' \
|
90
|
+
"809: unexpected token at '#{wrong_json}'\n"
|
91
|
+
).to_stderr
|
82
92
|
end
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
93
|
+
|
94
|
+
it 'display ERROR when json file contains is wrong' do
|
95
|
+
file_path = "#{res_dir}/tree_wrong.json"
|
96
|
+
expect do
|
97
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
98
|
+
end.to output(
|
99
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
100
|
+
"(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
|
101
|
+
).to_stderr
|
88
102
|
end
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
103
|
+
|
104
|
+
it 'display ERROR when yaml file contains is wrong' do
|
105
|
+
file_path = "#{res_dir}/tree_wrong.yml"
|
106
|
+
expect do
|
107
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
|
108
|
+
end.to output(
|
109
|
+
"ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
|
110
|
+
"(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
|
111
|
+
).to_stderr
|
94
112
|
end
|
95
113
|
end
|
96
|
-
end
|
114
|
+
end
|
@@ -1,83 +1,86 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'spec_helper'
|
5
3
|
|
6
|
-
#########
|
7
|
-
# Links #
|
8
|
-
#########
|
9
4
|
describe 'Yasuri' do
|
10
5
|
include_context 'httpserver'
|
11
6
|
|
12
7
|
describe '::LinksNode' do
|
13
|
-
before do
|
14
|
-
@uri = uri
|
15
|
-
end
|
16
|
-
|
17
8
|
it 'scrape links' do
|
18
|
-
root_node = Yasuri::LinksNode.new(
|
19
|
-
|
20
|
-
|
9
|
+
root_node = Yasuri::LinksNode.new(
|
10
|
+
'/html/body/a', "root", [
|
11
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
12
|
+
]
|
13
|
+
)
|
21
14
|
|
22
|
-
actual = root_node.scrape(
|
15
|
+
actual = root_node.scrape(uri)
|
23
16
|
expected = [
|
24
|
-
{"content" => "Child 01 page."},
|
25
|
-
{"content" => "Child 02 page."},
|
26
|
-
{"content" => "Child 03 page."}
|
17
|
+
{ "content" => "Child 01 page." },
|
18
|
+
{ "content" => "Child 02 page." },
|
19
|
+
{ "content" => "Child 03 page." }
|
27
20
|
]
|
28
21
|
expect(actual).to match expected
|
29
22
|
end
|
30
23
|
|
31
24
|
it 'return empty set if no match node' do
|
32
25
|
missing_xpath = '/html/body/b'
|
33
|
-
root_node = Yasuri::LinksNode.new(
|
34
|
-
|
35
|
-
|
26
|
+
root_node = Yasuri::LinksNode.new(
|
27
|
+
missing_xpath, "root", [
|
28
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
29
|
+
]
|
30
|
+
)
|
36
31
|
|
37
|
-
actual = root_node.scrape(
|
32
|
+
actual = root_node.scrape(uri)
|
38
33
|
expect(actual).to be_empty
|
39
34
|
end
|
40
35
|
|
41
36
|
it 'scrape links, recursive' do
|
42
|
-
root_node = Yasuri::LinksNode.new(
|
43
|
-
|
44
|
-
|
45
|
-
Yasuri::
|
46
|
-
|
47
|
-
|
48
|
-
|
37
|
+
root_node = Yasuri::LinksNode.new(
|
38
|
+
'/html/body/a', "root", [
|
39
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
40
|
+
Yasuri::LinksNode.new(
|
41
|
+
'/html/body/ul/li/a', "sub_link", [
|
42
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
43
|
+
]
|
44
|
+
)
|
45
|
+
]
|
46
|
+
)
|
47
|
+
actual = root_node.scrape(uri)
|
49
48
|
expected = [
|
50
|
-
{"content"
|
51
|
-
|
52
|
-
|
53
|
-
{"content" => "Child 02 page.",
|
54
|
-
|
55
|
-
{"content" => "Child 03 page.",
|
56
|
-
|
49
|
+
{ "content" => "Child 01 page.",
|
50
|
+
"sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
|
51
|
+
{ "sub_page_title" => "Child 02 SubPage Test" }] },
|
52
|
+
{ "content" => "Child 02 page.",
|
53
|
+
"sub_link" => [] },
|
54
|
+
{ "content" => "Child 03 page.",
|
55
|
+
"sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
|
57
56
|
]
|
58
57
|
expect(actual).to match expected
|
59
58
|
end
|
59
|
+
|
60
60
|
it 'can be defined by DSL, return no contains if no child node' do
|
61
61
|
root_node = Yasuri.links_title '/html/body/a'
|
62
|
-
actual = root_node.scrape(
|
62
|
+
actual = root_node.scrape(uri)
|
63
63
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
64
64
|
expect(actual).to match expected
|
65
65
|
end
|
66
66
|
|
67
67
|
it 'can be defined return no contains if no child node' do
|
68
68
|
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
69
|
-
actual = root_node.scrape(
|
69
|
+
actual = root_node.scrape(uri)
|
70
70
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
71
71
|
expect(actual).to match expected
|
72
72
|
end
|
73
|
+
|
73
74
|
it 'can be defined by DSL, return nested contents under link' do
|
74
75
|
generated = Yasuri.links_title '/html/body/a' do
|
75
|
-
|
76
|
-
|
77
|
-
original = Yasuri::LinksNode.new(
|
78
|
-
|
79
|
-
|
80
|
-
|
76
|
+
text_name '/html/body/p'
|
77
|
+
end
|
78
|
+
original = Yasuri::LinksNode.new(
|
79
|
+
'/html/body/a', "root", [
|
80
|
+
Yasuri::TextNode.new('/html/body/p', "name")
|
81
|
+
]
|
82
|
+
)
|
83
|
+
compare_generated_vs_original(generated, original, uri)
|
81
84
|
end
|
82
85
|
|
83
86
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -88,25 +91,31 @@ describe 'Yasuri' do
|
|
88
91
|
end
|
89
92
|
end
|
90
93
|
|
91
|
-
original = Yasuri::LinksNode.new(
|
92
|
-
|
93
|
-
|
94
|
-
Yasuri::
|
95
|
-
|
96
|
-
|
97
|
-
|
94
|
+
original = Yasuri::LinksNode.new(
|
95
|
+
'/html/body/a', "root", [
|
96
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
97
|
+
Yasuri::LinksNode.new(
|
98
|
+
'/html/body/ul/li/a', "sub_link", [
|
99
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title")
|
100
|
+
]
|
101
|
+
)
|
102
|
+
]
|
103
|
+
)
|
104
|
+
compare_generated_vs_original(generated, original, uri)
|
98
105
|
end
|
99
106
|
|
100
107
|
it 'return child node as symbol' do
|
101
|
-
root_node = Yasuri::LinksNode.new(
|
102
|
-
|
103
|
-
|
108
|
+
root_node = Yasuri::LinksNode.new(
|
109
|
+
'/html/body/a', "root", [
|
110
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
111
|
+
]
|
112
|
+
)
|
104
113
|
|
105
|
-
actual = root_node.scrape(
|
114
|
+
actual = root_node.scrape(uri, symbolize_names: true)
|
106
115
|
expected = [
|
107
|
-
{:
|
108
|
-
{:
|
109
|
-
{:
|
116
|
+
{ content: "Child 01 page." },
|
117
|
+
{ content: "Child 02 page." },
|
118
|
+
{ content: "Child 03 page." }
|
110
119
|
]
|
111
120
|
expect(actual).to match expected
|
112
121
|
end
|
@@ -114,15 +123,16 @@ describe 'Yasuri' do
|
|
114
123
|
it 'scrape with interval for each request' do
|
115
124
|
allow(Kernel).to receive(:sleep)
|
116
125
|
|
117
|
-
root_node = Yasuri::LinksNode.new(
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
126
|
+
root_node = Yasuri::LinksNode.new(
|
127
|
+
'/html/body/a', "root", [
|
128
|
+
Yasuri::TextNode.new('/html/body/p', "content")
|
129
|
+
]
|
130
|
+
)
|
131
|
+
actual = root_node.scrape(uri, interval_ms: 100)
|
122
132
|
expect(actual.size).to match 3
|
123
133
|
|
124
134
|
# request will be run 4(1+3) times because root page will be requested
|
125
|
-
expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
|
135
|
+
expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
|
126
136
|
expect(interval_sec).to match 0.1
|
127
137
|
end
|
128
138
|
end
|