yasuri 3.3.0 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
  require_relative 'yasuri_text_node'
6
4
  require_relative 'yasuri_struct_node'
@@ -23,7 +21,7 @@ module Yasuri
23
21
  end
24
22
 
25
23
  def self.gen(method_name, xpath, **opt, &block)
26
- children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
24
+ children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
27
25
 
28
26
  case method_name
29
27
  when /^text_(.+)$/
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -14,13 +12,31 @@ module Yasuri
14
12
  end
15
13
 
16
14
  def inject(agent, page, opt = {}, element = page)
15
+ raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
16
+
17
+ limit = @limit.nil? ? Float::MAX : @limit
18
+ child_results = inject_child(agent, page, limit, opt)
19
+
20
+ return child_results.map(&:values).flatten if @flatten == true
21
+
22
+ child_results
23
+ end
24
+
25
+ def opts
26
+ { limit: @limit, flatten: @flatten }
27
+ end
28
+
29
+ def node_type_str
30
+ "pages".freeze
31
+ end
32
+
33
+ private
34
+
35
+ def inject_child(agent, page, limit, opt)
17
36
  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
37
  interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
19
38
 
20
- raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
21
-
22
39
  child_results = []
23
- limit = @limit.nil? ? Float::MAX : @limit
24
40
  while page
25
41
  child_results_kv = @children.map do |child_node|
26
42
  child_name = Yasuri.node_name(child_node.name, opt)
@@ -29,26 +45,14 @@ module Yasuri
29
45
  child_results << Hash[child_results_kv]
30
46
 
31
47
  link = page.search(@xpath).first # Todo raise: link is not found
32
- break if link == nil
48
+ break if link.nil?
33
49
 
34
50
  link_button = Mechanize::Page::Link.new(link, agent, page)
35
51
  page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
36
52
  break if (limit -= 1) <= 0
37
53
  end
38
54
 
39
- if @flatten == true
40
- return child_results.map{|h| h.values}.flatten
41
- end
42
-
43
55
  child_results
44
56
  end
45
-
46
- def opts
47
- {limit:@limit, flatten:@flatten}
48
- end
49
-
50
- def node_type_str
51
- "pages".freeze
52
- end
53
57
  end
54
58
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -16,7 +14,7 @@ module Yasuri
16
14
  Hash[child_results_kv]
17
15
  end
18
16
  tree.size == 1 ? tree.first : tree
19
- end # inject
17
+ end
20
18
 
21
19
  def node_type_str
22
20
  "struct".freeze
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -13,14 +11,14 @@ module Yasuri
13
11
  truncate = opt[:truncate]
14
12
  proc = opt[:proc]
15
13
 
16
- truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
14
+ truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
17
15
  @truncate = truncate
18
- @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
16
+ @truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
19
17
 
20
18
  @proc = proc.nil? ? nil : proc.to_sym
21
19
  end
22
20
 
23
- def inject(agent, page, opt = {}, element = page)
21
+ def inject(_agent, page, _opt = {}, element = page)
24
22
  node = element.search(@xpath)
25
23
  text = node.text.to_s
26
24
 
@@ -35,7 +33,7 @@ module Yasuri
35
33
  end
36
34
 
37
35
  def opts
38
- {truncate:@truncate, proc:@proc}
36
+ { truncate: @truncate, proc: @proc }
39
37
  end
40
38
 
41
39
  def node_type_str
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
 
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
  Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
@@ -2,95 +2,113 @@ require_relative 'spec_helper'
2
2
 
3
3
  describe 'Yasuri' do
4
4
  include_context 'httpserver'
5
-
6
- before do
7
- @agent = Mechanize.new
8
- @index_page = @agent.get(uri)
9
-
10
- @res_dir = File.expand_path('../cli_resources', __FILE__)
11
- end
5
+ let(:res_dir) { File.expand_path('cli_resources', __dir__) }
12
6
 
13
7
  describe 'cli scrape' do
14
- it "require --file or --json option" do
15
- expect {
8
+ it 'require --file or --json option' do
9
+ expect do
16
10
  Yasuri::CLI.new.invoke(:scrape, [uri], {})
17
- }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
11
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
18
12
  end
19
13
 
20
- it "only one of --file or --json option" do
21
- expect {
22
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
23
- }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
14
+ it 'only one of --file or --json option' do
15
+ expect do
16
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
17
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
24
18
  end
25
19
 
26
- it "require --file option is not empty string" do
27
- expect {
28
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
29
- }.to output("ERROR: --file option require not empty argument.\n").to_stderr
20
+ it 'require --file option is not empty string' do
21
+ expect do
22
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
23
+ end.to output("ERROR: --file option require not empty argument.\n").to_stderr
30
24
  end
31
25
 
32
- it "require --json option is not empty string" do
33
- expect {
34
- Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
35
- }.to output("ERROR: --json option require not empty argument.\n").to_stderr
26
+ it 'require --json option is not empty string' do
27
+ expect do
28
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
29
+ end.to output("ERROR: --json option require not empty argument.\n").to_stderr
36
30
  end
37
31
 
38
-
39
- it "display text node as simple string" do
40
- expect {
41
- Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
42
- }.to output("Yasuri Test\n").to_stdout
32
+ it 'display text node as simple string' do
33
+ expect do
34
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
35
+ end.to output("Yasuri Test\n").to_stdout
43
36
  end
44
37
 
45
- it "display texts in single json" do
46
- expect {
47
- Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
48
- }.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
38
+ it 'display texts in single json' do
39
+ expect do
40
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
41
+ end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
49
42
  end
50
43
 
51
-
52
- it "display text node as simple string via json file" do
53
- expect {
54
- Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
55
- }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
44
+ it 'display text node as simple string via json file' do
45
+ expect do
46
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
47
+ end.to output(
48
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
49
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
50
+ "\n"
51
+ ).to_stdout
56
52
  end
57
- it "display text node as simple string via yaml file" do
58
- expect {
59
- Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
60
- }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
53
+
54
+ it 'display text node as simple string via yaml file' do
55
+ expect do
56
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
57
+ end.to output(
58
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
59
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
60
+ "\n"
61
+ ).to_stdout
61
62
  end
62
63
 
63
- it "interval option is effect for each request" do
64
+ it 'interval option is effect for each request' do
64
65
  allow(Kernel).to receive(:sleep)
65
66
 
66
- Yasuri::CLI.new.invoke(
67
- :scrape,
68
- [uri+"/pagination/page01.html"],
69
- {file: "#{@res_dir}/tree.yml", interval: 500}
70
- )
67
+ expect do
68
+ Yasuri::CLI.new.invoke(
69
+ :scrape,
70
+ ["#{uri}/pagination/page01.html"],
71
+ { file: "#{res_dir}/tree.yml", interval: 500 }
72
+ )
73
+ end.to output(
74
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
75
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
76
+ "\n"
77
+ ).to_stdout
71
78
 
72
79
  expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
73
80
  expect(interval_sec).to match 0.5
74
81
  end
75
82
  end
76
83
 
77
- it "display ERROR when json string is wrong" do
84
+ it 'display ERROR when json string is wrong' do
78
85
  wrong_json = '{,,}'
79
- expect {
80
- Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
81
- }.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
86
+ expect do
87
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
88
+ end.to output(
89
+ 'ERROR: Failed to convert json to yasuri tree. ' \
90
+ "809: unexpected token at '#{wrong_json}'\n"
91
+ ).to_stderr
82
92
  end
83
- it "display ERROR when json file contains is wrong" do
84
- file_path = "#{@res_dir}/tree_wrong.json"
85
- expect {
86
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
87
- }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
93
+
94
+ it 'display ERROR when json file contains is wrong' do
95
+ file_path = "#{res_dir}/tree_wrong.json"
96
+ expect do
97
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
98
+ end.to output(
99
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
100
+ "(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
101
+ ).to_stderr
88
102
  end
89
- it "display ERROR when yaml file contains is wrong" do
90
- file_path = "#{@res_dir}/tree_wrong.yml"
91
- expect {
92
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
93
- }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
103
+
104
+ it 'display ERROR when yaml file contains is wrong' do
105
+ file_path = "#{res_dir}/tree_wrong.yml"
106
+ expect do
107
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
108
+ end.to output(
109
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
110
+ "(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
111
+ ).to_stderr
94
112
  end
95
113
  end
96
- end
114
+ end
@@ -1,83 +1,86 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'spec_helper'
5
3
 
6
- #########
7
- # Links #
8
- #########
9
4
  describe 'Yasuri' do
10
5
  include_context 'httpserver'
11
6
 
12
7
  describe '::LinksNode' do
13
- before do
14
- @uri = uri
15
- end
16
-
17
8
  it 'scrape links' do
18
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
19
- Yasuri::TextNode.new('/html/body/p', "content"),
20
- ])
9
+ root_node = Yasuri::LinksNode.new(
10
+ '/html/body/a', "root", [
11
+ Yasuri::TextNode.new('/html/body/p', "content")
12
+ ]
13
+ )
21
14
 
22
- actual = root_node.scrape(@uri)
15
+ actual = root_node.scrape(uri)
23
16
  expected = [
24
- {"content" => "Child 01 page."},
25
- {"content" => "Child 02 page."},
26
- {"content" => "Child 03 page."},
17
+ { "content" => "Child 01 page." },
18
+ { "content" => "Child 02 page." },
19
+ { "content" => "Child 03 page." }
27
20
  ]
28
21
  expect(actual).to match expected
29
22
  end
30
23
 
31
24
  it 'return empty set if no match node' do
32
25
  missing_xpath = '/html/body/b'
33
- root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
34
- Yasuri::TextNode.new('/html/body/p', "content"),
35
- ])
26
+ root_node = Yasuri::LinksNode.new(
27
+ missing_xpath, "root", [
28
+ Yasuri::TextNode.new('/html/body/p', "content")
29
+ ]
30
+ )
36
31
 
37
- actual = root_node.scrape(@uri)
32
+ actual = root_node.scrape(uri)
38
33
  expect(actual).to be_empty
39
34
  end
40
35
 
41
36
  it 'scrape links, recursive' do
42
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
43
- Yasuri::TextNode.new('/html/body/p', "content"),
44
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
45
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
46
- ]),
47
- ])
48
- actual = root_node.scrape(@uri)
37
+ root_node = Yasuri::LinksNode.new(
38
+ '/html/body/a', "root", [
39
+ Yasuri::TextNode.new('/html/body/p', "content"),
40
+ Yasuri::LinksNode.new(
41
+ '/html/body/ul/li/a', "sub_link", [
42
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
43
+ ]
44
+ )
45
+ ]
46
+ )
47
+ actual = root_node.scrape(uri)
49
48
  expected = [
50
- {"content" => "Child 01 page.",
51
- "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
52
- {"sub_page_title" => "Child 02 SubPage Test"}],},
53
- {"content" => "Child 02 page.",
54
- "sub_link" => [],},
55
- {"content" => "Child 03 page.",
56
- "sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
49
+ { "content" => "Child 01 page.",
50
+ "sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
51
+ { "sub_page_title" => "Child 02 SubPage Test" }] },
52
+ { "content" => "Child 02 page.",
53
+ "sub_link" => [] },
54
+ { "content" => "Child 03 page.",
55
+ "sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
57
56
  ]
58
57
  expect(actual).to match expected
59
58
  end
59
+
60
60
  it 'can be defined by DSL, return no contains if no child node' do
61
61
  root_node = Yasuri.links_title '/html/body/a'
62
- actual = root_node.scrape(@uri)
62
+ actual = root_node.scrape(uri)
63
63
  expected = [{}, {}, {}] # Empty if no child node under links node.
64
64
  expect(actual).to match expected
65
65
  end
66
66
 
67
67
  it 'can be defined return no contains if no child node' do
68
68
  root_node = Yasuri::LinksNode.new('/html/body/a', "title")
69
- actual = root_node.scrape(@uri)
69
+ actual = root_node.scrape(uri)
70
70
  expected = [{}, {}, {}] # Empty if no child node under links node.
71
71
  expect(actual).to match expected
72
72
  end
73
+
73
74
  it 'can be defined by DSL, return nested contents under link' do
74
75
  generated = Yasuri.links_title '/html/body/a' do
75
- text_name '/html/body/p'
76
- end
77
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
78
- Yasuri::TextNode.new('/html/body/p', "name"),
79
- ])
80
- compare_generated_vs_original(generated, original, @uri)
76
+ text_name '/html/body/p'
77
+ end
78
+ original = Yasuri::LinksNode.new(
79
+ '/html/body/a', "root", [
80
+ Yasuri::TextNode.new('/html/body/p', "name")
81
+ ]
82
+ )
83
+ compare_generated_vs_original(generated, original, uri)
81
84
  end
82
85
 
83
86
  it 'can be defined by DSL, return recursive links node' do
@@ -88,25 +91,31 @@ describe 'Yasuri' do
88
91
  end
89
92
  end
90
93
 
91
- original = Yasuri::LinksNode.new('/html/body/a', "root", [
92
- Yasuri::TextNode.new('/html/body/p', "content"),
93
- Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
94
- Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
95
- ]),
96
- ])
97
- compare_generated_vs_original(generated, original, @uri)
94
+ original = Yasuri::LinksNode.new(
95
+ '/html/body/a', "root", [
96
+ Yasuri::TextNode.new('/html/body/p', "content"),
97
+ Yasuri::LinksNode.new(
98
+ '/html/body/ul/li/a', "sub_link", [
99
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title")
100
+ ]
101
+ )
102
+ ]
103
+ )
104
+ compare_generated_vs_original(generated, original, uri)
98
105
  end
99
106
 
100
107
  it 'return child node as symbol' do
101
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
102
- Yasuri::TextNode.new('/html/body/p', "content"),
103
- ])
108
+ root_node = Yasuri::LinksNode.new(
109
+ '/html/body/a', "root", [
110
+ Yasuri::TextNode.new('/html/body/p', "content")
111
+ ]
112
+ )
104
113
 
105
- actual = root_node.scrape(@uri, symbolize_names: true )
114
+ actual = root_node.scrape(uri, symbolize_names: true)
106
115
  expected = [
107
- {:content => "Child 01 page."},
108
- {:content => "Child 02 page."},
109
- {:content => "Child 03 page."},
116
+ { content: "Child 01 page." },
117
+ { content: "Child 02 page." },
118
+ { content: "Child 03 page." }
110
119
  ]
111
120
  expect(actual).to match expected
112
121
  end
@@ -114,15 +123,16 @@ describe 'Yasuri' do
114
123
  it 'scrape with interval for each request' do
115
124
  allow(Kernel).to receive(:sleep)
116
125
 
117
- root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
118
- Yasuri::TextNode.new('/html/body/p', "content"),
119
- ])
120
- actual = root_node.scrape(@uri, interval_ms: 100)
121
-
126
+ root_node = Yasuri::LinksNode.new(
127
+ '/html/body/a', "root", [
128
+ Yasuri::TextNode.new('/html/body/p', "content")
129
+ ]
130
+ )
131
+ actual = root_node.scrape(uri, interval_ms: 100)
122
132
  expect(actual.size).to match 3
123
133
 
124
134
  # request will be run 4(1+3) times because root page will be requested
125
- expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
135
+ expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
126
136
  expect(interval_sec).to match 0.1
127
137
  end
128
138
  end