yasuri 2.0.12 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,15 +7,50 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt: {})
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
+ def scrape(uri, opt = {})
15
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
16
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
17
+
18
+ agent = Mechanize.new
19
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
20
+ scrape_with_agent(uri, agent, opt)
21
+ end
22
+
23
+ def scrape_with_agent(uri, agent, opt = {})
24
+ page = agent.get(uri)
25
+ inject(agent, page, opt)
26
+ end
27
+
14
28
  def inject(agent, page, opt = {}, element = page)
15
- fail "#{Kernel.__method__} is not implemented."
29
+ fail "#{Kernel.__method__} is not implemented in included class."
30
+ end
31
+
32
+ def to_h
33
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
34
+
35
+ node_hash = {}
36
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
37
+
38
+ node_hash[:path] = @xpath if @xpath
39
+
40
+ children.each do |child|
41
+ child_node_name = "#{child.node_type_str}_#{child.name}"
42
+ node_hash[child_node_name] = child.to_h
43
+ end
44
+
45
+ node_hash
16
46
  end
47
+
17
48
  def opts
18
49
  {}
19
50
  end
51
+
52
+ def node_type_str
53
+ fail "#{Kernel.__method__} is not implemented in included class."
54
+ end
20
55
  end
21
56
  end
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
6
6
  require_relative 'yasuri_struct_node'
7
7
  require_relative 'yasuri_links_node'
8
8
  require_relative 'yasuri_paginate_node'
9
+ require_relative 'yasuri_map_node'
9
10
 
10
11
  module Yasuri
11
12
  class NodeGenerator
@@ -15,29 +16,33 @@ module Yasuri
15
16
  @nodes
16
17
  end
17
18
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
19
+ def method_missing(name, pattern=nil, **args, &block)
20
+ node = NodeGenerator.gen(name, pattern, **args, &block)
20
21
  raise "Undefined Node Name '#{name}'" if node == nil
21
22
  @nodes << node
22
23
  end
23
24
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
25
+ def self.gen(method_name, xpath, **opt, &block)
27
26
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
28
27
 
29
- case name
28
+ case method_name
30
29
  when /^text_(.+)$/
31
- Yasuri::TextNode.new(xpath, $1, children || [], *opt)
30
+ # Todo raise error xpath is not valid
31
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
32
32
  when /^struct_(.+)$/
33
- Yasuri::StructNode.new(xpath, $1, children || [], *opt)
33
+ # Todo raise error xpath is not valid
34
+ Yasuri::StructNode.new(xpath, $1, children || [], **opt)
34
35
  when /^links_(.+)$/
35
- Yasuri::LinksNode.new(xpath, $1, children || [], *opt)
36
+ # Todo raise error xpath is not valid
37
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
36
38
  when /^pages_(.+)$/
37
- Yasuri::PaginateNode.new(xpath, $1, children || [], *opt)
39
+ # Todo raise error xpath is not valid
40
+ Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
41
+ when /^map_(.+)$/
42
+ Yasuri::MapNode.new($1, children, **opt)
38
43
  else
39
44
  nil
40
45
  end
41
- end # of self.gen(name, *args, &block)
46
+ end # of self.gen(method_name, xpath, **opt, &block)
42
47
  end # of class NodeGenerator
43
48
  end
@@ -14,7 +14,8 @@ module Yasuri
14
14
  end
15
15
 
16
16
  def inject(agent, page, opt = {}, element = page)
17
- retry_count = opt[:retry_count] || 5
17
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
18
19
 
19
20
  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
21
 
@@ -22,16 +23,16 @@ module Yasuri
22
23
  limit = @limit.nil? ? Float::MAX : @limit
23
24
  while page
24
25
  child_results_kv = @children.map do |child_node|
25
- child_name = Yasuri.NodeName(child_node.name, opt)
26
+ child_name = Yasuri.node_name(child_node.name, opt)
26
27
  [child_name, child_node.inject(agent, page, opt)]
27
28
  end
28
29
  child_results << Hash[child_results_kv]
29
30
 
30
- link = page.search(@xpath).first
31
+ link = page.search(@xpath).first # Todo raise: link is not found
31
32
  break if link == nil
32
33
 
33
34
  link_button = Mechanize::Page::Link.new(link, agent, page)
34
- page = Yasuri.with_retry(retry_count) { link_button.click }
35
+ page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
35
36
  break if (limit -= 1) <= 0
36
37
  end
37
38
 
@@ -41,8 +42,13 @@ module Yasuri
41
42
 
42
43
  child_results
43
44
  end
45
+
44
46
  def opts
45
47
  {limit:@limit, flatten:@flatten}
46
48
  end
49
+
50
+ def node_type_str
51
+ "pages".freeze
52
+ end
47
53
  end
48
54
  end
@@ -10,12 +10,16 @@ module Yasuri
10
10
  sub_tags = element.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
- child_name = Yasuri.NodeName(child_node.name, opt)
13
+ child_name = Yasuri.node_name(child_node.name, opt)
14
14
  [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
18
18
  tree.size == 1 ? tree.first : tree
19
19
  end # inject
20
+
21
+ def node_type_str
22
+ "struct".freeze
23
+ end
20
24
  end
21
25
  end
@@ -7,15 +7,17 @@ module Yasuri
7
7
  class TextNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], truncate: nil, proc:nil)
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  super(xpath, name, children)
12
12
 
13
+ truncate = opt[:truncate]
14
+ proc = opt[:proc]
15
+
13
16
  truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
14
17
  @truncate = truncate
15
18
  @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
16
19
 
17
20
  @proc = proc.nil? ? nil : proc.to_sym
18
-
19
21
  end
20
22
 
21
23
  def inject(agent, page, opt = {}, element = page)
@@ -28,11 +30,16 @@ module Yasuri
28
30
  end
29
31
 
30
32
  text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
33
+
31
34
  text
32
35
  end
33
36
 
34
37
  def opts
35
38
  {truncate:@truncate, proc:@proc}
36
39
  end
40
+
41
+ def node_type_str
42
+ "text".freeze
43
+ end
37
44
  end
38
45
  end
@@ -0,0 +1,8 @@
1
+ {
2
+ "pages_root": {
3
+ "path": "/html/body/nav/span/a[@class='next']",
4
+ "limit": 10,
5
+ "flatten": false,
6
+ "text_content": "/html/body/p"
7
+ }
8
+ }
@@ -0,0 +1,5 @@
1
+ pages_root:
2
+ path: "/html/body/nav/span/a[@class='next']"
3
+ limit: 10
4
+ flatten: false
5
+ text_content: "/html/body/p"
@@ -0,0 +1,9 @@
1
+ {
2
+ ,,,
3
+ "pages_root": {
4
+ "path": "/html/body/nav/span/a[@class='next']",
5
+ "limit": 10,
6
+ "flatten": false,
7
+ "text_content": "/html/body/p"
8
+ }
9
+ }
@@ -0,0 +1,6 @@
1
+ ,,,
2
+ pages_root:
3
+ path: "/html/body/nav/span/a[@class='next']"
4
+ limit: 10
5
+ flatten: false
6
+ text_content: "/html/body/p"
data/spec/spec_helper.rb CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
12
12
  }
13
13
  end
14
14
 
15
-
16
- # ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
17
- # require "codeclimate-test-reporter"
18
- # CodeClimate::TestReporter.start
19
-
20
15
  require 'simplecov'
21
16
  require 'coveralls'
22
17
  Coveralls.wear!
23
18
 
24
- SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
19
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
25
20
  SimpleCov::Formatter::HTMLFormatter,
26
21
  Coveralls::SimpleCov::Formatter
27
22
  ]
@@ -31,8 +26,8 @@ SimpleCov.start
31
26
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
32
27
  require 'yasuri'
33
28
 
34
- def compare_generated_vs_original(generated, original, page)
35
- expected = original.inject(@agent, page)
36
- actual = generated.inject(@agent, page)
29
+ def compare_generated_vs_original(generated, original, uri)
30
+ expected = original.scrape(uri)
31
+ actual = generated.scrape(uri)
37
32
  expect(actual).to match expected
38
33
  end
@@ -0,0 +1,96 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ before do
7
+ @agent = Mechanize.new
8
+ @index_page = @agent.get(uri)
9
+
10
+ @res_dir = File.expand_path('../cli_resources', __FILE__)
11
+ end
12
+
13
+ describe 'cli scrape' do
14
+ it "require --file or --json option" do
15
+ expect {
16
+ Yasuri::CLI.new.invoke(:scrape, [uri], {})
17
+ }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
18
+ end
19
+
20
+ it "only one of --file or --json option" do
21
+ expect {
22
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
23
+ }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
24
+ end
25
+
26
+ it "require --file option is not empty string" do
27
+ expect {
28
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
29
+ }.to output("ERROR: --file option require not empty argument.\n").to_stderr
30
+ end
31
+
32
+ it "require --json option is not empty string" do
33
+ expect {
34
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
35
+ }.to output("ERROR: --json option require not empty argument.\n").to_stderr
36
+ end
37
+
38
+
39
+ it "display text node as simple string" do
40
+ expect {
41
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
42
+ }.to output("Yasuri Test\n").to_stdout
43
+ end
44
+
45
+ it "display texts in single json" do
46
+ expect {
47
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
48
+ }.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
49
+ end
50
+
51
+
52
+ it "display text node as simple string via json file" do
53
+ expect {
54
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
55
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
56
+ end
57
+ it "display text node as simple string via yaml file" do
58
+ expect {
59
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
60
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
61
+ end
62
+
63
+ it "interval option is effect for each request" do
64
+ allow(Kernel).to receive(:sleep)
65
+
66
+ Yasuri::CLI.new.invoke(
67
+ :scrape,
68
+ [uri+"/pagination/page01.html"],
69
+ {file: "#{@res_dir}/tree.yml", interval: 500}
70
+ )
71
+
72
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
73
+ expect(interval_sec).to match 0.5
74
+ end
75
+ end
76
+
77
+ it "display ERROR when json string is wrong" do
78
+ wrong_json = '{,,}'
79
+ expect {
80
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
81
+ }.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
82
+ end
83
+ it "display ERROR when json file contains is wrong" do
84
+ file_path = "#{@res_dir}/tree_wrong.json"
85
+ expect {
86
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
87
+ }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
88
+ end
89
+ it "display ERROR when yaml file contains is wrong" do
90
+ file_path = "#{@res_dir}/tree_wrong.yml"
91
+ expect {
92
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
93
+ }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
94
+ end
95
+ end
96
+ end
@@ -11,9 +11,7 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::LinksNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri
16
- @index_page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it 'scrape links' do
@@ -21,7 +19,7 @@ describe 'Yasuri' do
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
21
 
24
- actual = root_node.inject(@agent, @index_page)
22
+ actual = root_node.scrape(@uri)
25
23
  expected = [
26
24
  {"content" => "Child 01 page."},
27
25
  {"content" => "Child 02 page."},
@@ -36,7 +34,7 @@ describe 'Yasuri' do
36
34
  Yasuri::TextNode.new('/html/body/p', "content"),
37
35
  ])
38
36
 
39
- actual = root_node.inject(@agent, @index_page)
37
+ actual = root_node.scrape(@uri)
40
38
  expect(actual).to be_empty
41
39
  end
42
40
 
@@ -47,7 +45,7 @@ describe 'Yasuri' do
47
45
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
48
46
  ]),
49
47
  ])
50
- actual = root_node.inject(@agent, @index_page)
48
+ actual = root_node.scrape(@uri)
51
49
  expected = [
52
50
  {"content" => "Child 01 page.",
53
51
  "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
@@ -59,10 +57,18 @@ describe 'Yasuri' do
59
57
  ]
60
58
  expect(actual).to match expected
61
59
  end
62
- it 'can be defined by DSL, return single LinkNode title' do
63
- generated = Yasuri.links_title '/html/body/a'
64
- original = Yasuri::LinksNode.new('/html/body/a', "title")
65
- compare_generated_vs_original(generated, original, @index_page)
60
+ it 'can be defined by DSL, return no contains if no child node' do
61
+ root_node = Yasuri.links_title '/html/body/a'
62
+ actual = root_node.scrape(@uri)
63
+ expected = [{}, {}, {}] # Empty if no child node under links node.
64
+ expect(actual).to match expected
65
+ end
66
+
67
+ it 'can be defined return no contains if no child node' do
68
+ root_node = Yasuri::LinksNode.new('/html/body/a', "title")
69
+ actual = root_node.scrape(@uri)
70
+ expected = [{}, {}, {}] # Empty if no child node under links node.
71
+ expect(actual).to match expected
66
72
  end
67
73
  it 'can be defined by DSL, return nested contents under link' do
68
74
  generated = Yasuri.links_title '/html/body/a' do
@@ -71,7 +77,7 @@ describe 'Yasuri' do
71
77
  original = Yasuri::LinksNode.new('/html/body/a', "root", [
72
78
  Yasuri::TextNode.new('/html/body/p', "name"),
73
79
  ])
74
- compare_generated_vs_original(generated, original, @index_page)
80
+ compare_generated_vs_original(generated, original, @uri)
75
81
  end
76
82
 
77
83
  it 'can be defined by DSL, return recursive links node' do
@@ -88,7 +94,7 @@ describe 'Yasuri' do
88
94
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
89
95
  ]),
90
96
  ])
91
- compare_generated_vs_original(generated, original, @index_page)
97
+ compare_generated_vs_original(generated, original, @uri)
92
98
  end
93
99
 
94
100
  it 'return child node as symbol' do
@@ -96,7 +102,7 @@ describe 'Yasuri' do
96
102
  Yasuri::TextNode.new('/html/body/p', "content"),
97
103
  ])
98
104
 
99
- actual = root_node.inject(@agent, @index_page, symbolize_names: true )
105
+ actual = root_node.scrape(@uri, symbolize_names: true )
100
106
  expected = [
101
107
  {:content => "Child 01 page."},
102
108
  {:content => "Child 02 page."},
@@ -104,5 +110,21 @@ describe 'Yasuri' do
104
110
  ]
105
111
  expect(actual).to match expected
106
112
  end
113
+
114
+ it 'scrape with interval for each request' do
115
+ allow(Kernel).to receive(:sleep)
116
+
117
+ root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
118
+ Yasuri::TextNode.new('/html/body/p', "content"),
119
+ ])
120
+ actual = root_node.scrape(@uri, interval_ms: 100)
121
+
122
+ expect(actual.size).to match 3
123
+
124
+ # request will be run 4(1+3) times because root page will be requested
125
+ expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
126
+ expect(interval_sec).to match 0.1
127
+ end
128
+ end
107
129
  end
108
130
  end