yasuri 2.0.12 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,15 +7,50 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt: {})
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
+ def scrape(uri, opt = {})
15
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
16
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
17
+
18
+ agent = Mechanize.new
19
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
20
+ scrape_with_agent(uri, agent, opt)
21
+ end
22
+
23
+ def scrape_with_agent(uri, agent, opt = {})
24
+ page = agent.get(uri)
25
+ inject(agent, page, opt)
26
+ end
27
+
14
28
  def inject(agent, page, opt = {}, element = page)
15
- fail "#{Kernel.__method__} is not implemented."
29
+ fail "#{Kernel.__method__} is not implemented in included class."
30
+ end
31
+
32
+ def to_h
33
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
34
+
35
+ node_hash = {}
36
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
37
+
38
+ node_hash[:path] = @xpath if @xpath
39
+
40
+ children.each do |child|
41
+ child_node_name = "#{child.node_type_str}_#{child.name}"
42
+ node_hash[child_node_name] = child.to_h
43
+ end
44
+
45
+ node_hash
16
46
  end
47
+
17
48
  def opts
18
49
  {}
19
50
  end
51
+
52
+ def node_type_str
53
+ fail "#{Kernel.__method__} is not implemented in included class."
54
+ end
20
55
  end
21
56
  end
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
6
6
  require_relative 'yasuri_struct_node'
7
7
  require_relative 'yasuri_links_node'
8
8
  require_relative 'yasuri_paginate_node'
9
+ require_relative 'yasuri_map_node'
9
10
 
10
11
  module Yasuri
11
12
  class NodeGenerator
@@ -15,29 +16,33 @@ module Yasuri
15
16
  @nodes
16
17
  end
17
18
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
19
+ def method_missing(name, pattern=nil, **args, &block)
20
+ node = NodeGenerator.gen(name, pattern, **args, &block)
20
21
  raise "Undefined Node Name '#{name}'" if node == nil
21
22
  @nodes << node
22
23
  end
23
24
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
25
+ def self.gen(method_name, xpath, **opt, &block)
27
26
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
28
27
 
29
- case name
28
+ case method_name
30
29
  when /^text_(.+)$/
31
- Yasuri::TextNode.new(xpath, $1, children || [], *opt)
30
+ # TODO: raise an error if xpath is not valid
31
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
32
32
  when /^struct_(.+)$/
33
- Yasuri::StructNode.new(xpath, $1, children || [], *opt)
33
+ # TODO: raise an error if xpath is not valid
34
+ Yasuri::StructNode.new(xpath, $1, children || [], **opt)
34
35
  when /^links_(.+)$/
35
- Yasuri::LinksNode.new(xpath, $1, children || [], *opt)
36
+ # TODO: raise an error if xpath is not valid
37
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
36
38
  when /^pages_(.+)$/
37
- Yasuri::PaginateNode.new(xpath, $1, children || [], *opt)
39
+ # TODO: raise an error if xpath is not valid
40
+ Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
41
+ when /^map_(.+)$/
42
+ Yasuri::MapNode.new($1, children, **opt)
38
43
  else
39
44
  nil
40
45
  end
41
- end # of self.gen(name, *args, &block)
46
+ end # of self.gen(method_name, xpath, **opt, &block)
42
47
  end # of class NodeGenerator
43
48
  end
@@ -14,7 +14,8 @@ module Yasuri
14
14
  end
15
15
 
16
16
  def inject(agent, page, opt = {}, element = page)
17
- retry_count = opt[:retry_count] || 5
17
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
18
19
 
19
20
  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
21
 
@@ -22,16 +23,16 @@ module Yasuri
22
23
  limit = @limit.nil? ? Float::MAX : @limit
23
24
  while page
24
25
  child_results_kv = @children.map do |child_node|
25
- child_name = Yasuri.NodeName(child_node.name, opt)
26
+ child_name = Yasuri.node_name(child_node.name, opt)
26
27
  [child_name, child_node.inject(agent, page, opt)]
27
28
  end
28
29
  child_results << Hash[child_results_kv]
29
30
 
30
- link = page.search(@xpath).first
31
+ link = page.search(@xpath).first # TODO: raise an error when the link is not found
31
32
  break if link == nil
32
33
 
33
34
  link_button = Mechanize::Page::Link.new(link, agent, page)
34
- page = Yasuri.with_retry(retry_count) { link_button.click }
35
+ page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
35
36
  break if (limit -= 1) <= 0
36
37
  end
37
38
 
@@ -41,8 +42,13 @@ module Yasuri
41
42
 
42
43
  child_results
43
44
  end
45
+
44
46
  def opts
45
47
  {limit:@limit, flatten:@flatten}
46
48
  end
49
+
50
+ def node_type_str
51
+ "pages".freeze
52
+ end
47
53
  end
48
54
  end
@@ -10,12 +10,16 @@ module Yasuri
10
10
  sub_tags = element.search(@xpath)
11
11
  tree = sub_tags.map do |sub_tag|
12
12
  child_results_kv = @children.map do |child_node|
13
- child_name = Yasuri.NodeName(child_node.name, opt)
13
+ child_name = Yasuri.node_name(child_node.name, opt)
14
14
  [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
15
  end
16
16
  Hash[child_results_kv]
17
17
  end
18
18
  tree.size == 1 ? tree.first : tree
19
19
  end # inject
20
+
21
+ def node_type_str
22
+ "struct".freeze
23
+ end
20
24
  end
21
25
  end
@@ -7,15 +7,17 @@ module Yasuri
7
7
  class TextNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], truncate: nil, proc:nil)
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  super(xpath, name, children)
12
12
 
13
+ truncate = opt[:truncate]
14
+ proc = opt[:proc]
15
+
13
16
  truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
14
17
  @truncate = truncate
15
18
  @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
16
19
 
17
20
  @proc = proc.nil? ? nil : proc.to_sym
18
-
19
21
  end
20
22
 
21
23
  def inject(agent, page, opt = {}, element = page)
@@ -28,11 +30,16 @@ module Yasuri
28
30
  end
29
31
 
30
32
  text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
33
+
31
34
  text
32
35
  end
33
36
 
34
37
  def opts
35
38
  {truncate:@truncate, proc:@proc}
36
39
  end
40
+
41
+ def node_type_str
42
+ "text".freeze
43
+ end
37
44
  end
38
45
  end
@@ -0,0 +1,8 @@
1
+ {
2
+ "pages_root": {
3
+ "path": "/html/body/nav/span/a[@class='next']",
4
+ "limit": 10,
5
+ "flatten": false,
6
+ "text_content": "/html/body/p"
7
+ }
8
+ }
@@ -0,0 +1,5 @@
1
+ pages_root:
2
+ path: "/html/body/nav/span/a[@class='next']"
3
+ limit: 10
4
+ flatten: false
5
+ text_content: "/html/body/p"
@@ -0,0 +1,9 @@
1
+ {
2
+ ,,,
3
+ "pages_root": {
4
+ "path": "/html/body/nav/span/a[@class='next']",
5
+ "limit": 10,
6
+ "flatten": false,
7
+ "text_content": "/html/body/p"
8
+ }
9
+ }
@@ -0,0 +1,6 @@
1
+ ,,,
2
+ pages_root:
3
+ path: "/html/body/nav/span/a[@class='next']"
4
+ limit: 10
5
+ flatten: false
6
+ text_content: "/html/body/p"
data/spec/spec_helper.rb CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
12
12
  }
13
13
  end
14
14
 
15
-
16
- # ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
17
- # require "codeclimate-test-reporter"
18
- # CodeClimate::TestReporter.start
19
-
20
15
  require 'simplecov'
21
16
  require 'coveralls'
22
17
  Coveralls.wear!
23
18
 
24
- SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
19
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
25
20
  SimpleCov::Formatter::HTMLFormatter,
26
21
  Coveralls::SimpleCov::Formatter
27
22
  ]
@@ -31,8 +26,8 @@ SimpleCov.start
31
26
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
32
27
  require 'yasuri'
33
28
 
34
- def compare_generated_vs_original(generated, original, page)
35
- expected = original.inject(@agent, page)
36
- actual = generated.inject(@agent, page)
29
+ def compare_generated_vs_original(generated, original, uri)
30
+ expected = original.scrape(uri)
31
+ actual = generated.scrape(uri)
37
32
  expect(actual).to match expected
38
33
  end
@@ -0,0 +1,96 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ before do
7
+ @agent = Mechanize.new
8
+ @index_page = @agent.get(uri)
9
+
10
+ @res_dir = File.expand_path('../cli_resources', __FILE__)
11
+ end
12
+
13
+ describe 'cli scrape' do
14
+ it "require --file or --json option" do
15
+ expect {
16
+ Yasuri::CLI.new.invoke(:scrape, [uri], {})
17
+ }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
18
+ end
19
+
20
+ it "only one of --file or --json option" do
21
+ expect {
22
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
23
+ }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
24
+ end
25
+
26
+ it "require --file option is not empty string" do
27
+ expect {
28
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
29
+ }.to output("ERROR: --file option require not empty argument.\n").to_stderr
30
+ end
31
+
32
+ it "require --json option is not empty string" do
33
+ expect {
34
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
35
+ }.to output("ERROR: --json option require not empty argument.\n").to_stderr
36
+ end
37
+
38
+
39
+ it "display text node as simple string" do
40
+ expect {
41
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
42
+ }.to output("Yasuri Test\n").to_stdout
43
+ end
44
+
45
+ it "display texts in single json" do
46
+ expect {
47
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
48
+ }.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
49
+ end
50
+
51
+
52
+ it "display text node as simple string via json file" do
53
+ expect {
54
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
55
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
56
+ end
57
+ it "display text node as simple string via yaml file" do
58
+ expect {
59
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
60
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
61
+ end
62
+
63
+ it "interval option is effect for each request" do
64
+ allow(Kernel).to receive(:sleep)
65
+
66
+ Yasuri::CLI.new.invoke(
67
+ :scrape,
68
+ [uri+"/pagination/page01.html"],
69
+ {file: "#{@res_dir}/tree.yml", interval: 500}
70
+ )
71
+
72
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
73
+ expect(interval_sec).to match 0.5
74
+ end
75
+ end
76
+
77
+ it "display ERROR when json string is wrong" do
78
+ wrong_json = '{,,}'
79
+ expect {
80
+ Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
81
+ }.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
82
+ end
83
+ it "display ERROR when json file contains is wrong" do
84
+ file_path = "#{@res_dir}/tree_wrong.json"
85
+ expect {
86
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
87
+ }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
88
+ end
89
+ it "display ERROR when yaml file contains is wrong" do
90
+ file_path = "#{@res_dir}/tree_wrong.yml"
91
+ expect {
92
+ Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
93
+ }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
94
+ end
95
+ end
96
+ end
@@ -11,9 +11,7 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::LinksNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri
16
- @index_page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it 'scrape links' do
@@ -21,7 +19,7 @@ describe 'Yasuri' do
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
21
 
24
- actual = root_node.inject(@agent, @index_page)
22
+ actual = root_node.scrape(@uri)
25
23
  expected = [
26
24
  {"content" => "Child 01 page."},
27
25
  {"content" => "Child 02 page."},
@@ -36,7 +34,7 @@ describe 'Yasuri' do
36
34
  Yasuri::TextNode.new('/html/body/p', "content"),
37
35
  ])
38
36
 
39
- actual = root_node.inject(@agent, @index_page)
37
+ actual = root_node.scrape(@uri)
40
38
  expect(actual).to be_empty
41
39
  end
42
40
 
@@ -47,7 +45,7 @@ describe 'Yasuri' do
47
45
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
48
46
  ]),
49
47
  ])
50
- actual = root_node.inject(@agent, @index_page)
48
+ actual = root_node.scrape(@uri)
51
49
  expected = [
52
50
  {"content" => "Child 01 page.",
53
51
  "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
@@ -59,10 +57,18 @@ describe 'Yasuri' do
59
57
  ]
60
58
  expect(actual).to match expected
61
59
  end
62
- it 'can be defined by DSL, return single LinkNode title' do
63
- generated = Yasuri.links_title '/html/body/a'
64
- original = Yasuri::LinksNode.new('/html/body/a', "title")
65
- compare_generated_vs_original(generated, original, @index_page)
60
+ it 'can be defined by DSL, return no contains if no child node' do
61
+ root_node = Yasuri.links_title '/html/body/a'
62
+ actual = root_node.scrape(@uri)
63
+ expected = [{}, {}, {}] # Empty if no child node under links node.
64
+ expect(actual).to match expected
65
+ end
66
+
67
+ it 'can be defined return no contains if no child node' do
68
+ root_node = Yasuri::LinksNode.new('/html/body/a', "title")
69
+ actual = root_node.scrape(@uri)
70
+ expected = [{}, {}, {}] # Empty if no child node under links node.
71
+ expect(actual).to match expected
66
72
  end
67
73
  it 'can be defined by DSL, return nested contents under link' do
68
74
  generated = Yasuri.links_title '/html/body/a' do
@@ -71,7 +77,7 @@ describe 'Yasuri' do
71
77
  original = Yasuri::LinksNode.new('/html/body/a', "root", [
72
78
  Yasuri::TextNode.new('/html/body/p', "name"),
73
79
  ])
74
- compare_generated_vs_original(generated, original, @index_page)
80
+ compare_generated_vs_original(generated, original, @uri)
75
81
  end
76
82
 
77
83
  it 'can be defined by DSL, return recursive links node' do
@@ -88,7 +94,7 @@ describe 'Yasuri' do
88
94
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
89
95
  ]),
90
96
  ])
91
- compare_generated_vs_original(generated, original, @index_page)
97
+ compare_generated_vs_original(generated, original, @uri)
92
98
  end
93
99
 
94
100
  it 'return child node as symbol' do
@@ -96,7 +102,7 @@ describe 'Yasuri' do
96
102
  Yasuri::TextNode.new('/html/body/p', "content"),
97
103
  ])
98
104
 
99
- actual = root_node.inject(@agent, @index_page, symbolize_names: true )
105
+ actual = root_node.scrape(@uri, symbolize_names: true )
100
106
  expected = [
101
107
  {:content => "Child 01 page."},
102
108
  {:content => "Child 02 page."},
@@ -104,5 +110,21 @@ describe 'Yasuri' do
104
110
  ]
105
111
  expect(actual).to match expected
106
112
  end
113
+
114
+ it 'scrape with interval for each request' do
115
+ allow(Kernel).to receive(:sleep)
116
+
117
+ root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
118
+ Yasuri::TextNode.new('/html/body/p', "content"),
119
+ ])
120
+ actual = root_node.scrape(@uri, interval_ms: 100)
121
+
122
+ expect(actual.size).to match 3
123
+
124
+ # 4 requests (1 + 3) are made in total: 1 for the root page plus 3 for the linked pages
125
+ expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
126
+ expect(interval_sec).to match 0.1
127
+ end
128
+ end
107
129
  end
108
130
  end