yasuri 3.0.0 → 3.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,40 @@
1
+
2
+ module Yasuri
3
+ class MapNode
4
+ include Node
5
+ attr_reader :name, :children
6
+
7
+ def initialize(name, children, **opt)
8
+ @name = name
9
+ @children = children
10
+ @opt = opt
11
+ end
12
+
13
+ def inject(agent, page, opt = {}, _element = page)
14
+ child_results_kv = @children.map do |node|
15
+ [node.name, node.inject(agent, page, opt)]
16
+ end
17
+ Hash[child_results_kv]
18
+ end
19
+
20
+ def to_h
21
+ node_hash = {}
22
+ self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
23
+
24
+ children.each do |child|
25
+ child_node_name = "#{child.node_type_str}_#{child.name}"
26
+ node_hash[child_node_name] = child.to_h
27
+ end
28
+
29
+ node_hash
30
+ end
31
+
32
+ def opts
33
+ {}
34
+ end
35
+
36
+ def node_type_str
37
+ "map".freeze
38
+ end
39
+ end
40
+ end
@@ -1,21 +1,53 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  module Node
8
6
  attr_reader :url, :xpath, :name, :children
9
7
 
10
- def initialize(xpath, name, children = [], opt: {})
8
+ def initialize(xpath, name, children = [], **_opt)
11
9
  @xpath, @name, @children = xpath, name, children
12
10
  end
13
11
 
12
+ def scrape(uri, opt = {})
13
+ agent = Mechanize.new
14
+ scrape_with_agent(uri, agent, opt)
15
+ end
16
+
17
+ def scrape_with_agent(uri, agent, opt = {})
18
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
19
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
20
+
21
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
22
+ inject(agent, page, opt)
23
+ end
24
+
14
25
  def inject(agent, page, opt = {}, element = page)
15
- fail "#{Kernel.__method__} is not implemented."
26
+ fail "#{Kernel.__method__} is not implemented in included class."
16
27
  end
28
+
29
+ def to_h
30
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
31
+
32
+ node_hash = {}
33
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
34
+
35
+ node_hash[:path] = @xpath if @xpath
36
+
37
+ children.each do |child|
38
+ child_node_name = "#{child.node_type_str}_#{child.name}"
39
+ node_hash[child_node_name] = child.to_h
40
+ end
41
+
42
+ node_hash
43
+ end
44
+
17
45
  def opts
18
46
  {}
19
47
  end
48
+
49
+ def node_type_str
50
+ fail "#{Kernel.__method__} is not implemented in included class."
51
+ end
20
52
  end
21
53
  end
@@ -1,11 +1,10 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
  require_relative 'yasuri_text_node'
6
4
  require_relative 'yasuri_struct_node'
7
5
  require_relative 'yasuri_links_node'
8
6
  require_relative 'yasuri_paginate_node'
7
+ require_relative 'yasuri_map_node'
9
8
 
10
9
  module Yasuri
11
10
  class NodeGenerator
@@ -15,27 +14,33 @@ module Yasuri
15
14
  @nodes
16
15
  end
17
16
 
18
- def method_missing(name, pattern, **args, &block)
17
+ def method_missing(name, pattern=nil, **args, &block)
19
18
  node = NodeGenerator.gen(name, pattern, **args, &block)
20
19
  raise "Undefined Node Name '#{name}'" if node == nil
21
20
  @nodes << node
22
21
  end
23
22
 
24
- def self.gen(name, xpath, **opt, &block)
25
- children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
23
+ def self.gen(method_name, xpath, **opt, &block)
24
+ children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
26
25
 
27
- case name
26
+ case method_name
28
27
  when /^text_(.+)$/
29
- Yasuri::TextNode.new(xpath, $1, children || [], **opt)
28
+ # TODO: raise an error if xpath is not valid
29
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
30
30
  when /^struct_(.+)$/
31
+ # TODO: raise an error if xpath is not valid
31
32
  Yasuri::StructNode.new(xpath, $1, children || [], **opt)
32
33
  when /^links_(.+)$/
33
- Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
34
+ # TODO: raise an error if xpath is not valid
35
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
34
36
  when /^pages_(.+)$/
37
+ # TODO: raise an error if xpath is not valid
35
38
  Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
39
+ when /^map_(.+)$/
40
+ Yasuri::MapNode.new($1, children, **opt)
36
41
  else
37
42
  nil
38
43
  end
39
- end # of self.gen(name, *args, &block)
44
+ end # of self.gen(method_name, xpath, **opt, &block)
40
45
  end # of class NodeGenerator
41
46
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -14,35 +12,47 @@ module Yasuri
14
12
  end
15
13
 
16
14
  def inject(agent, page, opt = {}, element = page)
17
- retry_count = opt[:retry_count] || 5
18
-
19
15
  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
16
 
21
- child_results = []
22
17
  limit = @limit.nil? ? Float::MAX : @limit
18
+ child_results = inject_child(agent, page, limit, opt)
19
+
20
+ return child_results.map(&:values).flatten if @flatten == true
21
+
22
+ child_results
23
+ end
24
+
25
+ def opts
26
+ { limit: @limit, flatten: @flatten }
27
+ end
28
+
29
+ def node_type_str
30
+ "pages".freeze
31
+ end
32
+
33
+ private
34
+
35
+ def inject_child(agent, page, limit, opt)
36
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
37
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
38
+
39
+ child_results = []
23
40
  while page
24
41
  child_results_kv = @children.map do |child_node|
25
- child_name = Yasuri.NodeName(child_node.name, opt)
42
+ child_name = Yasuri.node_name(child_node.name, opt)
26
43
  [child_name, child_node.inject(agent, page, opt)]
27
44
  end
28
45
  child_results << Hash[child_results_kv]
29
46
 
30
- link = page.search(@xpath).first
31
- break if link == nil
47
+ link = page.search(@xpath).first # Todo raise: link is not found
48
+ break if link.nil?
32
49
 
33
50
  link_button = Mechanize::Page::Link.new(link, agent, page)
34
- page = Yasuri.with_retry(retry_count) { link_button.click }
51
+ page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
35
52
  break if (limit -= 1) <= 0
36
53
  end
37
54
 
38
- if @flatten == true
39
- return child_results.map{|h| h.values}.flatten
40
- end
41
-
42
55
  child_results
43
56
  end
44
- def opts
45
- {limit:@limit, flatten:@flatten}
46
- end
47
57
  end
48
58
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -10,12 +8,16 @@ module Yasuri
10
8
  sub_tags = element.search(@xpath)
11
9
  tree = sub_tags.map do |sub_tag|
12
10
  child_results_kv = @children.map do |child_node|
13
- child_name = Yasuri.NodeName(child_node.name, opt)
11
+ child_name = Yasuri.node_name(child_node.name, opt)
14
12
  [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
13
  end
16
14
  Hash[child_results_kv]
17
15
  end
18
16
  tree.size == 1 ? tree.first : tree
19
- end # inject
17
+ end
18
+
19
+ def node_type_str
20
+ "struct".freeze
21
+ end
20
22
  end
21
23
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -13,15 +11,14 @@ module Yasuri
13
11
  truncate = opt[:truncate]
14
12
  proc = opt[:proc]
15
13
 
16
- truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
14
+ truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
17
15
  @truncate = truncate
18
- @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
16
+ @truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
19
17
 
20
18
  @proc = proc.nil? ? nil : proc.to_sym
21
-
22
19
  end
23
20
 
24
- def inject(agent, page, opt = {}, element = page)
21
+ def inject(_agent, page, _opt = {}, element = page)
25
22
  node = element.search(@xpath)
26
23
  text = node.text.to_s
27
24
 
@@ -31,11 +28,16 @@ module Yasuri
31
28
  end
32
29
 
33
30
  text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
31
+
34
32
  text
35
33
  end
36
34
 
37
35
  def opts
38
- {truncate:@truncate, proc:@proc}
36
+ { truncate: @truncate, proc: @proc }
37
+ end
38
+
39
+ def node_type_str
40
+ "text".freeze
39
41
  end
40
42
  end
41
43
  end
@@ -0,0 +1,8 @@
1
+ {
2
+ "pages_root": {
3
+ "path": "/html/body/nav/span/a[@class='next']",
4
+ "limit": 10,
5
+ "flatten": false,
6
+ "text_content": "/html/body/p"
7
+ }
8
+ }
@@ -0,0 +1,5 @@
1
+ pages_root:
2
+ path: "/html/body/nav/span/a[@class='next']"
3
+ limit: 10
4
+ flatten: false
5
+ text_content: "/html/body/p"
@@ -0,0 +1,9 @@
1
+ {
2
+ ,,,
3
+ "pages_root": {
4
+ "path": "/html/body/nav/span/a[@class='next']",
5
+ "limit": 10,
6
+ "flatten": false,
7
+ "text_content": "/html/body/p"
8
+ }
9
+ }
@@ -0,0 +1,6 @@
1
+ ,,,
2
+ pages_root:
3
+ path: "/html/body/nav/span/a[@class='next']"
4
+ limit: 10
5
+ flatten: false
6
+ text_content: "/html/body/p"
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
 
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
  Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
@@ -16,7 +14,7 @@ require 'simplecov'
16
14
  require 'coveralls'
17
15
  Coveralls.wear!
18
16
 
19
- SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
17
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
20
18
  SimpleCov::Formatter::HTMLFormatter,
21
19
  Coveralls::SimpleCov::Formatter
22
20
  ]
@@ -26,8 +24,8 @@ SimpleCov.start
26
24
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
27
25
  require 'yasuri'
28
26
 
29
- def compare_generated_vs_original(generated, original, page)
30
- expected = original.inject(@agent, page)
31
- actual = generated.inject(@agent, page)
27
+ def compare_generated_vs_original(generated, original, uri)
28
+ expected = original.scrape(uri)
29
+ actual = generated.scrape(uri)
32
30
  expect(actual).to match expected
33
31
  end
@@ -0,0 +1,114 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+ let(:res_dir) { File.expand_path('cli_resources', __dir__) }
6
+
7
+ describe 'cli scrape' do
8
+ it 'require --file or --json option' do
9
+ expect do
10
+ Yasuri::CLI.new.invoke(:scrape, [uri], {})
11
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
12
+ end
13
+
14
+ it 'only one of --file or --json option' do
15
+ expect do
16
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
17
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
18
+ end
19
+
20
+ it 'require --file option is not empty string' do
21
+ expect do
22
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
23
+ end.to output("ERROR: --file option require not empty argument.\n").to_stderr
24
+ end
25
+
26
+ it 'require --json option is not empty string' do
27
+ expect do
28
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
29
+ end.to output("ERROR: --json option require not empty argument.\n").to_stderr
30
+ end
31
+
32
+ it 'display text node as simple string' do
33
+ expect do
34
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
35
+ end.to output("Yasuri Test\n").to_stdout
36
+ end
37
+
38
+ it 'display texts in single json' do
39
+ expect do
40
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
41
+ end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
42
+ end
43
+
44
+ it 'display text node as simple string via json file' do
45
+ expect do
46
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
47
+ end.to output(
48
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
49
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
50
+ "\n"
51
+ ).to_stdout
52
+ end
53
+
54
+ it 'display text node as simple string via yaml file' do
55
+ expect do
56
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
57
+ end.to output(
58
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
59
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
60
+ "\n"
61
+ ).to_stdout
62
+ end
63
+
64
+ it 'interval option is effect for each request' do
65
+ allow(Kernel).to receive(:sleep)
66
+
67
+ expect do
68
+ Yasuri::CLI.new.invoke(
69
+ :scrape,
70
+ ["#{uri}/pagination/page01.html"],
71
+ { file: "#{res_dir}/tree.yml", interval: 500 }
72
+ )
73
+ end.to output(
74
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
75
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
76
+ "\n"
77
+ ).to_stdout
78
+
79
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
80
+ expect(interval_sec).to match 0.5
81
+ end
82
+ end
83
+
84
+ it 'display ERROR when json string is wrong' do
85
+ wrong_json = '{,,}'
86
+ expect do
87
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
88
+ end.to output(
89
+ 'ERROR: Failed to convert json to yasuri tree. ' \
90
+ "809: unexpected token at '#{wrong_json}'\n"
91
+ ).to_stderr
92
+ end
93
+
94
+ it 'display ERROR when json file contains is wrong' do
95
+ file_path = "#{res_dir}/tree_wrong.json"
96
+ expect do
97
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
98
+ end.to output(
99
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
100
+ "(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
101
+ ).to_stderr
102
+ end
103
+
104
+ it 'display ERROR when yaml file contains is wrong' do
105
+ file_path = "#{res_dir}/tree_wrong.yml"
106
+ expect do
107
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
108
+ end.to output(
109
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
110
+ "(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
111
+ ).to_stderr
112
+ end
113
+ end
114
+ end