yasuri 2.0.13 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+
2
+ module Yasuri
3
+ class MapNode
4
+ include Node
5
+ attr_reader :name, :children
6
+
7
+ def initialize(name, children, **opt)
8
+ @name = name
9
+ @children = children
10
+ @opt = opt
11
+ end
12
+
13
+ def inject(agent, page, opt = {}, _element = page)
14
+ child_results_kv = @children.map do |node|
15
+ [node.name, node.inject(agent, page, opt)]
16
+ end
17
+ Hash[child_results_kv]
18
+ end
19
+
20
+ def to_h
21
+ node_hash = {}
22
+ self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
23
+
24
+ children.each do |child|
25
+ child_node_name = "#{child.node_type_str}_#{child.name}"
26
+ node_hash[child_node_name] = child.to_h
27
+ end
28
+
29
+ node_hash
30
+ end
31
+
32
+ def opts
33
+ {}
34
+ end
35
+
36
+ def node_type_str
37
+ "map".freeze
38
+ end
39
+ end
40
+ end
@@ -1,21 +1,53 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  module Node
8
6
  attr_reader :url, :xpath, :name, :children
9
7
 
10
- def initialize(xpath, name, children = [], opt: {})
8
+ def initialize(xpath, name, children = [], **_opt)
11
9
  @xpath, @name, @children = xpath, name, children
12
10
  end
13
11
 
12
+ def scrape(uri, opt = {})
13
+ agent = Mechanize.new
14
+ scrape_with_agent(uri, agent, opt)
15
+ end
16
+
17
+ def scrape_with_agent(uri, agent, opt = {})
18
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
19
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
20
+
21
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
22
+ inject(agent, page, opt)
23
+ end
24
+
14
25
  def inject(agent, page, opt = {}, element = page)
15
- fail "#{Kernel.__method__} is not implemented."
26
+ fail "#{Kernel.__method__} is not implemented in included class."
16
27
  end
28
+
29
+ def to_h
30
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
31
+
32
+ node_hash = {}
33
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
34
+
35
+ node_hash[:path] = @xpath if @xpath
36
+
37
+ children.each do |child|
38
+ child_node_name = "#{child.node_type_str}_#{child.name}"
39
+ node_hash[child_node_name] = child.to_h
40
+ end
41
+
42
+ node_hash
43
+ end
44
+
17
45
  def opts
18
46
  {}
19
47
  end
48
+
49
+ def node_type_str
50
+ fail "#{Kernel.__method__} is not implemented in included class."
51
+ end
20
52
  end
21
53
  end
@@ -1,11 +1,10 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
  require_relative 'yasuri_text_node'
6
4
  require_relative 'yasuri_struct_node'
7
5
  require_relative 'yasuri_links_node'
8
6
  require_relative 'yasuri_paginate_node'
7
+ require_relative 'yasuri_map_node'
9
8
 
10
9
  module Yasuri
11
10
  class NodeGenerator
@@ -15,29 +14,33 @@ module Yasuri
15
14
  @nodes
16
15
  end
17
16
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
17
+ def method_missing(name, pattern=nil, **args, &block)
18
+ node = NodeGenerator.gen(name, pattern, **args, &block)
20
19
  raise "Undefined Node Name '#{name}'" if node == nil
21
20
  @nodes << node
22
21
  end
23
22
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
27
- children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
23
+ def self.gen(method_name, xpath, **opt, &block)
24
+ children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block
28
25
 
29
- case name
26
+ case method_name
30
27
  when /^text_(.+)$/
31
- Yasuri::TextNode.new(xpath, $1, children || [], *opt)
28
+ # Todo raise error xpath is not valid
29
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
32
30
  when /^struct_(.+)$/
33
- Yasuri::StructNode.new(xpath, $1, children || [], *opt)
31
+ # Todo raise error xpath is not valid
32
+ Yasuri::StructNode.new(xpath, $1, children || [], **opt)
34
33
  when /^links_(.+)$/
35
- Yasuri::LinksNode.new(xpath, $1, children || [], *opt)
34
+ # Todo raise error xpath is not valid
35
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
36
36
  when /^pages_(.+)$/
37
- Yasuri::PaginateNode.new(xpath, $1, children || [], *opt)
37
+ # Todo raise error xpath is not valid
38
+ Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
39
+ when /^map_(.+)$/
40
+ Yasuri::MapNode.new($1, children, **opt)
38
41
  else
39
42
  nil
40
43
  end
41
- end # of self.gen(name, *args, &block)
44
+ end # of self.gen(method_name, xpath, **opt, &block)
42
45
  end # of class NodeGenerator
43
46
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -14,35 +12,47 @@ module Yasuri
14
12
  end
15
13
 
16
14
  def inject(agent, page, opt = {}, element = page)
17
- retry_count = opt[:retry_count] || 5
18
-
19
15
  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
16
 
21
- child_results = []
22
17
  limit = @limit.nil? ? Float::MAX : @limit
18
+ child_results = inject_child(agent, page, limit, opt)
19
+
20
+ return child_results.map(&:values).flatten if @flatten == true
21
+
22
+ child_results
23
+ end
24
+
25
+ def opts
26
+ { limit: @limit, flatten: @flatten }
27
+ end
28
+
29
+ def node_type_str
30
+ "pages".freeze
31
+ end
32
+
33
+ private
34
+
35
+ def inject_child(agent, page, limit, opt)
36
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
37
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
38
+
39
+ child_results = []
23
40
  while page
24
41
  child_results_kv = @children.map do |child_node|
25
- child_name = Yasuri.NodeName(child_node.name, opt)
42
+ child_name = Yasuri.node_name(child_node.name, opt)
26
43
  [child_name, child_node.inject(agent, page, opt)]
27
44
  end
28
45
  child_results << Hash[child_results_kv]
29
46
 
30
- link = page.search(@xpath).first
31
- break if link == nil
47
+ link = page.search(@xpath).first # Todo raise: link is not found
48
+ break if link.nil?
32
49
 
33
50
  link_button = Mechanize::Page::Link.new(link, agent, page)
34
- page = Yasuri.with_retry(retry_count) { link_button.click }
51
+ page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
35
52
  break if (limit -= 1) <= 0
36
53
  end
37
54
 
38
- if @flatten == true
39
- return child_results.map{|h| h.values}.flatten
40
- end
41
-
42
55
  child_results
43
56
  end
44
- def opts
45
- {limit:@limit, flatten:@flatten}
46
- end
47
57
  end
48
58
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -10,12 +8,16 @@ module Yasuri
10
8
  sub_tags = element.search(@xpath)
11
9
  tree = sub_tags.map do |sub_tag|
12
10
  child_results_kv = @children.map do |child_node|
13
- child_name = Yasuri.NodeName(child_node.name, opt)
11
+ child_name = Yasuri.node_name(child_node.name, opt)
14
12
  [child_name, child_node.inject(agent, page, opt, sub_tag)]
15
13
  end
16
14
  Hash[child_results_kv]
17
15
  end
18
16
  tree.size == 1 ? tree.first : tree
19
- end # inject
17
+ end
18
+
19
+ def node_type_str
20
+ "struct".freeze
21
+ end
20
22
  end
21
23
  end
@@ -1,24 +1,24 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  class TextNode
8
6
  include Node
9
7
 
10
- def initialize(xpath, name, children = [], truncate: nil, proc:nil)
8
+ def initialize(xpath, name, children = [], **opt)
11
9
  super(xpath, name, children)
12
10
 
13
- truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
11
+ truncate = opt[:truncate]
12
+ proc = opt[:proc]
13
+
14
+ truncate = Regexp.new(truncate) unless truncate.nil? # regexp or nil
14
15
  @truncate = truncate
15
- @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
16
+ @truncate = Regexp.new(@truncate.to_s) unless @truncate.nil?
16
17
 
17
18
  @proc = proc.nil? ? nil : proc.to_sym
18
-
19
19
  end
20
20
 
21
- def inject(agent, page, opt = {}, element = page)
21
+ def inject(_agent, page, _opt = {}, element = page)
22
22
  node = element.search(@xpath)
23
23
  text = node.text.to_s
24
24
 
@@ -28,11 +28,16 @@ module Yasuri
28
28
  end
29
29
 
30
30
  text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
31
+
31
32
  text
32
33
  end
33
34
 
34
35
  def opts
35
- {truncate:@truncate, proc:@proc}
36
+ { truncate: @truncate, proc: @proc }
37
+ end
38
+
39
+ def node_type_str
40
+ "text".freeze
36
41
  end
37
42
  end
38
43
  end
@@ -0,0 +1,8 @@
1
+ {
2
+ "pages_root": {
3
+ "path": "/html/body/nav/span/a[@class='next']",
4
+ "limit": 10,
5
+ "flatten": false,
6
+ "text_content": "/html/body/p"
7
+ }
8
+ }
@@ -0,0 +1,5 @@
1
+ pages_root:
2
+ path: "/html/body/nav/span/a[@class='next']"
3
+ limit: 10
4
+ flatten: false
5
+ text_content: "/html/body/p"
@@ -0,0 +1,9 @@
1
+ {
2
+ ,,,
3
+ "pages_root": {
4
+ "path": "/html/body/nav/span/a[@class='next']",
5
+ "limit": 10,
6
+ "flatten": false,
7
+ "text_content": "/html/body/p"
8
+ }
9
+ }
@@ -0,0 +1,6 @@
1
+ ,,,
2
+ pages_root:
3
+ path: "/html/body/nav/span/a[@class='next']"
4
+ limit: 10
5
+ flatten: false
6
+ text_content: "/html/body/p"
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
 
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- # Author:: TAC (tac@tac42.net)
3
1
 
4
2
  require 'glint'
5
3
  Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
@@ -12,16 +10,11 @@ shared_context 'httpserver' do
12
10
  }
13
11
  end
14
12
 
15
-
16
- # ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
17
- # require "codeclimate-test-reporter"
18
- # CodeClimate::TestReporter.start
19
-
20
13
  require 'simplecov'
21
14
  require 'coveralls'
22
15
  Coveralls.wear!
23
16
 
24
- SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
17
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
25
18
  SimpleCov::Formatter::HTMLFormatter,
26
19
  Coveralls::SimpleCov::Formatter
27
20
  ]
@@ -31,8 +24,8 @@ SimpleCov.start
31
24
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
32
25
  require 'yasuri'
33
26
 
34
- def compare_generated_vs_original(generated, original, page)
35
- expected = original.inject(@agent, page)
36
- actual = generated.inject(@agent, page)
27
+ def compare_generated_vs_original(generated, original, uri)
28
+ expected = original.scrape(uri)
29
+ actual = generated.scrape(uri)
37
30
  expect(actual).to match expected
38
31
  end
@@ -0,0 +1,114 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+ let(:res_dir) { File.expand_path('cli_resources', __dir__) }
6
+
7
+ describe 'cli scrape' do
8
+ it 'require --file or --json option' do
9
+ expect do
10
+ Yasuri::CLI.new.invoke(:scrape, [uri], {})
11
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
12
+ end
13
+
14
+ it 'only one of --file or --json option' do
15
+ expect do
16
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'path.json', json: '{"text_title": "/html/head/title"}' })
17
+ end.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
18
+ end
19
+
20
+ it 'require --file option is not empty string' do
21
+ expect do
22
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: 'file' })
23
+ end.to output("ERROR: --file option require not empty argument.\n").to_stderr
24
+ end
25
+
26
+ it 'require --json option is not empty string' do
27
+ expect do
28
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: 'json' })
29
+ end.to output("ERROR: --json option require not empty argument.\n").to_stderr
30
+ end
31
+
32
+ it 'display text node as simple string' do
33
+ expect do
34
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_title": "/html/head/title"}' })
35
+ end.to output("Yasuri Test\n").to_stdout
36
+ end
37
+
38
+ it 'display texts in single json' do
39
+ expect do
40
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}' })
41
+ end.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}' << "\n").to_stdout
42
+ end
43
+
44
+ it 'display text node as simple string via json file' do
45
+ expect do
46
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.json" })
47
+ end.to output(
48
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
49
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
50
+ "\n"
51
+ ).to_stdout
52
+ end
53
+
54
+ it 'display text node as simple string via yaml file' do
55
+ expect do
56
+ Yasuri::CLI.new.invoke(:scrape, ["#{uri}/pagination/page01.html"], { file: "#{res_dir}/tree.yml" })
57
+ end.to output(
58
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
59
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
60
+ "\n"
61
+ ).to_stdout
62
+ end
63
+
64
+ it 'interval option is effect for each request' do
65
+ allow(Kernel).to receive(:sleep)
66
+
67
+ expect do
68
+ Yasuri::CLI.new.invoke(
69
+ :scrape,
70
+ ["#{uri}/pagination/page01.html"],
71
+ { file: "#{res_dir}/tree.yml", interval: 500 }
72
+ )
73
+ end.to output(
74
+ '[{"content":"PaginationTest01"},{"content":"PaginationTest02"},' \
75
+ '{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' \
76
+ "\n"
77
+ ).to_stdout
78
+
79
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
80
+ expect(interval_sec).to match 0.5
81
+ end
82
+ end
83
+
84
+ it 'display ERROR when json string is wrong' do
85
+ wrong_json = '{,,}'
86
+ expect do
87
+ Yasuri::CLI.new.invoke(:scrape, [uri], { json: wrong_json })
88
+ end.to output(
89
+ 'ERROR: Failed to convert json to yasuri tree. ' \
90
+ "809: unexpected token at '#{wrong_json}'\n"
91
+ ).to_stderr
92
+ end
93
+
94
+ it 'display ERROR when json file contains is wrong' do
95
+ file_path = "#{res_dir}/tree_wrong.json"
96
+ expect do
97
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
98
+ end.to output(
99
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
100
+ "(<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
101
+ ).to_stderr
102
+ end
103
+
104
+ it 'display ERROR when yaml file contains is wrong' do
105
+ file_path = "#{res_dir}/tree_wrong.yml"
106
+ expect do
107
+ Yasuri::CLI.new.invoke(:scrape, [uri], { file: file_path })
108
+ end.to output(
109
+ "ERROR: Failed to convert to yasuri tree `#{file_path}`. " \
110
+ "(<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
111
+ ).to_stderr
112
+ end
113
+ end
114
+ end