yasuri 2.0.11 → 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "yasuri"
4
+
5
+ Yasuri::CLI.start
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.11"
2
+ VERSION = "3.2.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -4,28 +4,44 @@
4
4
 
5
5
  require 'mechanize'
6
6
  require 'json'
7
+ require 'yaml'
7
8
 
8
9
  require_relative 'yasuri_node'
9
10
  require_relative 'yasuri_text_node'
10
11
  require_relative 'yasuri_struct_node'
11
12
  require_relative 'yasuri_paginate_node'
12
13
  require_relative 'yasuri_links_node'
14
+ require_relative 'yasuri_map_node'
13
15
  require_relative 'yasuri_node_generator'
14
16
 
15
17
  module Yasuri
16
18
 
19
+ DefaultRetryCount = 5
20
+
17
21
  def self.json2tree(json_string)
18
- json = JSON.parse(json_string, {symbolize_names: true})
19
- Yasuri.hash2node(json)
22
+ raise RuntimeError if json_string.nil? or json_string.empty?
23
+
24
+ node_hash = JSON.parse(json_string, {symbolize_names: true})
25
+ Yasuri.hash2node(node_hash)
20
26
  end
21
27
 
22
28
  def self.tree2json(node)
29
+ raise RuntimeError if node.nil?
30
+
23
31
  Yasuri.node2hash(node).to_json
24
32
  end
25
33
 
26
- def self.method_missing(name, *args, &block)
27
- generated = Yasuri::NodeGenerator.gen(name, *args, &block)
28
- generated || super(name, args)
34
+ def self.yaml2tree(yaml_string)
35
+ raise RuntimeError if yaml_string.nil? or yaml_string.empty?
36
+
37
+ node_hash = YAML.load(yaml_string)
38
+ Yasuri.hash2node(node_hash.deep_symbolize_keys)
39
+ end
40
+
41
+ private
42
+ def self.method_missing(method_name, pattern=nil, **opt, &block)
43
+ generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
44
+ generated || super(method_name, **opt)
29
45
  end
30
46
 
31
47
  private
@@ -33,52 +49,66 @@ module Yasuri
33
49
  text: Yasuri::TextNode,
34
50
  struct: Yasuri::StructNode,
35
51
  links: Yasuri::LinksNode,
36
- pages: Yasuri::PaginateNode
52
+ pages: Yasuri::PaginateNode,
53
+ map: Yasuri::MapNode
37
54
  }
38
- Node2Text = Text2Node.invert
39
55
 
40
- ReservedKeys = %i|node name path children|
41
- def self.hash2node(node_h)
42
- node, name, path, children = ReservedKeys.map do |key|
43
- node_h[key]
56
+ def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
57
+ raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
58
+
59
+ node_prefixes = Text2Node.keys.freeze
60
+ child_nodes = []
61
+ opt = {}
62
+ path = nil
63
+
64
+ if node_hash.is_a?(String)
65
+ path = node_hash
66
+ else
67
+ node_hash.each do |key, value|
68
+ # is node?
69
+ node_regexps = Text2Node.keys.map do |node_type_sym|
70
+ /^(#{node_type_sym.to_s})_(.+)$/
71
+ end
72
+ node_regexp = node_regexps.find do |node_regexp|
73
+ key =~ node_regexp
74
+ end
75
+
76
+ case key
77
+ when node_regexp
78
+ node_type_sym = $1.to_sym
79
+ child_node_name = $2
80
+ child_node_type = Text2Node[node_type_sym]
81
+ child_nodes << self.hash2node(value, child_node_name, child_node_type)
82
+ when :path
83
+ path = value
84
+ else
85
+ opt[key] = value
86
+ end
87
+ end
44
88
  end
45
- children ||= []
46
89
 
47
- fail "Not found 'node' value in json" if node.nil?
48
- fail "Not found 'name' value in json" if name.nil?
49
- fail "Not found 'path' value in json" if path.nil?
90
+ # If only single node under root, return only the node.
91
+ return child_nodes.first if node_name.nil? and child_nodes.size == 1
50
92
 
51
- childnodes = children.map{|c| Yasuri.hash2node(c) }
52
- ReservedKeys.each{|key| node_h.delete(key)}
53
- opt = node_h
93
+ node = if node_type_class.nil?
94
+ Yasuri::MapNode.new(node_name, child_nodes, **opt)
95
+ else
96
+ node_type_class::new(path, node_name, child_nodes, **opt)
97
+ end
54
98
 
55
- klass = Text2Node[node.to_sym]
56
- fail "Undefined node type #{node}" if klass.nil?
57
- klass.new(path, name, childnodes, opt)
99
+ node
58
100
  end
59
101
 
60
102
  def self.node2hash(node)
61
- json = JSON.parse("{}")
62
- return json if node.nil?
103
+ return node.to_h if node.instance_of?(Yasuri::MapNode)
63
104
 
64
- klass = node.class
65
- klass_str = Node2Text[klass]
66
-
67
- json["node"] = klass_str
68
- json["name"] = node.name
69
- json["path"] = node.xpath
70
-
71
- children = node.children.map{|c| Yasuri.node2hash(c)}
72
- json["children"] = children if not children.empty?
73
-
74
- node.opts.each do |key,value|
75
- json[key] = value if not value.nil?
76
- end
77
-
78
- json
105
+ {
106
+ "#{node.node_type_str}_#{node.name}" => node.to_h
107
+ }
79
108
  end
80
109
 
81
- def self.NodeName(name, symbolize_names:false)
110
+ def self.node_name(name, opt)
111
+ symbolize_names = opt[:symbolize_names]
82
112
  symbolize_names ? name.to_sym : name
83
113
  end
84
114
 
@@ -95,3 +125,14 @@ module Yasuri
95
125
  end
96
126
  end
97
127
  end
128
+
129
+ class Hash
130
+ def deep_symbolize_keys
131
+ Hash[
132
+ self.map do |k, v|
133
+ v = v.deep_symbolize_keys if v.kind_of?(Hash)
134
+ [k.to_sym, v]
135
+ end
136
+ ]
137
+ end
138
+ end
@@ -0,0 +1,64 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+ require 'mechanize'
5
+
6
+ module Yasuri
7
+ class CLI < Thor
8
+ package_name "yasuri"
9
+
10
+ default_command :scrape
11
+ desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
12
+ option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
13
+ option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
14
+ def scrape(uri)
15
+ # argument validations
16
+ if [options[:file], options[:json]].compact.count != 1
17
+ $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
18
+ return -1
19
+ end
20
+ if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
21
+ $stderr.puts "ERROR: --file option require not empty argument."
22
+ return -1
23
+ end
24
+ if options[:json]&.empty? or options[:json] == "json"
25
+ $stderr.puts "ERROR: --json option require not empty argument."
26
+ return -1
27
+ end
28
+
29
+ tree = if options[:file]
30
+ src = File.read(options[:file])
31
+
32
+ begin
33
+ Yasuri.json2tree(src)
34
+ rescue
35
+ begin
36
+ Yasuri.yaml2tree(src)
37
+ rescue => e
38
+ $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
39
+ return -1
40
+ end
41
+ end
42
+ else
43
+ begin
44
+ Yasuri.json2tree(options[:json])
45
+ rescue => e
46
+ $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
47
+ return -1
48
+ end
49
+ end
50
+
51
+ agent = Mechanize.new
52
+ root_page = agent.get(uri)
53
+ result = tree.inject(agent, root_page)
54
+
55
+ if result.instance_of?(String)
56
+ puts result
57
+ else
58
+ j result
59
+ end
60
+
61
+ return 0
62
+ end
63
+ end
64
+ end
@@ -6,21 +6,25 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
10
- retry_count = opt[:retry_count] || 5
9
+ def inject(agent, page, opt = {}, element = page)
10
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
11
11
 
12
- links = page.search(@xpath) || [] # links expected
12
+ links = element.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
16
16
 
17
17
  child_results_kv = @children.map do |child_node|
18
- child_name = Yasuri.NodeName(child_node.name, opt)
18
+ child_name = Yasuri.node_name(child_node.name, opt)
19
19
  [child_name, child_node.inject(agent, child_page, opt)]
20
20
  end
21
21
 
22
22
  Hash[child_results_kv]
23
23
  end # each named child node
24
24
  end
25
- end
26
- end
25
+
26
+ def node_type_str
27
+ "links".freeze
28
+ end
29
+ end # class
30
+ end # module
@@ -0,0 +1,39 @@
1
+
2
+ module Yasuri
3
+ class MapNode
4
+ attr_reader :name, :children
5
+
6
+ def initialize(name, children, **opt)
7
+ @name = name
8
+ @children = children
9
+ @opt = opt
10
+ end
11
+
12
+ def inject(agent, page, opt = {}, element = page)
13
+ child_results_kv = @children.map do |node|
14
+ [node.name, node.inject(agent, page, opt)]
15
+ end
16
+ Hash[child_results_kv]
17
+ end
18
+
19
+ def to_h
20
+ node_hash = {}
21
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
22
+
23
+ children.each do |child|
24
+ child_node_name = "#{child.node_type_str}_#{child.name}"
25
+ node_hash[child_node_name] = child.to_h
26
+ end
27
+
28
+ node_hash
29
+ end
30
+
31
+ def opts
32
+ {}
33
+ end
34
+
35
+ def node_type_str
36
+ "map".freeze
37
+ end
38
+ end
39
+ end
@@ -7,15 +7,36 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt: {})
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {})
15
- fail "#{Kernel.__method__} is not implemented."
14
+ def inject(agent, page, opt = {}, element = page)
15
+ fail "#{Kernel.__method__} is not implemented in included class."
16
16
  end
17
+
18
+ def to_h
19
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
20
+
21
+ node_hash = {}
22
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
23
+
24
+ node_hash[:path] = @xpath if @xpath
25
+
26
+ children.each do |child|
27
+ child_node_name = "#{child.node_type_str}_#{child.name}"
28
+ node_hash[child_node_name] = child.to_h
29
+ end
30
+
31
+ node_hash
32
+ end
33
+
17
34
  def opts
18
35
  {}
19
36
  end
37
+
38
+ def node_type_str
39
+ fail "#{Kernel.__method__} is not implemented in included class."
40
+ end
20
41
  end
21
42
  end
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
6
6
  require_relative 'yasuri_struct_node'
7
7
  require_relative 'yasuri_links_node'
8
8
  require_relative 'yasuri_paginate_node'
9
+ require_relative 'yasuri_map_node'
9
10
 
10
11
  module Yasuri
11
12
  class NodeGenerator
@@ -15,29 +16,33 @@ module Yasuri
15
16
  @nodes
16
17
  end
17
18
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
19
+ def method_missing(name, pattern=nil, **args, &block)
20
+ node = NodeGenerator.gen(name, pattern, **args, &block)
20
21
  raise "Undefined Node Name '#{name}'" if node == nil
21
22
  @nodes << node
22
23
  end
23
24
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
25
+ def self.gen(method_name, xpath, **opt, &block)
27
26
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
28
27
 
29
- case name
28
+ case method_name
30
29
  when /^text_(.+)$/
31
- Yasuri::TextNode.new(xpath, $1, children || [], *opt)
30
+ # Todo raise error xpath is not valid
31
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
32
32
  when /^struct_(.+)$/
33
- Yasuri::StructNode.new(xpath, $1, children || [], *opt)
33
+ # Todo raise error xpath is not valid
34
+ Yasuri::StructNode.new(xpath, $1, children || [], **opt)
34
35
  when /^links_(.+)$/
35
- Yasuri::LinksNode.new(xpath, $1, children || [], *opt)
36
+ # Todo raise error xpath is not valid
37
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
36
38
  when /^pages_(.+)$/
37
- Yasuri::PaginateNode.new(xpath, $1, children || [], *opt)
39
+ # Todo raise error xpath is not valid
40
+ Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
41
+ when /^map_(.+)$/
42
+ Yasuri::MapNode.new($1, children, **opt)
38
43
  else
39
44
  nil
40
45
  end
41
- end # of self.gen(name, *args, &block)
46
+ end # of self.gen(method_name, xpath, **opt, &block)
42
47
  end # of class NodeGenerator
43
48
  end
@@ -7,24 +7,27 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], limit: nil)
10
+ def initialize(xpath, name, children = [], limit: nil, flatten: false)
11
11
  super(xpath, name, children)
12
+ @flatten = flatten
12
13
  @limit = limit
13
14
  end
14
15
 
15
- def inject(agent, page, opt = {})
16
- retry_count = opt[:retry_count] || 5
16
+ def inject(agent, page, opt = {}, element = page)
17
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
+
19
+ raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
17
20
 
18
21
  child_results = []
19
22
  limit = @limit.nil? ? Float::MAX : @limit
20
23
  while page
21
24
  child_results_kv = @children.map do |child_node|
22
- child_name = Yasuri.NodeName(child_node.name, opt)
25
+ child_name = Yasuri.node_name(child_node.name, opt)
23
26
  [child_name, child_node.inject(agent, page, opt)]
24
27
  end
25
28
  child_results << Hash[child_results_kv]
26
29
 
27
- link = page.search(@xpath).first
30
+ link = page.search(@xpath).first # Todo raise: link is not found
28
31
  break if link == nil
29
32
 
30
33
  link_button = Mechanize::Page::Link.new(link, agent, page)
@@ -32,10 +35,19 @@ module Yasuri
32
35
  break if (limit -= 1) <= 0
33
36
  end
34
37
 
38
+ if @flatten == true
39
+ return child_results.map{|h| h.values}.flatten
40
+ end
41
+
35
42
  child_results
36
43
  end
44
+
37
45
  def opts
38
- {limit:@limit}
46
+ {limit:@limit, flatten:@flatten}
47
+ end
48
+
49
+ def node_type_str
50
+ "pages".freeze
39
51
  end
40
52
  end
41
53
  end