yasuri 2.0.11 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "yasuri"
4
+
5
+ Yasuri::CLI.start
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.11"
2
+ VERSION = "3.2.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -4,28 +4,44 @@
4
4
 
5
5
  require 'mechanize'
6
6
  require 'json'
7
+ require 'yaml'
7
8
 
8
9
  require_relative 'yasuri_node'
9
10
  require_relative 'yasuri_text_node'
10
11
  require_relative 'yasuri_struct_node'
11
12
  require_relative 'yasuri_paginate_node'
12
13
  require_relative 'yasuri_links_node'
14
+ require_relative 'yasuri_map_node'
13
15
  require_relative 'yasuri_node_generator'
14
16
 
15
17
  module Yasuri
16
18
 
19
+ DefaultRetryCount = 5
20
+
17
21
  def self.json2tree(json_string)
18
- json = JSON.parse(json_string, {symbolize_names: true})
19
- Yasuri.hash2node(json)
22
+ raise RuntimeError if json_string.nil? or json_string.empty?
23
+
24
+ node_hash = JSON.parse(json_string, {symbolize_names: true})
25
+ Yasuri.hash2node(node_hash)
20
26
  end
21
27
 
22
28
  def self.tree2json(node)
29
+ raise RuntimeError if node.nil?
30
+
23
31
  Yasuri.node2hash(node).to_json
24
32
  end
25
33
 
26
- def self.method_missing(name, *args, &block)
27
- generated = Yasuri::NodeGenerator.gen(name, *args, &block)
28
- generated || super(name, args)
34
# Builds a Yasuri node tree from a YAML document.
#
# @param yaml_string [String] tree definition written as YAML; the top level
#   must be a mapping
# @return [Object] whatever Yasuri.hash2node builds from the parsed hash
# @raise [RuntimeError] when yaml_string is nil or empty, or when the document
#   does not parse to a Hash
def self.yaml2tree(yaml_string)
  if yaml_string.nil? || yaml_string.empty?
    raise RuntimeError, "yaml_string must be a non-empty YAML document"
  end

  # NOTE(review): YAML.load on externally supplied text can instantiate
  # arbitrary objects on older Psych versions; consider YAML.safe_load if the
  # tree files are not trusted.
  node_hash = YAML.load(yaml_string)

  # Scalars and sequences have no keys to symbolize; fail with a clear message
  # instead of a NoMethodError on deep_symbolize_keys.
  unless node_hash.is_a?(Hash)
    raise RuntimeError, "YAML tree must be a mapping, got #{node_hash.class}"
  end

  Yasuri.hash2node(node_hash.deep_symbolize_keys)
end
40
+
41
+ private
42
# Dynamic node factory: a call such as Yasuri.text_title(pattern, **opt) is
# handed to Yasuri::NodeGenerator.gen, which builds a node from the method name.
#
# NOTE: a bare `private` placed before this definition has no effect on it,
# because `private` without arguments does not apply to `def self.` methods.
#
# @param method_name [Symbol] name of the missing method
# @param pattern [String, nil] passed to NodeGenerator.gen as the node's xpath
# @param opt [Hash] keyword options forwarded to the generated node
# @return [Object] the node returned by NodeGenerator.gen
# @raise [NoMethodError] when NodeGenerator.gen returns nil for method_name
def self.method_missing(method_name, pattern=nil, **opt, &block)
  generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  # Bare `super` forwards every original argument (pattern and block included),
  # so the resulting NoMethodError reports the call as it was made.
  generated || super
end

# Keeps respond_to? in sync with method_missing. The prefixes mirror the ones
# dispatched on by NodeGenerator.gen (text_, struct_, links_, pages_, map_).
def self.respond_to_missing?(method_name, include_private = false)
  method_name.to_s.match?(/\A(?:text|struct|links|pages|map)_.+\z/m) || super
end
30
46
 
31
47
  private
@@ -33,52 +49,66 @@ module Yasuri
33
49
  text: Yasuri::TextNode,
34
50
  struct: Yasuri::StructNode,
35
51
  links: Yasuri::LinksNode,
36
- pages: Yasuri::PaginateNode
52
+ pages: Yasuri::PaginateNode,
53
+ map: Yasuri::MapNode
37
54
  }
38
- Node2Text = Text2Node.invert
39
55
 
40
- ReservedKeys = %i|node name path children|
41
- def self.hash2node(node_h)
42
- node, name, path, children = ReservedKeys.map do |key|
43
- node_h[key]
56
# Recursively converts a parsed tree definition into node objects.
#
# A definition is either a String (used as the node's path) or a Hash whose
# keys are interpreted as follows:
# * "<type>_<name>" where <type> is a key of Text2Node: a child node; the value
#   is converted recursively with that name and the class found in Text2Node
# * :path: the node's own path
# * anything else: an option forwarded to the node's constructor as a keyword
#
# @param node_hash [Hash, String] definition of this node
# @param node_name [String, nil] name of this node; nil for the root call
# @param node_type_class [Class, nil] class to instantiate; nil builds a
#   Yasuri::MapNode
# @return [Object] the built node. For the root call, when exactly one child
#   was found, that child is returned instead of a wrapping MapNode.
# @raise [RuntimeError] when the definition is nil, or is empty at the root
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  if node_hash.nil?
    raise RuntimeError, "Tree definition is missing for node #{node_name.inspect}"
  end
  raise RuntimeError, "Tree definition is empty" if node_name.nil? && node_hash.empty?

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    path = node_hash
  else
    # One pattern for all node types, built once instead of once per key.
    # Alternatives are tried in Text2Node's key order.
    node_key_pattern = /^(#{Text2Node.keys.join('|')})_(.+)$/

    node_hash.each do |key, value|
      matched = node_key_pattern.match(key.to_s)

      if matched
        child_node_type = Text2Node[matched[1].to_sym]
        child_nodes << hash2node(value, matched[2], child_node_type)
      elsif key == :path
        path = value
      else
        opt[key] = value
      end
    end
  end

  # If only single node under root, return only the node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  if node_type_class.nil?
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
59
101
 
60
102
  def self.node2hash(node)
61
- json = JSON.parse("{}")
62
- return json if node.nil?
103
+ return node.to_h if node.instance_of?(Yasuri::MapNode)
63
104
 
64
- klass = node.class
65
- klass_str = Node2Text[klass]
66
-
67
- json["node"] = klass_str
68
- json["name"] = node.name
69
- json["path"] = node.xpath
70
-
71
- children = node.children.map{|c| Yasuri.node2hash(c)}
72
- json["children"] = children if not children.empty?
73
-
74
- node.opts.each do |key,value|
75
- json[key] = value if not value.nil?
76
- end
77
-
78
- json
105
+ {
106
+ "#{node.node_type_str}_#{node.name}" => node.to_h
107
+ }
79
108
  end
80
109
 
81
- def self.NodeName(name, symbolize_names:false)
110
# Returns the key under which a node's result is stored.
#
# @param name [String] node name
# @param opt [Hash] options; a truthy :symbolize_names turns the name into a Symbol
# @return [String, Symbol] name, or name.to_sym when :symbolize_names is truthy
def self.node_name(name, opt)
  return name unless opt[:symbolize_names]

  name.to_sym
end
84
114
 
@@ -95,3 +125,14 @@ module Yasuri
95
125
  end
96
126
  end
97
127
  end
128
+
129
class Hash
  # Returns a new Hash whose keys are converted to symbols at every nesting
  # level, including hashes stored inside arrays.
  #
  # Keys that do not respond to to_sym (for example Integers) are kept as they
  # are instead of raising. Because this reopens the core Hash class, it must
  # not blow up on hashes this library did not create.
  #
  # @return [Hash] a copy with symbolized keys; the receiver is not modified
  def deep_symbolize_keys
    # Converts a value recursively: hashes are symbolized, arrays are walked,
    # everything else is returned untouched.
    convert = lambda do |value|
      case value
      when Hash then value.deep_symbolize_keys
      when Array then value.map(&convert)
      else value
      end
    end

    each_with_object({}) do |(key, value), symbolized|
      new_key = key.respond_to?(:to_sym) ? key.to_sym : key
      symbolized[new_key] = convert.call(value)
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+ require 'mechanize'
5
+
6
module Yasuri
  # Thor-based command line interface. Its only command, `scrape`, is also the
  # default command.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    # Fetches +uri+ and applies a node tree to it, printing the result to
    # stdout (a String result as-is, anything else as JSON).
    #
    # Exactly one of --file / --json must be given. A file is first parsed as
    # JSON and, if that fails, as YAML.
    #
    # @param uri [String] address handed to Mechanize#get
    # @return [Integer] 0 on success, -1 after printing an error to stderr
    def scrape(uri)
      # argument validations
      if [options[:file], options[:json]].compact.count != 1
        $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
        return -1
      end
      # Only --file is inspected here; an empty --json is reported by the next
      # check with its own message.
      # NOTE(review): the literal "file"/"json" values presumably are what Thor
      # yields when the flag is given without a value — confirm.
      if options[:file]&.empty? || options[:file] == "file"
        $stderr.puts "ERROR: --file option require not empty argument."
        return -1
      end
      if options[:json]&.empty? || options[:json] == "json"
        $stderr.puts "ERROR: --json option require not empty argument."
        return -1
      end

      tree = if options[:file]
        src = File.read(options[:file])

        begin
          Yasuri.json2tree(src)
        rescue
          # Not JSON (or not a valid JSON tree): try the same text as YAML.
          begin
            Yasuri.yaml2tree(src)
          rescue => e
            $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
            return -1
          end
        end
      else
        begin
          Yasuri.json2tree(options[:json])
        rescue => e
          $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
          return -1
        end
      end

      agent = Mechanize.new
      root_page = agent.get(uri)
      result = tree.inject(agent, root_page)

      if result.instance_of?(String)
        puts result
      else
        j result
      end

      return 0
    end
  end
end
@@ -6,21 +6,25 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
- def inject(agent, page, opt = {})
10
- retry_count = opt[:retry_count] || 5
9
+ def inject(agent, page, opt = {}, element = page)
10
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
11
11
 
12
- links = page.search(@xpath) || [] # links expected
12
+ links = element.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
15
  child_page = Yasuri.with_retry(retry_count) { link_button.click }
16
16
 
17
17
  child_results_kv = @children.map do |child_node|
18
- child_name = Yasuri.NodeName(child_node.name, opt)
18
+ child_name = Yasuri.node_name(child_node.name, opt)
19
19
  [child_name, child_node.inject(agent, child_page, opt)]
20
20
  end
21
21
 
22
22
  Hash[child_results_kv]
23
23
  end # each named child node
24
24
  end
25
- end
26
- end
25
+
26
+ def node_type_str
27
+ "links".freeze
28
+ end
29
+ end # class
30
+ end # module
@@ -0,0 +1,39 @@
1
+
2
module Yasuri
  # Node without a path of its own: it only groups child nodes and returns
  # their results keyed by child name.
  class MapNode
    attr_reader :name, :children

    # @param name [String, nil] node name
    # @param children [Array, nil] child nodes; nil is treated as "no children"
    #   so that a map node built without children still works
    # @param opt [Hash] extra options; stored but not used by this class
    def initialize(name, children, **opt)
      @name = name
      @children = children || []
      @opt = opt
    end

    # Evaluates every child against +page+.
    #
    # NOTE(review): keys are the raw child names; opt[:symbolize_names] is not
    # applied here — confirm whether that is intended.
    #
    # @param agent [Object] passed through to each child
    # @param page [Object] passed through to each child
    # @param opt [Hash] passed through to each child
    # @param element [Object] accepted for signature compatibility; unused
    # @return [Hash] child name => child result
    def inject(agent, page, opt = {}, element = page)
      child_results_kv = @children.map do |node|
        [node.name, node.inject(agent, page, opt)]
      end
      Hash[child_results_kv]
    end

    # @return [Hash] non-nil entries of #opts followed by one
    #   "<node_type_str>_<name>" => child.to_h entry per child
    def to_h
      node_hash = {}
      self.opts.each{|k, v| node_hash[k] = v if not v.nil?}

      children.each do |child|
        child_node_name = "#{child.node_type_str}_#{child.name}"
        node_hash[child_node_name] = child.to_h
      end

      node_hash
    end

    # Always empty: options given to the constructor are not serialized.
    def opts
      {}
    end

    # Prefix used for this node type in serialized keys.
    def node_type_str
      "map".freeze
    end
  end
end
@@ -7,15 +7,36 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt: {})
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
- def inject(agent, page, opt = {})
15
- fail "#{Kernel.__method__} is not implemented."
14
+ def inject(agent, page, opt = {}, element = page)
15
+ fail "#{Kernel.__method__} is not implemented in included class."
16
16
  end
17
+
18
+ def to_h
19
+ return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
20
+
21
+ node_hash = {}
22
+ self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
23
+
24
+ node_hash[:path] = @xpath if @xpath
25
+
26
+ children.each do |child|
27
+ child_node_name = "#{child.node_type_str}_#{child.name}"
28
+ node_hash[child_node_name] = child.to_h
29
+ end
30
+
31
+ node_hash
32
+ end
33
+
17
34
  def opts
18
35
  {}
19
36
  end
37
+
38
+ def node_type_str
39
+ fail "#{Kernel.__method__} is not implemented in included class."
40
+ end
20
41
  end
21
42
  end
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
6
6
  require_relative 'yasuri_struct_node'
7
7
  require_relative 'yasuri_links_node'
8
8
  require_relative 'yasuri_paginate_node'
9
+ require_relative 'yasuri_map_node'
9
10
 
10
11
  module Yasuri
11
12
  class NodeGenerator
@@ -15,29 +16,33 @@ module Yasuri
15
16
  @nodes
16
17
  end
17
18
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
19
# Turns an unknown method call inside a generator block into a node and
# appends it to @nodes.
#
# @param name [Symbol] called method name, handed to NodeGenerator.gen
# @param pattern [String, nil] handed to NodeGenerator.gen
# @param args [Hash] keyword options handed to NodeGenerator.gen
# @return [Array] @nodes after the new node was appended
# @raise [RuntimeError] when NodeGenerator.gen returns nil for the name
def method_missing(name, pattern=nil, **args, &block)
  generated = NodeGenerator.gen(name, pattern, **args, &block)
  raise "Undefined Node Name '#{name}'" if generated.nil?

  @nodes << generated
end
23
24
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
25
# Builds a node from a DSL method name of the form "<type>_<name>".
#
# @param method_name [Symbol, String] e.g. :text_title; the prefix selects the
#   node class and the remainder becomes the node name
# @param xpath [String, nil] path for the node (not used for map nodes)
# @param opt [Hash] keyword options forwarded to the node constructor
# @yield optional block evaluated through NodeGenerator#gen_recursive to
#   produce the child nodes
# @return [Object, nil] the new node, or nil when the prefix is not recognised
def self.gen(method_name, xpath, **opt, &block)
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
  # Every node type, map included, receives an Array of children; without a
  # block that is an empty one rather than nil.
  children ||= []

  case method_name
  when /^text_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::TextNode.new(xpath, $1, children, **opt)
  when /^struct_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::StructNode.new(xpath, $1, children, **opt)
  when /^links_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::LinksNode.new(xpath, $1, children, **opt)
  when /^pages_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::PaginateNode.new(xpath, $1, children, **opt)
  when /^map_(.+)$/
    Yasuri::MapNode.new($1, children, **opt)
  else
    nil
  end
end # of self.gen(method_name, xpath, **opt, &block)
42
47
  end # of class NodeGenerator
43
48
  end
@@ -7,24 +7,27 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], limit: nil)
10
+ def initialize(xpath, name, children = [], limit: nil, flatten: false)
11
11
  super(xpath, name, children)
12
+ @flatten = flatten
12
13
  @limit = limit
13
14
  end
14
15
 
15
- def inject(agent, page, opt = {})
16
- retry_count = opt[:retry_count] || 5
16
+ def inject(agent, page, opt = {}, element = page)
17
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
+
19
+ raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
17
20
 
18
21
  child_results = []
19
22
  limit = @limit.nil? ? Float::MAX : @limit
20
23
  while page
21
24
  child_results_kv = @children.map do |child_node|
22
- child_name = Yasuri.NodeName(child_node.name, opt)
25
+ child_name = Yasuri.node_name(child_node.name, opt)
23
26
  [child_name, child_node.inject(agent, page, opt)]
24
27
  end
25
28
  child_results << Hash[child_results_kv]
26
29
 
27
- link = page.search(@xpath).first
30
+ link = page.search(@xpath).first # Todo raise: link is not found
28
31
  break if link == nil
29
32
 
30
33
  link_button = Mechanize::Page::Link.new(link, agent, page)
@@ -32,10 +35,19 @@ module Yasuri
32
35
  break if (limit -= 1) <= 0
33
36
  end
34
37
 
38
+ if @flatten == true
39
+ return child_results.map{|h| h.values}.flatten
40
+ end
41
+
35
42
  child_results
36
43
  end
44
+
37
45
  def opts
38
- {limit:@limit}
46
+ {limit:@limit, flatten:@flatten}
47
+ end
48
+
49
+ def node_type_str
50
+ "pages".freeze
39
51
  end
40
52
  end
41
53
  end