yasuri 2.0.11 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +88 -19
- data/USAGE.ja.md +325 -63
- data/USAGE.md +335 -69
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +80 -39
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +39 -0
- data/lib/yasuri/yasuri_node.rb +24 -3
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +18 -6
- data/lib/yasuri/yasuri_struct_node.rb +8 -4
- data/lib/yasuri/yasuri_text_node.rb +11 -4
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/spec_helper.rb +1 -6
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_map_spec.rb +76 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +199 -84
- data/spec/yasuri_struct_node_spec.rb +42 -1
- data/yasuri.gemspec +5 -3
- metadata +52 -19
data/exe/yasuri
ADDED
data/lib/yasuri.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -4,28 +4,44 @@
|
|
4
4
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
require_relative 'yasuri_node'
|
9
10
|
require_relative 'yasuri_text_node'
|
10
11
|
require_relative 'yasuri_struct_node'
|
11
12
|
require_relative 'yasuri_paginate_node'
|
12
13
|
require_relative 'yasuri_links_node'
|
14
|
+
require_relative 'yasuri_map_node'
|
13
15
|
require_relative 'yasuri_node_generator'
|
14
16
|
|
15
17
|
module Yasuri
|
16
18
|
|
19
|
+
DefaultRetryCount = 5
|
20
|
+
|
17
21
|
def self.json2tree(json_string)
|
18
|
-
|
19
|
-
|
22
|
+
raise RuntimeError if json_string.nil? or json_string.empty?
|
23
|
+
|
24
|
+
node_hash = JSON.parse(json_string, {symbolize_names: true})
|
25
|
+
Yasuri.hash2node(node_hash)
|
20
26
|
end
|
21
27
|
|
22
28
|
def self.tree2json(node)
|
29
|
+
raise RuntimeError if node.nil?
|
30
|
+
|
23
31
|
Yasuri.node2hash(node).to_json
|
24
32
|
end
|
25
33
|
|
26
|
-
def self.
|
27
|
-
|
28
|
-
|
34
|
+
def self.yaml2tree(yaml_string)
|
35
|
+
raise RuntimeError if yaml_string.nil? or yaml_string.empty?
|
36
|
+
|
37
|
+
node_hash = YAML.load(yaml_string)
|
38
|
+
Yasuri.hash2node(node_hash.deep_symbolize_keys)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
def self.method_missing(method_name, pattern=nil, **opt, &block)
|
43
|
+
generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
|
44
|
+
generated || super(method_name, **opt)
|
29
45
|
end
|
30
46
|
|
31
47
|
private
|
@@ -33,52 +49,66 @@ module Yasuri
|
|
33
49
|
text: Yasuri::TextNode,
|
34
50
|
struct: Yasuri::StructNode,
|
35
51
|
links: Yasuri::LinksNode,
|
36
|
-
pages: Yasuri::PaginateNode
|
52
|
+
pages: Yasuri::PaginateNode,
|
53
|
+
map: Yasuri::MapNode
|
37
54
|
}
|
38
|
-
Node2Text = Text2Node.invert
|
39
55
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
56
|
+
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
|
57
|
+
raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
|
58
|
+
|
59
|
+
node_prefixes = Text2Node.keys.freeze
|
60
|
+
child_nodes = []
|
61
|
+
opt = {}
|
62
|
+
path = nil
|
63
|
+
|
64
|
+
if node_hash.is_a?(String)
|
65
|
+
path = node_hash
|
66
|
+
else
|
67
|
+
node_hash.each do |key, value|
|
68
|
+
# is node?
|
69
|
+
node_regexps = Text2Node.keys.map do |node_type_sym|
|
70
|
+
/^(#{node_type_sym.to_s})_(.+)$/
|
71
|
+
end
|
72
|
+
node_regexp = node_regexps.find do |node_regexp|
|
73
|
+
key =~ node_regexp
|
74
|
+
end
|
75
|
+
|
76
|
+
case key
|
77
|
+
when node_regexp
|
78
|
+
node_type_sym = $1.to_sym
|
79
|
+
child_node_name = $2
|
80
|
+
child_node_type = Text2Node[node_type_sym]
|
81
|
+
child_nodes << self.hash2node(value, child_node_name, child_node_type)
|
82
|
+
when :path
|
83
|
+
path = value
|
84
|
+
else
|
85
|
+
opt[key] = value
|
86
|
+
end
|
87
|
+
end
|
44
88
|
end
|
45
|
-
children ||= []
|
46
89
|
|
47
|
-
|
48
|
-
|
49
|
-
fail "Not found 'path' value in json" if path.nil?
|
90
|
+
# If only single node under root, return only the node.
|
91
|
+
return child_nodes.first if node_name.nil? and child_nodes.size == 1
|
50
92
|
|
51
|
-
|
52
|
-
|
53
|
-
|
93
|
+
node = if node_type_class.nil?
|
94
|
+
Yasuri::MapNode.new(node_name, child_nodes, **opt)
|
95
|
+
else
|
96
|
+
node_type_class::new(path, node_name, child_nodes, **opt)
|
97
|
+
end
|
54
98
|
|
55
|
-
|
56
|
-
fail "Undefined node type #{node}" if klass.nil?
|
57
|
-
klass.new(path, name, childnodes, opt)
|
99
|
+
node
|
58
100
|
end
|
59
101
|
|
60
102
|
def self.node2hash(node)
|
61
|
-
|
62
|
-
return json if node.nil?
|
103
|
+
return node.to_h if node.instance_of?(Yasuri::MapNode)
|
63
104
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
json["node"] = klass_str
|
68
|
-
json["name"] = node.name
|
69
|
-
json["path"] = node.xpath
|
70
|
-
|
71
|
-
children = node.children.map{|c| Yasuri.node2hash(c)}
|
72
|
-
json["children"] = children if not children.empty?
|
73
|
-
|
74
|
-
node.opts.each do |key,value|
|
75
|
-
json[key] = value if not value.nil?
|
76
|
-
end
|
77
|
-
|
78
|
-
json
|
105
|
+
{
|
106
|
+
"#{node.node_type_str}_#{node.name}" => node.to_h
|
107
|
+
}
|
79
108
|
end
|
80
109
|
|
81
|
-
def self.
|
110
|
+
def self.node_name(name, opt)
|
111
|
+
symbolize_names = opt[:symbolize_names]
|
82
112
|
symbolize_names ? name.to_sym : name
|
83
113
|
end
|
84
114
|
|
@@ -95,3 +125,14 @@ module Yasuri
|
|
95
125
|
end
|
96
126
|
end
|
97
127
|
end
|
128
|
+
|
129
|
+
class Hash
|
130
|
+
def deep_symbolize_keys
|
131
|
+
Hash[
|
132
|
+
self.map do |k, v|
|
133
|
+
v = v.deep_symbolize_keys if v.kind_of?(Hash)
|
134
|
+
[k.to_sym, v]
|
135
|
+
end
|
136
|
+
]
|
137
|
+
end
|
138
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
require 'mechanize'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class CLI < Thor
|
8
|
+
package_name "yasuri"
|
9
|
+
|
10
|
+
default_command :scrape
|
11
|
+
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
12
|
+
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
13
|
+
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
14
|
+
def scrape(uri)
|
15
|
+
# argument validations
|
16
|
+
if [options[:file], options[:json]].compact.count != 1
|
17
|
+
$stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
|
18
|
+
return -1
|
19
|
+
end
|
20
|
+
if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
|
21
|
+
$stderr.puts "ERROR: --file option require not empty argument."
|
22
|
+
return -1
|
23
|
+
end
|
24
|
+
if options[:json]&.empty? or options[:json] == "json"
|
25
|
+
$stderr.puts "ERROR: --json option require not empty argument."
|
26
|
+
return -1
|
27
|
+
end
|
28
|
+
|
29
|
+
tree = if options[:file]
|
30
|
+
src = File.read(options[:file])
|
31
|
+
|
32
|
+
begin
|
33
|
+
Yasuri.json2tree(src)
|
34
|
+
rescue
|
35
|
+
begin
|
36
|
+
Yasuri.yaml2tree(src)
|
37
|
+
rescue => e
|
38
|
+
$stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
|
39
|
+
return -1
|
40
|
+
end
|
41
|
+
end
|
42
|
+
else
|
43
|
+
begin
|
44
|
+
Yasuri.json2tree(options[:json])
|
45
|
+
rescue => e
|
46
|
+
$stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
47
|
+
return -1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
agent = Mechanize.new
|
52
|
+
root_page = agent.get(uri)
|
53
|
+
result = tree.inject(agent, root_page)
|
54
|
+
|
55
|
+
if result.instance_of?(String)
|
56
|
+
puts result
|
57
|
+
else
|
58
|
+
j result
|
59
|
+
end
|
60
|
+
|
61
|
+
return 0
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -6,21 +6,25 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
retry_count = opt[:retry_count] ||
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
16
16
|
|
17
17
|
child_results_kv = @children.map do |child_node|
|
18
|
-
child_name = Yasuri.
|
18
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
19
19
|
[child_name, child_node.inject(agent, child_page, opt)]
|
20
20
|
end
|
21
21
|
|
22
22
|
Hash[child_results_kv]
|
23
23
|
end # each named child node
|
24
24
|
end
|
25
|
-
|
26
|
-
|
25
|
+
|
26
|
+
def node_type_str
|
27
|
+
"links".freeze
|
28
|
+
end
|
29
|
+
end # class
|
30
|
+
end # module
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
module Yasuri
|
3
|
+
class MapNode
|
4
|
+
attr_reader :name, :children
|
5
|
+
|
6
|
+
def initialize(name, children, **opt)
|
7
|
+
@name = name
|
8
|
+
@children = children
|
9
|
+
@opt = opt
|
10
|
+
end
|
11
|
+
|
12
|
+
def inject(agent, page, opt = {}, element = page)
|
13
|
+
child_results_kv = @children.map do |node|
|
14
|
+
[node.name, node.inject(agent, page, opt)]
|
15
|
+
end
|
16
|
+
Hash[child_results_kv]
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_h
|
20
|
+
node_hash = {}
|
21
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
22
|
+
|
23
|
+
children.each do |child|
|
24
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
25
|
+
node_hash[child_node_name] = child.to_h
|
26
|
+
end
|
27
|
+
|
28
|
+
node_hash
|
29
|
+
end
|
30
|
+
|
31
|
+
def opts
|
32
|
+
{}
|
33
|
+
end
|
34
|
+
|
35
|
+
def node_type_str
|
36
|
+
"map".freeze
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,15 +7,36 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
-
def inject(agent, page, opt = {})
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
14
|
+
def inject(agent, page, opt = {}, element = page)
|
15
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
16
16
|
end
|
17
|
+
|
18
|
+
def to_h
|
19
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
20
|
+
|
21
|
+
node_hash = {}
|
22
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
23
|
+
|
24
|
+
node_hash[:path] = @xpath if @xpath
|
25
|
+
|
26
|
+
children.each do |child|
|
27
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
28
|
+
node_hash[child_node_name] = child.to_h
|
29
|
+
end
|
30
|
+
|
31
|
+
node_hash
|
32
|
+
end
|
33
|
+
|
17
34
|
def opts
|
18
35
|
{}
|
19
36
|
end
|
37
|
+
|
38
|
+
def node_type_str
|
39
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
40
|
+
end
|
20
41
|
end
|
21
42
|
end
|
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
|
|
6
6
|
require_relative 'yasuri_struct_node'
|
7
7
|
require_relative 'yasuri_links_node'
|
8
8
|
require_relative 'yasuri_paginate_node'
|
9
|
+
require_relative 'yasuri_map_node'
|
9
10
|
|
10
11
|
module Yasuri
|
11
12
|
class NodeGenerator
|
@@ -15,29 +16,33 @@ module Yasuri
|
|
15
16
|
@nodes
|
16
17
|
end
|
17
18
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
19
|
+
def method_missing(name, pattern=nil, **args, &block)
|
20
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
21
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
22
|
@nodes << node
|
22
23
|
end
|
23
24
|
|
24
|
-
def self.gen(
|
25
|
-
xpath, opt = *args
|
26
|
-
opt = [opt].flatten.compact
|
25
|
+
def self.gen(method_name, xpath, **opt, &block)
|
27
26
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
27
|
|
29
|
-
case
|
28
|
+
case method_name
|
30
29
|
when /^text_(.+)$/
|
31
|
-
|
30
|
+
# Todo raise error xpath is not valid
|
31
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
32
32
|
when /^struct_(.+)$/
|
33
|
-
|
33
|
+
# Todo raise error xpath is not valid
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
34
35
|
when /^links_(.+)$/
|
35
|
-
|
36
|
+
# Todo raise error xpath is not valid
|
37
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
36
38
|
when /^pages_(.+)$/
|
37
|
-
|
39
|
+
# Todo raise error xpath is not valid
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
41
|
+
when /^map_(.+)$/
|
42
|
+
Yasuri::MapNode.new($1, children, **opt)
|
38
43
|
else
|
39
44
|
nil
|
40
45
|
end
|
41
|
-
end # of self.gen(
|
46
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
42
47
|
end # of class NodeGenerator
|
43
48
|
end
|
@@ -7,24 +7,27 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], limit: nil)
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, flatten: false)
|
11
11
|
super(xpath, name, children)
|
12
|
+
@flatten = flatten
|
12
13
|
@limit = limit
|
13
14
|
end
|
14
15
|
|
15
|
-
def inject(agent, page, opt = {})
|
16
|
-
retry_count = opt[:retry_count] ||
|
16
|
+
def inject(agent, page, opt = {}, element = page)
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
|
+
|
19
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
17
20
|
|
18
21
|
child_results = []
|
19
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
20
23
|
while page
|
21
24
|
child_results_kv = @children.map do |child_node|
|
22
|
-
child_name = Yasuri.
|
25
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
23
26
|
[child_name, child_node.inject(agent, page, opt)]
|
24
27
|
end
|
25
28
|
child_results << Hash[child_results_kv]
|
26
29
|
|
27
|
-
link = page.search(@xpath).first
|
30
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
28
31
|
break if link == nil
|
29
32
|
|
30
33
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
@@ -32,10 +35,19 @@ module Yasuri
|
|
32
35
|
break if (limit -= 1) <= 0
|
33
36
|
end
|
34
37
|
|
38
|
+
if @flatten == true
|
39
|
+
return child_results.map{|h| h.values}.flatten
|
40
|
+
end
|
41
|
+
|
35
42
|
child_results
|
36
43
|
end
|
44
|
+
|
37
45
|
def opts
|
38
|
-
{limit:@limit}
|
46
|
+
{limit:@limit, flatten:@flatten}
|
47
|
+
end
|
48
|
+
|
49
|
+
def node_type_str
|
50
|
+
"pages".freeze
|
39
51
|
end
|
40
52
|
end
|
41
53
|
end
|