yasuri 2.0.11 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +88 -19
- data/USAGE.ja.md +325 -63
- data/USAGE.md +335 -69
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +80 -39
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +39 -0
- data/lib/yasuri/yasuri_node.rb +24 -3
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +18 -6
- data/lib/yasuri/yasuri_struct_node.rb +8 -4
- data/lib/yasuri/yasuri_text_node.rb +11 -4
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/spec_helper.rb +1 -6
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_map_spec.rb +76 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +199 -84
- data/spec/yasuri_struct_node_spec.rb +42 -1
- data/yasuri.gemspec +5 -3
- metadata +52 -19
data/exe/yasuri
ADDED
data/lib/yasuri.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -4,28 +4,44 @@
|
|
4
4
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
require_relative 'yasuri_node'
|
9
10
|
require_relative 'yasuri_text_node'
|
10
11
|
require_relative 'yasuri_struct_node'
|
11
12
|
require_relative 'yasuri_paginate_node'
|
12
13
|
require_relative 'yasuri_links_node'
|
14
|
+
require_relative 'yasuri_map_node'
|
13
15
|
require_relative 'yasuri_node_generator'
|
14
16
|
|
15
17
|
module Yasuri
|
16
18
|
|
19
|
+
DefaultRetryCount = 5
|
20
|
+
|
17
21
|
# Parses a JSON tree definition and converts it into a node tree.
#
# @param json_string [String] tree definition in Yasuri's JSON format
# @return [Object] whatever Yasuri.hash2node builds from the parsed hash
# @raise [RuntimeError] when json_string is nil or empty
def self.json2tree(json_string)
  raise RuntimeError if json_string.nil? || json_string.empty?

  # Keys are symbolized so they can be matched against node-type prefixes
  # and forwarded as keyword options by hash2node.
  parsed = JSON.parse(json_string, symbolize_names: true)
  Yasuri.hash2node(parsed)
end
|
21
27
|
|
22
28
|
# Serializes a node tree into its JSON tree definition.
#
# @param node [Object] root node of the tree
# @return [String] JSON produced from Yasuri.node2hash(node)
# @raise [RuntimeError] when node is nil
def self.tree2json(node)
  if node.nil?
    raise RuntimeError
  end

  tree_hash = Yasuri.node2hash(node)
  tree_hash.to_json
end
|
25
33
|
|
26
|
-
def self.
|
27
|
-
|
28
|
-
|
34
|
+
# Parses a YAML tree definition and converts it into a node tree.
#
# @param yaml_string [String] tree definition in Yasuri's YAML format
# @return [Object] whatever Yasuri.hash2node builds from the parsed hash
# @raise [RuntimeError] when yaml_string is nil or empty, or when the
#   document is not a mapping (a scalar or a sequence cannot describe a tree)
def self.yaml2tree(yaml_string)
  raise RuntimeError if yaml_string.nil? || yaml_string.empty?

  # NOTE(review): YAML.load is not safe for untrusted input on Psych
  # versions before 4 (it can instantiate arbitrary objects). The CLI feeds
  # user-supplied files through here; consider YAML.safe_load once the
  # minimum supported Ruby/Psych version is confirmed.
  node_hash = YAML.load(yaml_string)

  # Fail with a readable message instead of a NoMethodError from
  # deep_symbolize_keys when the document is not a mapping.
  unless node_hash.is_a?(Hash)
    raise RuntimeError, "YAML tree must be a mapping, got #{node_hash.class}"
  end

  Yasuri.hash2node(node_hash.deep_symbolize_keys)
end
|
40
|
+
|
41
|
+
private
|
42
|
+
# DSL entry point: a call such as `Yasuri.text_title '/html/head/title'`
# is turned into a node by Yasuri::NodeGenerator.gen.
#
# @param method_name [Symbol] called name, expected as `<node type>_<node name>`
# @param pattern [String, nil] first positional argument, forwarded to the generator
# @param opt [Hash] keyword options forwarded to the generator
# @return [Object] the generated node
# @raise [NoMethodError] when the generator does not recognise method_name
def self.method_missing(method_name, pattern=nil, **opt, &block)
  generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  # Bare `super` forwards the original arguments and block unchanged, so
  # the resulting NoMethodError describes the real call.
  generated || super
end

# Keeps `respond_to?` consistent with method_missing for the node-type
# prefixes handled by the generator (text_, struct_, links_, pages_, map_).
def self.respond_to_missing?(method_name, include_private = false)
  method_name.to_s.match?(/^(?:text|struct|links|pages|map)_.+$/) || super
end
|
30
46
|
|
31
47
|
private
|
@@ -33,52 +49,66 @@ module Yasuri
|
|
33
49
|
text: Yasuri::TextNode,
|
34
50
|
struct: Yasuri::StructNode,
|
35
51
|
links: Yasuri::LinksNode,
|
36
|
-
pages: Yasuri::PaginateNode
|
52
|
+
pages: Yasuri::PaginateNode,
|
53
|
+
map: Yasuri::MapNode
|
37
54
|
}
|
38
|
-
Node2Text = Text2Node.invert
|
39
55
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
56
|
+
# Recursively converts a parsed tree definition into nodes.
#
# A String value is a bare path. In a Hash, keys shaped like
# `<node type>_<name>` (types are the keys of Text2Node) become child nodes,
# `:path` supplies the node's path, and every other key is forwarded to the
# node constructor as a keyword option.
#
# @param node_hash [Hash, String] definition of this node
# @param node_name [String, nil] name of this node; nil for the root
# @param node_type_class [Class, nil] class to instantiate; nil for the root
# @return [Object] the built node; for a root with exactly one child, that child
# @raise [RuntimeError] when the root definition is empty
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  raise RuntimeError, "Tree definition is empty." if node_name.nil? && node_hash.empty?

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    path = node_hash
  else
    # Built once per call instead of once per key.
    node_pattern = /^(#{Text2Node.keys.join('|')})_(.+)$/

    node_hash.each do |key, value|
      matched = node_pattern.match(key.to_s)
      if matched
        child_type = Text2Node[matched[1].to_sym]
        child_nodes << hash2node(value, matched[2], child_type)
      elsif key == :path
        path = value
      else
        opt[key] = value
      end
    end
  end

  # If only single node under root, return only the node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  # MapNode has no path: its constructor is (name, children, **opt), so a
  # nested `map_*` entry must not go through the (path, name, children) form.
  if node_type_class.nil? || node_type_class == Yasuri::MapNode
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
|
59
101
|
|
60
102
|
# Converts a node into the Hash form used for JSON serialization.
#
# A MapNode is returned as its own to_h; any other node is wrapped in a
# single-entry Hash keyed by "<node type>_<node name>".
#
# @param node [Object] node responding to to_h, node_type_str and name
# @return [Hash]
def self.node2hash(node)
  if node.instance_of?(Yasuri::MapNode)
    node.to_h
  else
    wrapper_key = "#{node.node_type_str}_#{node.name}"
    { wrapper_key => node.to_h }
  end
end
|
80
109
|
|
81
|
-
def self.
|
110
|
+
# Returns the key under which a node's result is stored.
#
# @param name [String] node name
# @param opt [Hash] scraping options; `:symbolize_names` selects Symbol keys
# @return [String, Symbol] name.to_sym when opt[:symbolize_names] is truthy, otherwise name
def self.node_name(name, opt)
  return name.to_sym if opt[:symbolize_names]

  name
end
|
84
114
|
|
@@ -95,3 +125,14 @@ module Yasuri
|
|
95
125
|
end
|
96
126
|
end
|
97
127
|
end
|
128
|
+
|
129
|
+
# Core-class extension used by Yasuri.yaml2tree.
#
# NOTE(review): this reopens Hash and therefore replaces any same-named
# method loaded earlier (ActiveSupport defines one). The implementation
# below follows the same semantics so that replacing it is harmless:
# hashes nested inside arrays are converted too, and keys that cannot be
# converted with to_sym are kept as they are.
class Hash
  # Returns a new Hash with all keys converted to symbols, recursively.
  #
  # @return [Hash] a new plain Hash; the receiver is not modified
  def deep_symbolize_keys
    convert = lambda do |value|
      case value
      when Hash then value.deep_symbolize_keys
      when Array then value.map(&convert)
      else value
      end
    end

    each_with_object({}) do |(key, value), result|
      symbol_key = key.respond_to?(:to_sym) ? key.to_sym : key
      result[symbol_key] = convert.call(value)
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
require 'mechanize'
|
5
|
+
|
6
|
+
module Yasuri
  # Command line front-end: fetches a URI, applies a tree given as a JSON
  # string or as a JSON/YAML file, and prints the scraped result.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    # Scrapes +uri+ with the tree supplied through --file or --json.
    #
    # @param uri [String] address of the page to fetch
    # @return [Integer] 0 on success, -1 when the options or the tree are invalid
    def scrape(uri)
      # argument validations
      if [options[:file], options[:json]].compact.count != 1
        $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
        return -1
      end
      # The option's own name as value presumably means the flag was given
      # without an argument - confirm against Thor's option parsing.
      if options[:file]&.empty? || options[:file] == "file"
        $stderr.puts "ERROR: --file option require not empty argument."
        return -1
      end
      if options[:json]&.empty? || options[:json] == "json"
        $stderr.puts "ERROR: --json option require not empty argument."
        return -1
      end

      tree = load_tree
      return -1 if tree.nil?

      agent = Mechanize.new
      root_page = agent.get(uri)
      result = tree.inject(agent, root_page)

      # A bare string is printed as is; anything else is printed as JSON.
      if result.instance_of?(String)
        puts result
      else
        j result
      end

      return 0
    end

    private

    # Builds the tree from whichever of --file / --json was given.
    # Prints an error to stderr and returns nil when that fails.
    def load_tree
      unless options[:file]
        begin
          return Yasuri.json2tree(options[:json])
        rescue => e
          $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
          return nil
        end
      end

      begin
        src = File.read(options[:file])
      rescue SystemCallError => e
        # Missing or unreadable file: report it like the other input errors
        # instead of letting the exception escape.
        $stderr.puts "ERROR: Failed to read `#{options[:file]}`. #{e.message}"
        return nil
      end

      # The file may hold either format: try JSON first, then fall back to YAML.
      begin
        Yasuri.json2tree(src)
      rescue
        begin
          Yasuri.yaml2tree(src)
        rescue => e
          $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
          nil
        end
      end
    end
  end
end
|
@@ -6,21 +6,25 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
  # Node that follows every link found by its xpath and applies its child
  # nodes to each linked page.
  class LinksNode
    include Node

    # @param agent [Object] Mechanize agent used to click the links
    # @param page [Object] page the links belong to
    # @param opt [Hash] scraping options; `:retry_count` and `:symbolize_names` are read here
    # @param element [Object] element searched for links; defaults to the page
    # @return [Array<Hash>] one Hash of child results per link found
    def inject(agent, page, opt = {}, element = page)
      retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
      anchors = element.search(@xpath) || [] # links expected

      anchors.map do |anchor|
        clickable = Mechanize::Page::Link.new(anchor, agent, page)
        linked_page = Yasuri.with_retry(retry_count) { clickable.click }

        # Children are evaluated against the linked page, not the current one.
        @children.each_with_object({}) do |child, scraped|
          scraped[Yasuri.node_name(child.name, opt)] = child.inject(agent, linked_page, opt)
        end
      end
    end

    # Prefix identifying this node type in tree definitions.
    def node_type_str
      "links".freeze
    end
  end # class
end # module
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
module Yasuri
  # Node without a path of its own: it groups child nodes and returns their
  # results as a Hash keyed by child name.
  class MapNode
    attr_reader :name, :children

    # @param name [String, nil] node name
    # @param children [Array, nil] child nodes; nil is treated as no children
    # @param opt [Hash] extra options
    def initialize(name, children, **opt)
      @name = name
      @children = children || []
      # NOTE(review): stored but not read anywhere in this class; #opts
      # returns an empty Hash, so these options are not serialized by #to_h.
      @opt = opt
    end

    # Applies every child to the same page/element.
    #
    # @param agent [Object] passed through to the children
    # @param page [Object] passed through to the children
    # @param opt [Hash] scraping options; `:symbolize_names` selects Symbol keys
    # @param element [Object] passed through to the children so they search
    #   the same element this node was given; defaults to the page
    # @return [Hash] child name => child result
    def inject(agent, page, opt = {}, element = page)
      @children.each_with_object({}) do |node, results|
        results[Yasuri.node_name(node.name, opt)] = node.inject(agent, page, opt, element)
      end
    end

    # @return [Hash] non-nil options followed by one
    #   "<child type>_<child name>" entry per child
    def to_h
      node_hash = {}
      self.opts.each{|k, v| node_hash[k] = v if not v.nil?}

      children.each do |child|
        child_node_name = "#{child.node_type_str}_#{child.name}"
        node_hash[child_node_name] = child.to_h
      end

      node_hash
    end

    # @return [Hash] always empty for this node type
    def opts
      {}
    end

    # Prefix identifying this node type in tree definitions.
    def node_type_str
      "map".freeze
    end
  end
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,15 +7,36 @@ module Yasuri
|
|
7
7
|
# Behaviour shared by the path-based node classes. Including classes are
# expected to override #inject and #node_type_str, and may override #opts.
module Node
  attr_reader :url, :xpath, :name, :children

  # @param xpath [String, nil] path of the node
  # @param name [String] node name
  # @param children [Array] child nodes
  # @param opt [Hash] accepted but not used here
  def initialize(xpath, name, children = [], **opt)
    @xpath = xpath
    @name = name
    @children = children
  end

  # Must be overridden by the including class.
  def inject(agent, page, opt = {}, element = page)
    fail "#{Kernel.__method__} is not implemented in included class."
  end

  # Serializable form of the node.
  #
  # @return [String, Hash] the bare path when the node has a path, no
  #   children and no non-nil option; otherwise a Hash holding the non-nil
  #   options, then :path (when set), then one "<type>_<name>" entry per child
  def to_h
    return @xpath if @xpath && @children.empty? && opts.values.compact.empty?

    node_hash = opts.reject { |_key, value| value.nil? }
    node_hash[:path] = @xpath if @xpath

    children.each_with_object(node_hash) do |child, hash|
      hash["#{child.node_type_str}_#{child.name}"] = child.to_h
    end
  end

  # Options of the node; none by default.
  def opts
    {}
  end

  # Must be overridden by the including class.
  def node_type_str
    fail "#{Kernel.__method__} is not implemented in included class."
  end
end
|
21
42
|
end
|
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
|
|
6
6
|
require_relative 'yasuri_struct_node'
|
7
7
|
require_relative 'yasuri_links_node'
|
8
8
|
require_relative 'yasuri_paginate_node'
|
9
|
+
require_relative 'yasuri_map_node'
|
9
10
|
|
10
11
|
module Yasuri
|
11
12
|
class NodeGenerator
|
@@ -15,29 +16,33 @@ module Yasuri
|
|
15
16
|
@nodes
|
16
17
|
end
|
17
18
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
19
|
+
# Turns a DSL call such as `text_title '/path'` into a node and collects it.
#
# @param name [Symbol] called name, handed to NodeGenerator.gen
# @param pattern [String, nil] first positional argument of the call
# @param args [Hash] keyword options of the call
# @return [Array] the collected nodes, including the new one
# @raise [RuntimeError] when NodeGenerator.gen returns nil for name
def method_missing(name, pattern=nil, **args, &block)
  generated = NodeGenerator.gen(name, pattern, **args, &block)
  raise "Undefined Node Name '#{name}'" if generated.nil?

  @nodes << generated
end
|
23
24
|
|
24
|
-
def self.gen(
|
25
|
-
xpath, opt = *args
|
26
|
-
opt = [opt].flatten.compact
|
25
|
+
# Builds the node selected by the prefix of method_name.
#
# @param method_name [Symbol, String] `<node type>_<node name>`
# @param xpath [String, nil] path handed to path-based nodes (unused for map)
# @param opt [Hash] keyword options forwarded to the node constructor
# @param block [Proc] optional DSL block describing the child nodes
# @return [Object, nil] the new node, or nil when the prefix is unknown
def self.gen(method_name, xpath, **opt, &block)
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
  # Every node type gets an Array, including map nodes declared without a block.
  children ||= []

  case method_name
  when /^text_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::TextNode.new(xpath, Regexp.last_match(1), children, **opt)
  when /^struct_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::StructNode.new(xpath, Regexp.last_match(1), children, **opt)
  when /^links_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::LinksNode.new(xpath, Regexp.last_match(1), children, **opt)
  when /^pages_(.+)$/
    # Todo raise error xpath is not valid
    Yasuri::PaginateNode.new(xpath, Regexp.last_match(1), children, **opt)
  when /^map_(.+)$/
    # Map nodes have no path: the constructor takes (name, children, **opt).
    Yasuri::MapNode.new(Regexp.last_match(1), children, **opt)
  else
    nil
  end
end # of self.gen(method_name, xpath, **opt, &block)
|
42
47
|
end # of class NodeGenerator
|
43
48
|
end
|
@@ -7,24 +7,27 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], limit: nil)
|
10
|
+
# @param xpath [String] path of the link leading to the next page
# @param name [String] node name
# @param children [Array] child nodes applied to every visited page
# @param limit [Integer, nil] maximum number of pages; nil means no limit
# @param flatten [Boolean] when true, #inject returns one flat list of values
def initialize(xpath, name, children = [], limit: nil, flatten: false)
  super(xpath, name, children)
  @limit, @flatten = limit, flatten
end
|
14
15
|
|
15
|
-
def inject(agent, page, opt = {})
|
16
|
-
retry_count = opt[:retry_count] ||
|
16
|
+
def inject(agent, page, opt = {}, element = page)
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
|
+
|
19
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
17
20
|
|
18
21
|
child_results = []
|
19
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
20
23
|
while page
|
21
24
|
child_results_kv = @children.map do |child_node|
|
22
|
-
child_name = Yasuri.
|
25
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
23
26
|
[child_name, child_node.inject(agent, page, opt)]
|
24
27
|
end
|
25
28
|
child_results << Hash[child_results_kv]
|
26
29
|
|
27
|
-
link = page.search(@xpath).first
|
30
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
28
31
|
break if link == nil
|
29
32
|
|
30
33
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
@@ -32,10 +35,19 @@ module Yasuri
|
|
32
35
|
break if (limit -= 1) <= 0
|
33
36
|
end
|
34
37
|
|
38
|
+
if @flatten == true
|
39
|
+
return child_results.map{|h| h.values}.flatten
|
40
|
+
end
|
41
|
+
|
35
42
|
child_results
|
36
43
|
end
|
44
|
+
|
37
45
|
# Options of this node, as included in its serialized form.
#
# @return [Hash] the :limit and :flatten settings
def opts
  { limit: @limit, flatten: @flatten }
end
|
48
|
+
|
49
|
+
# Prefix identifying this node type in tree definitions.
#
# @return [String] frozen "pages"
def node_type_str
  'pages'.freeze
end
|
40
52
|
end
|
41
53
|
end
|