yasuri 2.0.12 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +87 -21
- data/USAGE.ja.md +368 -120
- data/USAGE.md +375 -125
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +86 -41
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +11 -5
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +37 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +10 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +9 -2
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +4 -9
- data/spec/yasuri_cli_spec.rb +96 -0
- data/spec/yasuri_links_node_spec.rb +34 -12
- data/spec/yasuri_map_spec.rb +75 -0
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +244 -94
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- data/yasuri.gemspec +5 -3
- metadata +52 -18
- data/app.rb +0 -52
data/examples/example.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Author:: TAC (tac@tac42.net)
#
# Example script: scrapes a GitHub user's repository listing with Yasuri and
# prints the collected data as Markdown.

require 'yasuri'
uri = "https://github.com/tac0x2a?tab=repositories"

# Build the node tree with the DSL.
# `links_repo` opens every matched link and applies the nested text nodes to
# the page behind it.
root = Yasuri.map_root do
  text_title '/html/head/title'
  links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
    text_name  '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
    text_desc  '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
    text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
    text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
  end
end

# Alternative: build the same node tree from YAML.
# src = <<-EOYML
# text_title: /html/head/title
# links_repo:
#   path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
#   text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
#   text_desc:
#     path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
#     proc: :strip
#   text_stars:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
#     proc: :to_i
#   text_forks:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
#     proc: :to_i
# EOYML
# root = Yasuri.yaml2tree(src)

# Run the scrape, passing a 100 ms interval for the requests.
scraped = root.scrape(uri, interval_ms: 100)
# jj scraped
# {
#   "title": "tac0x2a (TAC) / Repositories · GitHub",
#   "repo": [
#     {
#       "name": "o-namazu",
#       "desc": "Oh Namazu (Catfish) in datalake",
#       "stars": 1,
#       "forks": 0
#     },
#     {
#       "name": "grebe",
#       "desc": "grebe in datalake",
#       "stars": 2,
#       "forks": 0
#     },
#     {
#       "name": "yasuri",
#       "desc": "Yasuri (鑢) is easy web scraping library.",
#       "stars": 43,
#       "forks": 1
#     },
#     {
#       "name": "dotfiles",
#       "desc": "dotfiles",
#       "stars": 0,
#       "forks": 0
#     }
#     ...
#   ]
# }

# Print the result as Markdown: one section per repository.
puts "# #{scraped['title']}"
scraped['repo'].each do |repo|
  puts "-----",
       "## #{repo['name']}",
       repo['desc'],
       "",
       "* Stars: #{repo['stars']}",
       "* Forks: #{repo['forks']}",
       ""
end
|
data/examples/github.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
# yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
|
3
|
+
text_title: /html/head/title
|
4
|
+
links_repo:
|
5
|
+
path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
|
6
|
+
text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
|
7
|
+
text_desc:
|
8
|
+
path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
|
9
|
+
proc: :strip
|
10
|
+
text_stars:
|
11
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
|
12
|
+
proc: :to_i
|
13
|
+
text_forks:
|
14
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
|
15
|
+
proc: :to_i
|
data/examples/sample.yml
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# yasuri scrape "https://www.tac42.net/" -f sample.yml
|
2
|
+
links_each:
|
3
|
+
path: //*[@id="posts"]/article/header/h1/a
|
4
|
+
text_title: //*[@id="content"]/article/header/h1
|
5
|
+
text_description: /html/head/meta[12]/@content
|
6
|
+
text_date:
|
7
|
+
path: //*[@id="content"]/article/header/div/span
|
8
|
+
proc: :strip
|
9
|
+
text_length:
|
10
|
+
path: //*[@id="content"]
|
11
|
+
proc: :size
|
data/exe/yasuri
ADDED
data/lib/yasuri.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -4,28 +4,45 @@
|
|
4
4
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
require_relative 'yasuri_node'
|
9
10
|
require_relative 'yasuri_text_node'
|
10
11
|
require_relative 'yasuri_struct_node'
|
11
12
|
require_relative 'yasuri_paginate_node'
|
12
13
|
require_relative 'yasuri_links_node'
|
14
|
+
require_relative 'yasuri_map_node'
|
13
15
|
require_relative 'yasuri_node_generator'
|
14
16
|
|
15
17
|
module Yasuri
|
16
18
|
|
19
|
+
DefaultRetryCount = 5
|
20
|
+
DefaultInterval_ms = 0
|
21
|
+
|
17
22
|
# Builds a Yasuri node tree from a JSON document.
#
# @param json_string [String] JSON text describing the tree
# @return [Object] whatever Yasuri.hash2node builds from the parsed hash
# @raise [RuntimeError] if +json_string+ is nil or empty
# @raise [JSON::ParserError] if +json_string+ is not valid JSON
def self.json2tree(json_string)
  if json_string.nil? || json_string.empty?
    # A message is supplied so callers that print e.message show something useful.
    raise RuntimeError, "json_string must not be nil or empty"
  end

  # Keys are symbolized because hash2node compares against symbols such as :path.
  node_hash = JSON.parse(json_string, symbolize_names: true)
  Yasuri.hash2node(node_hash)
end
|
21
28
|
|
22
29
|
# Serializes a node tree to a JSON string.
#
# @param node [Object] root node accepted by Yasuri.node2hash
# @return [String] JSON text of the hash produced by Yasuri.node2hash
# @raise [RuntimeError] if +node+ is nil
def self.tree2json(node)
  # A message is supplied so the failure is self-explanatory when printed.
  raise RuntimeError, "node must not be nil" if node.nil?

  Yasuri.node2hash(node).to_json
end
|
25
34
|
|
26
|
-
def self.
|
27
|
-
|
28
|
-
|
35
|
+
# Builds a Yasuri node tree from a YAML document.
#
# @param yaml_string [String] YAML text describing the tree
# @return [Object] whatever Yasuri.hash2node builds from the parsed hash
# @raise [RuntimeError] if +yaml_string+ is nil or empty
def self.yaml2tree(yaml_string)
  if yaml_string.nil? || yaml_string.empty?
    # A message is supplied so callers that print e.message show something useful.
    raise RuntimeError, "yaml_string must not be nil or empty"
  end

  # NOTE(review): YAML.load is only restricted to safe types by default on
  # Psych 4+. On older Psych it can instantiate arbitrary objects, so consider
  # YAML.safe_load with Symbol permitted if the YAML may come from an
  # untrusted source.
  node_hash = YAML.load(yaml_string)

  # hash2node compares keys against symbols, so string keys are converted first.
  Yasuri.hash2node(node_hash.deep_symbolize_keys)
end
|
41
|
+
|
42
|
+
private
|
43
|
+
# Routes unknown module-level calls to Yasuri::NodeGenerator.gen, passing
# along the method name, pattern, options and block.
#
# When the generator returns nil or false the call is handed to the default
# method_missing, with only the method name and options forwarded.
def self.method_missing(method_name, pattern=nil, **opt, &block)
  Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block) || super(method_name, **opt)
end
|
30
47
|
|
31
48
|
private
|
@@ -33,61 +50,78 @@ module Yasuri
|
|
33
50
|
text: Yasuri::TextNode,
|
34
51
|
struct: Yasuri::StructNode,
|
35
52
|
links: Yasuri::LinksNode,
|
36
|
-
pages: Yasuri::PaginateNode
|
53
|
+
pages: Yasuri::PaginateNode,
|
54
|
+
map: Yasuri::MapNode
|
37
55
|
}
|
38
|
-
Node2Text = Text2Node.invert
|
39
56
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
57
|
+
# Recursively converts a parsed tree description into node objects.
#
# Each entry of +node_hash+ is interpreted by its key:
# * "<type>_<name>", where <type> is a key of Text2Node: a child node of that
#   type named <name>, built recursively from the entry's value;
# * :path: the path handed to the node's constructor;
# * anything else: collected and forwarded to the constructor as an option.
#
# A String given instead of a Hash is used directly as the path.
#
# @param node_hash [Hash, String] description of this node
# @param node_name [String, nil] name of this node; nil for the root
# @param node_type_class [Class, nil] class to instantiate; nil builds a Yasuri::MapNode
# @return [Object] the built node; for the root, its only child when exactly one child exists
# @raise [RuntimeError] if called for the root with an empty description
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  if node_name.nil? && node_hash.empty?
    raise RuntimeError, "node tree description must not be empty"
  end

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    path = node_hash
  else
    # Built once per call: alternatives are tried in Text2Node key order,
    # which matches testing one regexp per node type in that order.
    node_key_pattern = /^(#{Text2Node.keys.join('|')})_(.+)$/

    node_hash.each do |key, value|
      matched = node_key_pattern.match(key)

      if matched
        child_node_type = Text2Node[matched[1].to_sym]
        child_nodes << hash2node(value, matched[2], child_node_type)
      elsif key == :path
        path = value
      else
        opt[key] = value
      end
    end
  end

  # If only a single node sits under the root, return that node itself.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  # No explicit type means this is a grouping level: wrap the children in a MapNode.
  return Yasuri::MapNode.new(node_name, child_nodes, **opt) if node_type_class.nil?

  node_type_class.new(path, node_name, child_nodes, **opt)
end
|
59
102
|
|
60
103
|
# Converts a node into its hash representation.
#
# A node that is exactly a Yasuri::MapNode is represented by its own to_h.
# Any other node is wrapped under a single "<type>_<name>" key built from
# node_type_str and name.
def self.node2hash(node)
  if node.instance_of?(Yasuri::MapNode)
    node.to_h
  else
    wrapper_key = "#{node.node_type_str}_#{node.name}"
    { wrapper_key => node.to_h }
  end
end
|
80
110
|
|
81
|
-
def self.
|
111
|
+
# Returns +name+ as a Symbol when opt[:symbolize_names] is truthy,
# otherwise returns +name+ unchanged.
def self.node_name(name, opt)
  return name unless opt[:symbolize_names]

  name.to_sym
end
|
84
115
|
|
85
|
-
def self.with_retry(
|
116
|
+
def self.with_retry(
|
117
|
+
retry_count = DefaultRetryCount,
|
118
|
+
interval_ms = DefaultInterval_ms)
|
119
|
+
|
86
120
|
begin
|
121
|
+
Kernel.sleep(interval_ms * 0.001)
|
87
122
|
return yield() if block_given?
|
88
123
|
rescue => e
|
89
124
|
if retry_count > 0
|
90
|
-
pp "retry #{retry_count}"
|
91
125
|
retry_count -= 1
|
92
126
|
retry
|
93
127
|
end
|
@@ -95,3 +129,14 @@ module Yasuri
|
|
95
129
|
end
|
96
130
|
end
|
97
131
|
end
|
132
|
+
|
133
|
+
# Core extension: adds Hash#deep_symbolize_keys.
class Hash
  # Returns a new Hash whose keys are converted to Symbols, recursing into
  # nested hashes, including hashes contained in arrays. The receiver is not
  # modified.
  #
  # Keys that do not respond to +to_sym+ (for example Integer keys produced by
  # YAML) are kept as they are instead of raising NoMethodError.
  #
  # @return [Hash] converted copy
  def deep_symbolize_keys
    # Values need their own recursion so hashes nested in arrays are reached.
    convert_value = lambda do |value|
      case value
      when Hash  then value.deep_symbolize_keys
      when Array then value.map(&convert_value)
      else value
      end
    end

    each_with_object({}) do |(key, value), result|
      new_key = key.respond_to?(:to_sym) ? key.to_sym : key
      result[new_key] = convert_value.call(value)
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
|
5
|
+
module Yasuri
  # Command-line front end built on Thor. `scrape` is the default command.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
    # Scrapes +uri+ with the tree given by --file or --json and prints the result.
    #
    # A String result is printed as is; any other result is printed as JSON.
    #
    # @param uri [String] address to scrape
    # @return [Integer] 0 on success, -1 when the options are invalid or the
    #   tree cannot be built (the reason is written to $stderr)
    def scrape(uri)
      file_opt = options[:file]
      json_opt = options[:json]

      # Exactly one source for the tree must be given.
      if [file_opt, json_opt].compact.count != 1
        $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
        return -1
      end

      # NOTE(review): the comparisons with "file" / "json" presumably catch the
      # value Thor supplies when the flag is given without an argument - confirm.
      # Each check looks only at its own option so the error names the right flag.
      if file_opt&.empty? || file_opt == "file"
        $stderr.puts "ERROR: --file option require not empty argument."
        return -1
      end
      if json_opt&.empty? || json_opt == "json"
        $stderr.puts "ERROR: --json option require not empty argument."
        return -1
      end

      interval_ms = options[:interval] || Yasuri::DefaultInterval_ms

      tree = if file_opt
        src = File.read(file_opt)

        # The file may hold either format: try JSON first, then fall back to YAML.
        begin
          Yasuri.json2tree(src)
        rescue
          begin
            Yasuri.yaml2tree(src)
          rescue => e
            $stderr.puts "ERROR: Failed to convert to yasuri tree `#{file_opt}`. #{e.message}"
            return -1
          end
        end
      else
        begin
          Yasuri.json2tree(json_opt)
        rescue => e
          $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
          return -1
        end
      end

      result = tree.scrape(uri, interval_ms: interval_ms)

      if result.instance_of?(String)
        puts result
      else
        j result
      end

      return 0
    end
  end
end
|
@@ -6,21 +6,27 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
  # Node that follows every link matched by its XPath and applies its child
  # nodes to each linked page.
  class LinksNode
    include Node

    # Clicks each element found by @xpath under +element+ and collects the
    # children's results for the page that opens.
    #
    # Recognized entries of +opt+: :retry_count and :interval_ms, which fall
    # back to Yasuri::DefaultRetryCount and Yasuri::DefaultInterval_ms. The
    # whole +opt+ hash is also passed on to Yasuri.node_name and to each child.
    #
    # @return [Array<Hash>] one hash per link, keyed by child node name
    def inject(agent, page, opt = {}, element = page)
      retries  = opt[:retry_count] || Yasuri::DefaultRetryCount
      interval = opt[:interval_ms] || Yasuri::DefaultInterval_ms

      anchors = element.search(@xpath) || [] # links expected
      anchors.map do |anchor|
        clickable = Mechanize::Page::Link.new(anchor, agent, page)
        linked_page = Yasuri.with_retry(retries, interval) { clickable.click }

        # Build one result entry per named child node.
        pairs = @children.map do |child|
          [Yasuri.node_name(child.name, opt), child.inject(agent, linked_page, opt)]
        end
        Hash[pairs]
      end
    end

    # Type prefix used for this node in "<type>_<name>" keys.
    def node_type_str
      "links".freeze
    end
  end # class
end # module
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
module Yasuri
  # Node that groups named child nodes and returns their results as one hash.
  # It has no path of its own; every child is applied to the same page.
  class MapNode
    include Node
    attr_reader :name, :children

    # @param name [String, nil] name of this node
    # @param children [Array] child nodes
    # @param opt [Hash] stored in @opt; not read by any method of this class
    def initialize(name, children, **opt)
      @name = name
      @children = children
      @opt = opt
    end

    # Applies each child to +page+ and returns a hash of child name => result.
    #
    # NOTE(review): keys are the raw child names; unlike LinksNode they are not
    # passed through Yasuri.node_name, so opt[:symbolize_names] does not affect
    # them - confirm this is intended.
    def inject(agent, page, opt = {}, element = page)
      @children.each_with_object({}) do |child, results|
        results[child.name] = child.inject(agent, page, opt)
      end
    end

    # Hash representation: non-nil entries of +opts+ followed by one
    # "<type>_<name>" entry per child holding that child's own to_h.
    def to_h
      serialized = opts.reject { |_key, value| value.nil? }

      children.each_with_object(serialized) do |child, acc|
        acc["#{child.node_type_str}_#{child.name}"] = child.to_h
      end
    end

    # Always an empty hash; options given to the constructor are not exposed here.
    def opts
      {}
    end

    # Type prefix used for this node in "<type>_<name>" keys.
    def node_type_str
      "map".freeze
    end
  end
end
|