yasuri 3.0.0 → 3.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +70 -27
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -76
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +14 -9
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +9 -7
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -6
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +82 -58
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +105 -15
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
data/examples/example.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yasuri'
|
4
|
+
uri = "https://github.com/tac0x2a?tab=repositories"
|
5
|
+
|
6
|
+
# Construct the node tree with the DSL
|
7
|
+
root = Yasuri.map_root do
|
8
|
+
text_title '/html/head/title'
|
9
|
+
links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
|
10
|
+
text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
|
11
|
+
text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
|
12
|
+
text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
|
13
|
+
text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# Construct the node tree from YAML
|
18
|
+
# src = <<-EOYML
|
19
|
+
# text_title: /html/head/title
|
20
|
+
# links_repo:
|
21
|
+
# path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
|
22
|
+
# text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
|
23
|
+
# text_desc:
|
24
|
+
# path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
|
25
|
+
# proc: :strip
|
26
|
+
# text_stars:
|
27
|
+
# path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
|
28
|
+
# proc: :to_i
|
29
|
+
# text_forks:
|
30
|
+
# path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
|
31
|
+
# proc: :to_i
|
32
|
+
# EOYML
|
33
|
+
# root = Yasuri.yaml2tree(src)
|
34
|
+
|
35
|
+
contents = root.scrape(uri, interval_ms: 100)
|
36
|
+
# jj contents
|
37
|
+
# {
|
38
|
+
# "title": "tac0x2a (TAC) / Repositories · GitHub",
|
39
|
+
# "repo": [
|
40
|
+
# {
|
41
|
+
# "name": "o-namazu",
|
42
|
+
# "desc": "Oh Namazu (Catfish) in datalake",
|
43
|
+
# "stars": 1,
|
44
|
+
# "forks": 0
|
45
|
+
# },
|
46
|
+
# {
|
47
|
+
# "name": "grebe",
|
48
|
+
# "desc": "grebe in datalake",
|
49
|
+
# "stars": 2,
|
50
|
+
# "forks": 0
|
51
|
+
# },
|
52
|
+
# {
|
53
|
+
# "name": "yasuri",
|
54
|
+
# "desc": "Yasuri (鑢) is easy web scraping library.",
|
55
|
+
# "stars": 43,
|
56
|
+
# "forks": 1
|
57
|
+
# },
|
58
|
+
# {
|
59
|
+
# "name": "dotfiles",
|
60
|
+
# "desc": "dotfiles",
|
61
|
+
# "stars": 0,
|
62
|
+
# "forks": 0
|
63
|
+
# }
|
64
|
+
# ...
|
65
|
+
# ]
|
66
|
+
# }
|
67
|
+
|
68
|
+
# Output as markdown
|
69
|
+
puts "# #{contents['title']}"
|
70
|
+
contents['repo'].each do |h|
|
71
|
+
puts "-----"
|
72
|
+
puts "## #{h['name']}"
|
73
|
+
puts h['desc']
|
74
|
+
puts ""
|
75
|
+
puts "* Stars: #{h['stars']}"
|
76
|
+
puts "* Forks: #{h['forks']}"
|
77
|
+
puts ""
|
78
|
+
end
|
data/examples/github.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
# yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
|
3
|
+
text_title: /html/head/title
|
4
|
+
links_repo:
|
5
|
+
path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
|
6
|
+
text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
|
7
|
+
text_desc:
|
8
|
+
path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
|
9
|
+
proc: :strip
|
10
|
+
text_stars:
|
11
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
|
12
|
+
proc: :to_i
|
13
|
+
text_forks:
|
14
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
|
15
|
+
proc: :to_i
|
data/examples/sample.yml
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# yasuri scrape "https://www.tac42.net/" -f sample.yml
|
2
|
+
links_each:
|
3
|
+
path: //*[@id="posts"]/article/header/h1/a
|
4
|
+
text_title: //*[@id="content"]/article/header/h1
|
5
|
+
text_description: /html/head/meta[12]/@content
|
6
|
+
text_date:
|
7
|
+
path: //*[@id="content"]/article/header/div/span
|
8
|
+
proc: :strip
|
9
|
+
text_length:
|
10
|
+
path: //*[@id="content"]
|
11
|
+
proc: :size
|
data/exe/yasuri
ADDED
data/lib/yasuri.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -1,6 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
1
|
|
5
2
|
require 'mechanize'
|
6
3
|
require 'json'
|
@@ -11,119 +8,142 @@ require_relative 'yasuri_text_node'
|
|
11
8
|
require_relative 'yasuri_struct_node'
|
12
9
|
require_relative 'yasuri_paginate_node'
|
13
10
|
require_relative 'yasuri_links_node'
|
11
|
+
require_relative 'yasuri_map_node'
|
14
12
|
require_relative 'yasuri_node_generator'
|
15
13
|
|
16
14
|
module Yasuri
|
17
15
|
|
16
|
+
DefaultRetryCount = 5
|
17
|
+
DefaultInterval_ms = 0
|
18
|
+
|
18
19
|
# Builds a node tree from a Yasuri-format JSON string.
#
# @param json_string [String] JSON text describing the tree
# @return whatever hash2node builds from the parsed hash
# @raise [RuntimeError] if json_string is nil or empty
# @raise [JSON::ParserError] if json_string is not valid JSON
def self.json2tree(json_string)
  # The message matters: callers such as the CLI print e.message to the user.
  raise RuntimeError, "json_string must not be nil or empty" if json_string.nil? || json_string.empty?

  # Keys are symbolized while parsing.
  node_hash = JSON.parse(json_string, symbolize_names: true)
  self.hash2node(node_hash)
end
|
26
25
|
|
27
26
|
# Builds a node tree from a Yasuri-format YAML string.
#
# @param yaml_string [String] YAML text describing the tree
# @return whatever hash2node builds from the parsed hash
# @raise [RuntimeError] if yaml_string is nil or empty
def self.yaml2tree(yaml_string)
  raise RuntimeError, "yaml_string must not be nil or empty" if yaml_string.nil? || yaml_string.empty?

  # Permitted classes must be passed by keyword: the positional form is
  # rejected by Psych 4. Symbol is allowed so values like `:strip` load.
  node_hash = YAML.safe_load(yaml_string, permitted_classes: [Symbol], symbolize_names: true)
  self.hash2node(node_hash.deep_symbolize_keys)
end
|
32
|
+
|
33
|
+
# Serializes a node tree to a JSON string.
#
# @param node the root node to serialize
# @return [String] JSON text of the hash produced by node2hash
# @raise [RuntimeError] if node is nil
def self.tree2json(node)
  if node.nil?
    raise RuntimeError
  end

  tree_hash = self.node2hash(node)
  tree_hash.to_json
end
|
32
38
|
|
33
|
-
|
34
|
-
|
39
|
+
# Runs the given block, retrying it when it raises a StandardError.
#
# Every attempt, including the first, is preceded by a sleep of interval_ms
# milliseconds. Once retry_count retries are used up the last error is re-raised.
#
# @param retry_count [Integer] number of retries allowed after the first attempt
# @param interval_ms [Numeric] pause before each attempt, in milliseconds
# @return the block's value, or nil when no block is given
def self.with_retry(retry_count = DefaultRetryCount, interval_ms = DefaultInterval_ms)
  remaining = retry_count

  begin
    Kernel.sleep(interval_ms * 0.001)
    return yield if block_given?
  rescue StandardError => error
    raise error unless remaining > 0

    remaining -= 1
    retry
  end
end
|
35
54
|
|
36
|
-
|
55
|
+
# Returns the key under which a node's result is stored.
#
# @param name the node name
# @param opt [Hash] options; a truthy :symbolize_names converts name with to_sym
# @return name.to_sym when opt[:symbolize_names] is truthy, otherwise name unchanged
def self.node_name(name, opt)
  return name.to_sym if opt[:symbolize_names]

  name
end
|
38
59
|
|
39
|
-
private
|
40
|
-
def self.yaml2tree_sub(name, body)
|
41
|
-
return nil if name.nil? or body.nil?
|
60
|
+
# private
|
42
61
|
|
43
|
-
|
44
|
-
|
45
|
-
body = new_body
|
62
|
+
# Converts a parsed tree description into a node.
#
# @param node_hash [Hash, String] a String is used directly as the node's path;
#   anything else is handed to hash2child_node to extract children, options and path
# @param node_name name of the node being built; nil marks the root call
# @param node_type_class class to instantiate; nil builds a Yasuri::MapNode
# @return the built node, or the only child when the root has exactly one child
# @raise [RuntimeError] if the root description is nil or empty
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  # A nil root (for example the JSON document `null`) is reported like an empty one.
  raise RuntimeError, "node tree is empty" if node_name.nil? && (node_hash.nil? || node_hash.empty?)

  if node_hash.is_a?(String)
    child_nodes = []
    opt = {}
    path = node_hash
  else
    child_nodes, opt, path = self.hash2child_node(node_hash)
  end

  # If only single node under root, return only the node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  # MapNode is built without a path; every other node type receives one.
  return Yasuri::MapNode.new(node_name, child_nodes, **opt) if node_type_class.nil?

  node_type_class.new(path, node_name, child_nodes, **opt)
end
|
61
86
|
|
62
|
-
private
|
63
87
|
Text2Node = {
|
64
88
|
text: Yasuri::TextNode,
|
65
89
|
struct: Yasuri::StructNode,
|
66
90
|
links: Yasuri::LinksNode,
|
67
|
-
pages: Yasuri::PaginateNode
|
91
|
+
pages: Yasuri::PaginateNode,
|
92
|
+
map: Yasuri::MapNode
|
68
93
|
}
|
69
|
-
Node2Text = Text2Node.invert
|
70
|
-
|
71
|
-
ReservedKeys = %i|node name path children|
|
72
|
-
def self.hash2node(node_h)
|
73
|
-
node, name, path, children = ReservedKeys.map do |key|
|
74
|
-
node_h[key]
|
75
|
-
end
|
76
|
-
children ||= []
|
77
94
|
|
78
|
-
|
79
|
-
fail "Not found 'name' value in map" if name.nil?
|
80
|
-
fail "Not found 'path' value in map" if path.nil?
|
95
|
+
NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
|
81
96
|
|
82
|
-
|
83
|
-
|
84
|
-
opt =
|
97
|
+
# Splits one level of a tree description into child nodes, options and path.
#
# Keys matching one of NodeRegexps ("<type>_<name>") become child nodes built
# by hash2node, the :path key supplies the path, and every other key is kept
# as an option.
#
# @param node_hash [Hash] one level of the tree description
# @return [Array] a triple of child nodes, options hash and path
def self.hash2child_node(node_hash)
  children = []
  options = {}
  xpath = nil

  node_hash.each do |key, value|
    pattern = NodeRegexps.find { |regexp| key =~ regexp }

    if pattern === key
      # The === match above sets the captures: group 1 is the type, group 2 the name.
      type_sym = Regexp.last_match(1).to_sym
      child_name = Regexp.last_match(2)
      children << self.hash2node(value, child_name, Text2Node[type_sym])
    elsif :path === key
      xpath = value
    else
      options[key] = value
    end
  end

  [children, options, xpath]
end
|
104
122
|
|
105
|
-
|
106
|
-
|
107
|
-
end
|
123
|
+
# Converts a node into its hash representation.
#
# A Yasuri::MapNode (exactly that class) is returned as its own to_h; any
# other node is wrapped under a "<node_type_str>_<name>" key.
#
# @param node the node to convert
# @return [Hash]
def self.node2hash(node)
  if node.instance_of?(Yasuri::MapNode)
    node.to_h
  else
    key = "#{node.node_type_str}_#{node.name}"
    { key => node.to_h }
  end
end
|
111
130
|
|
112
|
-
def self.
|
113
|
-
|
114
|
-
|
131
|
+
# Routes unknown module-level calls to Yasuri::NodeGenerator.gen.
# When the generator returns nothing, the call falls through to the default
# method_missing with the method name and options.
def self.method_missing(method_name, pattern=nil, **opt, &block)
  node = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  return node if node

  super(method_name, **opt)
end
|
116
135
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
136
|
+
private_constant :Text2Node, :NodeRegexps
|
137
|
+
private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
|
138
|
+
end
|
139
|
+
|
140
|
+
class Hash
  # Returns a new Hash whose keys are converted with to_sym, recursing into
  # values that are themselves Hashes. Other values (arrays included) are
  # copied as-is.
  def deep_symbolize_keys
    each_with_object({}) do |(key, value), symbolized|
      value = value.deep_symbolize_keys if value.kind_of?(Hash)
      symbolized[key.to_sym] = value
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
|
5
|
+
module Yasuri
|
6
|
+
class CLI < Thor
|
7
|
+
package_name "yasuri"
|
8
|
+
|
9
|
+
default_command :scrape
|
10
|
+
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
|
11
|
+
"Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
12
|
+
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
13
|
+
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
14
|
+
option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
|
15
|
+
# Scrapes the given URI with the tree supplied via --file or --json and
# prints the result to standard output.
#
# @param uri [String] address to scrape
# @return [Integer] 0 on success, -1 after an error was written to $stderr
def scrape(uri)
  begin
    test_arguments(options)
  rescue => arg_error
    $stderr.puts arg_error.message
    return -1
  end

  tree_file = options[:file]
  tree_json = options[:json]
  interval_ms = options[:interval] || Yasuri::DefaultInterval_ms

  result =
    begin
      make_tree(tree_file, tree_json).scrape(uri, interval_ms: interval_ms)
    rescue => scrape_error
      $stderr.puts scrape_error.message
      return -1
    end

  # Strings are printed verbatim; anything else goes through Kernel#j.
  result.instance_of?(String) ? puts(result) : j(result)

  0
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Validates the tree-source options given to `scrape`.
#
# Exactly one of --file / --json must be present, and it must carry a
# non-empty value.
#
# @param options [Hash] option values, read through the :file and :json keys
# @return [nil] when the options are acceptable
# @raise [RuntimeError] with a user-facing "ERROR: ..." message otherwise
def test_arguments(options)
  file_path = options[:file]
  json_string = options[:json]

  # Both "none given" and "both given" are rejected with the same message.
  too_many_options = [file_path, json_string].compact.count != 1
  raise "ERROR: Only one of `--file` or `--json` option should be specified." if too_many_options

  # NOTE(review): the literal "file" / "json" comparisons presumably catch the
  # option name being substituted when a flag is given without a value --
  # confirm against Thor's behavior.
  # Each check looks only at its own option, so an empty --json is reported as
  # a --json problem rather than a --file one.
  empty_file_argument = file_path&.empty? || file_path == "file"
  raise "ERROR: --file option require not empty argument." if empty_file_argument

  empty_json_string_argument = json_string&.empty? || json_string == "json"
  raise "ERROR: --json option require not empty argument." if empty_json_string_argument
end
|
56
|
+
|
57
|
+
# Builds the node tree from a tree file when file_path is given, otherwise
# from the JSON string.
#
# @param file_path [String, nil] path of a JSON or YAML tree file
# @param json_string [String, nil] tree as JSON text, used when file_path is falsy
# @return the tree built by the underlying converter
# @raise [RuntimeError] wrapping any failure in an "ERROR: ..." message
def make_tree(file_path, json_string)
  unless file_path
    begin
      return Yasuri.json2tree(json_string)
    rescue => json_error
      raise "ERROR: Failed to convert json to yasuri tree. #{json_error.message}"
    end
  end

  begin
    make_tree_from_file(File.read(file_path))
  rescue => file_error
    raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{file_error.message}"
  end
end
|
73
|
+
|
74
|
+
# Builds a tree from file contents: JSON is tried first, and any
# StandardError raised by that attempt triggers a second attempt as YAML.
#
# @param src [String] contents of the tree file
# @return the tree built by Yasuri.json2tree or Yasuri.yaml2tree
def make_tree_from_file(src)
  Yasuri.json2tree(src)
rescue StandardError
  Yasuri.yaml2tree(src)
end
|
77
|
+
end
|
78
|
+
end
|
@@ -1,26 +1,30 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
# Node that follows every link matched by its XPath and applies its child
# nodes to each linked page.
class LinksNode
  include Node

  # Clicks each element matched by @xpath and collects the children's results.
  #
  # @param agent the agent handed to Mechanize::Page::Link and to child nodes
  # @param page the page the links belong to
  # @param opt [Hash] options; :retry_count and :interval_ms fall back to the
  #   Yasuri defaults, and the whole hash is forwarded to child nodes
  # @param element the element searched for links (defaults to page)
  # @return [Array<Hash>] one hash per link, mapping child node names to results
  def inject(agent, page, opt = {}, element = page)
    retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
    interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms

    anchors = element.search(@xpath) || [] # links expected
    anchors.map do |anchor|
      clickable = Mechanize::Page::Link.new(anchor, agent, page)
      linked_page = Yasuri.with_retry(retry_count, interval_ms) { clickable.click }

      @children.each_with_object({}) do |child_node, results|
        key = Yasuri.node_name(child_node.name, opt)
        results[key] = child_node.inject(agent, linked_page, opt)
      end
    end
  end

  # Type prefix used when this node is serialized.
  def node_type_str
    "links".freeze
  end
end
|
26
30
|
end
|