yasuri 2.0.13 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/.ruby-version +1 -1
- data/README.md +82 -31
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -75
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +17 -14
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +13 -8
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -11
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +92 -60
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +108 -19
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
data/examples/example.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Example: scrape a GitHub user's repository list with Yasuri and print the
# result as Markdown. Running this script sends HTTP requests to github.com.

require 'yasuri'
uri = "https://github.com/tac0x2a?tab=repositories"

# Build the node tree with the Ruby DSL.
# `links_repo` follows each matched link and scrapes its child nodes on the
# linked page.
root = Yasuri.map_root do
  text_title '/html/head/title'
  links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
    text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
    text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
    text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
    text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
  end
end

# Alternatively, build the same node tree from YAML:
# src = <<-EOYML
# text_title: /html/head/title
# links_repo:
#   path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
#   text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
#   text_desc:
#     path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
#     proc: :strip
#   text_stars:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
#     proc: :to_i
#   text_forks:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
#     proc: :to_i
# EOYML
# root = Yasuri.yaml2tree(src)

# Wait 100 ms before each request.
contents = root.scrape(uri, interval_ms: 100)
# jj contents
# {
#   "title": "tac0x2a (TAC) / Repositories · GitHub",
#   "repo": [
#     {
#       "name": "o-namazu",
#       "desc": "Oh Namazu (Catfish) in datalake",
#       "stars": 1,
#       "forks": 0
#     },
#     {
#       "name": "grebe",
#       "desc": "grebe in datalake",
#       "stars": 2,
#       "forks": 0
#     },
#     {
#       "name": "yasuri",
#       "desc": "Yasuri (鑢) is easy web scraping library.",
#       "stars": 43,
#       "forks": 1
#     },
#     {
#       "name": "dotfiles",
#       "desc": "dotfiles",
#       "stars": 0,
#       "forks": 0
#     }
#     ...
#   ]
# }

# Output as markdown
puts "# #{contents['title']}"
contents['repo'].each do |h|
  puts "-----"
  puts "## #{h['name']}"
  puts h['desc']
  puts ""
  puts "* Stars: #{h['stars']}"
  puts "* Forks: #{h['forks']}"
  puts ""
end
|
data/examples/github.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
# yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
text_title: /html/head/title
links_repo:
  path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
  text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
  text_desc:
    path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
    proc: :strip
  text_stars:
    path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
    proc: :to_i
  text_forks:
    path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
    proc: :to_i
|
data/examples/sample.yml
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# yasuri scrape "https://www.tac42.net/" -f sample.yml
links_each:
  path: //*[@id="posts"]/article/header/h1/a
  text_title: //*[@id="content"]/article/header/h1
  text_description: /html/head/meta[12]/@content
  text_date:
    path: //*[@id="content"]/article/header/div/span
    proc: :strip
  text_length:
    path: //*[@id="content"]
    proc: :size
|
data/exe/yasuri
ADDED
data/lib/yasuri.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -1,6 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
1
|
|
5
2
|
require 'mechanize'
|
6
3
|
require 'json'
|
@@ -11,118 +8,142 @@ require_relative 'yasuri_text_node'
|
|
11
8
|
require_relative 'yasuri_struct_node'
|
12
9
|
require_relative 'yasuri_paginate_node'
|
13
10
|
require_relative 'yasuri_links_node'
|
11
|
+
require_relative 'yasuri_map_node'
|
14
12
|
require_relative 'yasuri_node_generator'
|
15
13
|
|
16
14
|
module Yasuri
|
17
15
|
|
16
|
+
# Default number of retries `with_retry` performs after a failed attempt.
DefaultRetryCount = 5

# Default delay, in milliseconds, that `with_retry` sleeps before each attempt.
DefaultInterval_ms = 0
|
18
|
+
|
18
19
|
# Builds a node tree from a JSON tree definition.
#
# The JSON text is parsed with symbolized keys and the resulting object is
# handed to hash2node.
#
# @param json_string [String] JSON text describing the tree
# @return whatever hash2node builds from the parsed definition
# @raise [RuntimeError] if json_string is nil or empty
# @raise [JSON::ParserError] if json_string is not valid JSON
def self.json2tree(json_string)
  raise RuntimeError, 'json_string must not be nil or empty' if json_string.nil? || json_string.empty?

  node_hash = JSON.parse(json_string, symbolize_names: true)
  # Implicit receiver: hash2node is a private class method, and calling it as
  # `self.hash2node` raises NoMethodError on Ruby versions before 2.7.
  hash2node(node_hash)
end
|
26
25
|
|
27
26
|
# Builds a node tree from a YAML tree definition.
#
# The YAML text is loaded with `YAML.safe_load`, allowing only Symbol in
# addition to the basic types (tree definitions use values such as
# `proc: :strip`), then passed to hash2node with symbolized keys.
#
# @param yaml_string [String] YAML text describing the tree
# @return whatever hash2node builds from the parsed definition
# @raise [RuntimeError] if yaml_string is nil or empty
def self.yaml2tree(yaml_string)
  raise RuntimeError, 'yaml_string must not be nil or empty' if yaml_string.nil? || yaml_string.empty?

  # The permitted classes must be passed by keyword: the positional form
  # (`safe_load(yaml, [Symbol], ...)`) is rejected by Psych 4 and later.
  node_hash = YAML.safe_load(yaml_string, permitted_classes: [Symbol], symbolize_names: true)
  # Implicit receiver: hash2node is a private class method.
  hash2node(node_hash.deep_symbolize_keys)
end
|
32
32
|
|
33
|
-
|
34
|
-
|
33
|
+
# Serializes a node tree to a JSON tree definition.
#
# @param node the root node to serialize; it is converted with node2hash
# @return [String] JSON text of the hash produced by node2hash
# @raise [RuntimeError] if node is nil
def self.tree2json(node)
  raise RuntimeError, 'node must not be nil' if node.nil?

  # Implicit receiver: node2hash is a private class method.
  node2hash(node).to_json
end
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
|
39
|
+
# Runs the given block, retrying it when it raises a StandardError.
#
# Before every attempt (including the first) the method sleeps for
# interval_ms milliseconds. After a failure the block is run again until
# retry_count retries have been used; the last error is then re-raised.
#
# @param retry_count [Integer] number of retries after the first failure
# @param interval_ms [Numeric] delay before each attempt, in milliseconds
# @yield the operation to attempt
# @return the block's result, or nil when no block is given
# @raise the error raised by the final attempt
def self.with_retry(retry_count = DefaultRetryCount, interval_ms = DefaultInterval_ms)
  # Nothing to attempt without a block, so do not sleep for nothing.
  return unless block_given?

  begin
    Kernel.sleep(interval_ms * 0.001)
    yield
  rescue StandardError
    # Out of retries: re-raise the error that is currently being handled.
    raise unless retry_count > 0

    retry_count -= 1
    retry
  end
end
|
46
54
|
|
47
|
-
|
55
|
+
# Returns the key under which a node's result is stored.
#
# @param name [String] the node name
# @param opt [Hash] scrape options; only :symbolize_names is read
# @return [Symbol, String] name as a Symbol when opt[:symbolize_names] is
#   truthy, otherwise name unchanged
def self.node_name(name, opt)
  opt[:symbolize_names] ? name.to_sym : name
end
|
48
59
|
|
49
|
-
|
50
|
-
|
51
|
-
|
60
|
+
# private
|
61
|
+
|
62
|
+
# Converts one entry of a tree definition into a node.
#
# node_hash is either a String (shorthand for a node that only has a path)
# or a Hash that hash2child_node splits into child nodes, options and path.
#
# @param node_hash [String, Hash] the definition of this node
# @param node_name [String, nil] the node's name; nil for the root
# @param node_type_class [Class, nil] class to instantiate; nil builds a
#   Yasuri::MapNode
# @return the built node, or the only child itself when the root definition
#   contains exactly one node
# @raise [RuntimeError] if the root definition is empty
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  raise RuntimeError, 'tree definition must not be empty' if node_name.nil? && node_hash.empty?

  child_nodes, opt, path =
    if node_hash.is_a?(String)
      [[], {}, node_hash]
    else
      hash2child_node(node_hash)
    end

  # If only a single node is under the root, return only that node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  if node_type_class.nil?
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
|
61
86
|
|
62
|
-
private
|
63
87
|
# Maps the node-type prefix used in tree-definition keys (for example the
# `text` in `text_title`) to the node class that is instantiated for it.
Text2Node = {
  text: Yasuri::TextNode,
  struct: Yasuri::StructNode,
  links: Yasuri::LinksNode,
  pages: Yasuri::PaginateNode,
  map: Yasuri::MapNode
}.freeze
|
69
|
-
Node2Text = Text2Node.invert
|
70
|
-
|
71
|
-
ReservedKeys = %i|node name path children|
|
72
|
-
def self.hash2node(node_h)
|
73
|
-
node, name, path, children = ReservedKeys.map do |key|
|
74
|
-
node_h[key]
|
75
|
-
end
|
76
|
-
children ||= []
|
77
94
|
|
78
|
-
|
79
|
-
fail "Not found 'name' value in map" if name.nil?
|
80
|
-
fail "Not found 'path' value in map" if path.nil?
|
95
|
+
# One pattern per node type in Text2Node. Each matches keys of the form
# "<type>_<name>", capturing the type as group 1 and the node name as group 2.
NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }.freeze
|
81
96
|
|
82
|
-
|
83
|
-
|
84
|
-
opt =
|
97
|
+
# Splits a node definition hash into child nodes, options and path.
#
# Each key is classified as follows:
# * a key matching one of NodeRegexps ("<type>_<name>") defines a child
#   node, built with hash2node using the class from Text2Node;
# * the key :path is the node's own path;
# * any other key is collected as an option.
#
# @param node_hash [Hash] the definition of one node
# @return [Array] a three-element array: [child_nodes, opt, path]
def self.hash2child_node(node_hash)
  child_nodes = []
  opt = {}
  path = nil

  node_hash.each do |key, value|
    node_regexp = NodeRegexps.find { |regexp| regexp.match?(key) }

    if node_regexp
      # Explicit captures instead of the $1/$2 globals.
      node_type, child_node_name = node_regexp.match(key).captures
      child_node_type = Text2Node[node_type.to_sym]
      child_nodes << hash2node(value, child_node_name, child_node_type)
    elsif key == :path
      path = value
    else
      opt[key] = value
    end
  end

  [child_nodes, opt, path]
end
|
104
122
|
|
105
|
-
|
106
|
-
|
107
|
-
end
|
123
|
+
# Converts a node into the hash form used by tree definitions.
#
# A Yasuri::MapNode is returned as its own `to_h`. Any other node is wrapped
# in a single-entry hash keyed "<node_type_str>_<name>".
#
# @param node the node to convert; must respond to to_h and, unless it is a
#   MapNode, to node_type_str and name
# @return [Hash] the hash form of the node
def self.node2hash(node)
  return node.to_h if node.instance_of?(Yasuri::MapNode)

  { "#{node.node_type_str}_#{node.name}" => node.to_h }
end
|
111
130
|
|
112
|
-
def self.
|
113
|
-
|
131
|
+
# Entry point of the DSL: forwards unknown class-method calls to
# Yasuri::NodeGenerator.gen and returns the node it generates.
#
# When the generator returns nil/false, the call falls through to the
# default method_missing.
# NOTE(review): no matching respond_to_missing? is defined here; which
# method names the generator accepts is not visible in this file.
#
# @param method_name [Symbol] the called method name
# @param pattern [String, nil] first positional argument, passed to the generator
# @param opt [Hash] keyword options, passed to the generator
# @param block optional block, passed to the generator
# @return the node produced by the generator
def self.method_missing(method_name, pattern = nil, **opt, &block)
  generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  generated || super(method_name, **opt)
end
|
115
135
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
136
|
+
# Keep the lookup tables and the tree-conversion helpers out of the module's
# public interface.
private_constant :Text2Node, :NodeRegexps
private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
|
138
|
+
end
|
139
|
+
|
140
|
+
# Core extension used when loading tree definitions.
class Hash
  # Returns a new hash whose keys are converted with `to_sym`, recursing into
  # values that are themselves hashes. Hashes nested inside other containers
  # (for example arrays) are left untouched.
  #
  # @return [Hash] a copy with symbolized keys
  # @raise [NoMethodError] if a key does not respond to to_sym
  def deep_symbolize_keys
    pairs = map do |key, value|
      value = value.deep_symbolize_keys if value.kind_of?(Hash)
      [key.to_sym, value]
    end
    pairs.to_h
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
|
5
|
+
module Yasuri
|
6
|
+
class CLI < Thor
|
7
|
+
package_name "yasuri"

default_command :scrape
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
     "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
# Scrapes uri with the tree given by --file or --json and prints the result:
# a String result is printed as-is, anything else as JSON.
#
# Any error raised while validating the options, building the tree or
# scraping is reported on $stderr.
#
# @param uri [String] the page to scrape
# @return [Integer] 0 on success, -1 after an error was reported
def scrape(uri)
  begin
    test_arguments(options)

    interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
    tree = make_tree(options[:file], options[:json])
    result = tree.scrape(uri, interval_ms: interval_ms)
  rescue => e
    $stderr.puts e.message
    return -1
  end

  if result.instance_of?(String)
    puts result
  else
    j result
  end

  0
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Validates the tree-source options of the `scrape` command.
#
# Exactly one of :file and :json must be present, and the one given must
# carry a real value.
# NOTE(review): the comparisons with "file" / "json" presumably catch the
# value the option parser supplies when a flag is given without an
# argument — confirm against Thor's behavior.
#
# @param options [Hash] parsed command options; :file and :json are read
# @return [nil]
# @raise [RuntimeError] with a message naming the offending option
def test_arguments(options)
  file_argument = options[:file]
  json_argument = options[:json]

  unless [file_argument, json_argument].compact.count == 1
    raise "ERROR: Only one of `--file` or `--json` option should be specified."
  end

  # Only the --file value is checked here; an empty --json value is reported
  # by the --json check below, with the --json message.
  if file_argument&.empty? || file_argument == "file"
    raise "ERROR: --file option require not empty argument."
  end

  if json_argument&.empty? || json_argument == "json"
    raise "ERROR: --json option require not empty argument."
  end
end
|
56
|
+
|
57
|
+
# Builds the node tree from a definition file or from a JSON string.
#
# When file_path is given, the file is read and converted with
# make_tree_from_file; otherwise json_string is converted with
# Yasuri.json2tree. Any failure is re-raised as a RuntimeError whose message
# starts with "ERROR:" and ends with the original error's message.
#
# @param file_path [String, nil] path of a JSON or YAML tree definition
# @param json_string [String, nil] JSON tree definition, used when file_path is nil
# @return the node tree
# @raise [RuntimeError] if the file cannot be read or the definition cannot be converted
def make_tree(file_path, json_string)
  if file_path
    begin
      make_tree_from_file(File.read(file_path))
    rescue => e
      raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
    end
  else
    begin
      Yasuri.json2tree(json_string)
    rescue => e
      raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
    end
  end
end
|
73
|
+
|
74
|
+
# Converts the text of a tree-definition file into a node tree.
#
# The text is first treated as JSON; if that raises any StandardError it is
# treated as YAML instead, and an error from the YAML conversion propagates.
#
# @param src [String] the file content
# @return the node tree
def make_tree_from_file(src)
  Yasuri.json2tree(src)
rescue StandardError
  # Not convertible as JSON: fall back to YAML.
  Yasuri.yaml2tree(src)
end
|
77
|
+
end
|
78
|
+
end
|
@@ -1,26 +1,30 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
class LinksNode
|
8
6
|
include Node
|
7
|
+
|
9
8
|
# Follows every link matched by this node's XPath and scrapes the child
# nodes on each linked page.
#
# Each link is clicked through Yasuri.with_retry, using opt[:retry_count]
# and opt[:interval_ms] or the module defaults when they are absent.
#
# @param agent the agent handed to Mechanize::Page::Link and to the children
# @param page the page the links belong to
# @param opt [Hash] scrape options, also passed on to the children
# @param element the element searched for links; defaults to page
# @return [Array<Hash>] one hash per link, mapping each child's name (see
#   Yasuri.node_name) to that child's result
def inject(agent, page, opt = {}, element = page)
  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
  interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms

  links = element.search(@xpath) || [] # links expected
  links.map do |link|
    link_button = Mechanize::Page::Link.new(link, agent, page)
    child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }

    child_results_kv = @children.map do |child_node|
      child_name = Yasuri.node_name(child_node.name, opt)
      [child_name, child_node.inject(agent, child_page, opt)]
    end

    child_results_kv.to_h
  end
end
|
25
|
+
|
26
|
+
# Returns the node-type prefix for this node class.
#
# @return [String] the frozen string "links"
def node_type_str
  "links".freeze
end
|
25
29
|
end
|
26
30
|
end
|