yasuri 3.3.0 → 3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ec1d3a8cc766976c1d1329448a52df1696ccb50c41bf7c714b28cd265470d54
|
4
|
+
data.tar.gz: a87a1cd109c0e3dd8d820d3f35e34ef3096ef493d53d9c6bab0a79f1ccd7df9a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f7787bfdc549e70b9e5a5ae45f84f3a0316142deea3c75f17d9781c8140e597d41ccbfd1679e8186799f76e771a24a1d6129402c818f5d6e3ea0b94128a2185
|
7
|
+
data.tar.gz: 61e430d7dae6fda7c28a9456f1751206cfcd01321ff71feeeba7447606a1eb10d7270dda5c8947a4cc8edb3dae60d6d57fc79f127e61920249fcab0ca47f4e37
|
data/.github/workflows/ruby.yml
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
|
2
|
+
# inherit_from: .rubocop_todo.yml
|
3
|
+
|
4
|
+
inherit_mode:
|
5
|
+
merge:
|
6
|
+
- Exclude
|
7
|
+
|
8
|
+
require:
|
9
|
+
- rubocop-performance
|
10
|
+
- rubocop-rspec
|
11
|
+
- rubocop-rubycw
|
12
|
+
|
13
|
+
AllCops:
|
14
|
+
DisabledByDefault: true
|
15
|
+
DisplayCopNames: true
|
16
|
+
Exclude:
|
17
|
+
- 'gems/**/*'
|
18
|
+
- 'pkg/**/*'
|
19
|
+
- 'coverage/**/*'
|
20
|
+
- 'exe/**/*'
|
21
|
+
|
22
|
+
NewCops: enable
|
23
|
+
|
24
|
+
Bundler:
|
25
|
+
Enabled: true
|
26
|
+
|
27
|
+
Gemspec:
|
28
|
+
Enabled: true
|
29
|
+
|
30
|
+
Lint:
|
31
|
+
Enabled: true
|
32
|
+
|
33
|
+
Performance:
|
34
|
+
Enabled: true
|
35
|
+
|
36
|
+
Rubycw:
|
37
|
+
Enabled: true
|
38
|
+
|
39
|
+
Security:
|
40
|
+
Enabled: true
|
41
|
+
|
42
|
+
Style/HashSyntax:
|
43
|
+
EnforcedStyle: ruby19
|
44
|
+
Style/HashEachMethods:
|
45
|
+
Enabled: true
|
46
|
+
Style/HashTransformKeys:
|
47
|
+
Enabled: true
|
48
|
+
Style/HashTransformValues:
|
49
|
+
Enabled: true
|
data/.rubocop_todo.yml
ADDED
File without changes
|
data/README.md
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
[![Build Status](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml/badge.svg)](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
|
3
3
|
[![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Maintainability](https://api.codeclimate.com/v1/badges/c29480fea1305afe999f/maintainability)](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
|
4
4
|
|
5
|
-
Yasuri (鑢) is
|
5
|
+
Yasuri (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
|
6
|
+
It performs scraping by simply describing the expected result in a simple declarative notation.
|
6
7
|
|
7
|
-
Yasuri
|
8
|
+
Yasuri makes it easy to write common scraping operations.
|
9
|
+
For example, the following processes can be easily implemented.
|
8
10
|
|
9
11
|
For example,
|
10
12
|
|
data/Rakefile
CHANGED
data/examples/example.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -1,6 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
1
|
|
5
2
|
require 'mechanize'
|
6
3
|
require 'json'
|
@@ -23,41 +20,48 @@ module Yasuri
|
|
23
20
|
raise RuntimeError if json_string.nil? or json_string.empty?
|
24
21
|
|
25
22
|
node_hash = JSON.parse(json_string, {symbolize_names: true})
|
26
|
-
|
23
|
+
self.hash2node(node_hash)
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.yaml2tree(yaml_string)
|
27
|
+
raise RuntimeError if yaml_string.nil? or yaml_string.empty?
|
28
|
+
|
29
|
+
node_hash = YAML.safe_load(yaml_string, [Symbol], symbolize_names: true)
|
30
|
+
self.hash2node(node_hash.deep_symbolize_keys)
|
27
31
|
end
|
28
32
|
|
29
33
|
def self.tree2json(node)
|
30
34
|
raise RuntimeError if node.nil?
|
31
35
|
|
32
|
-
|
36
|
+
self.node2hash(node).to_json
|
33
37
|
end
|
34
38
|
|
35
|
-
def self.
|
36
|
-
|
39
|
+
def self.with_retry(
|
40
|
+
retry_count = DefaultRetryCount,
|
41
|
+
interval_ms = DefaultInterval_ms)
|
37
42
|
|
38
|
-
|
39
|
-
|
43
|
+
begin
|
44
|
+
Kernel.sleep(interval_ms * 0.001)
|
45
|
+
return yield() if block_given?
|
46
|
+
rescue => e
|
47
|
+
if retry_count > 0
|
48
|
+
retry_count -= 1
|
49
|
+
retry
|
50
|
+
end
|
51
|
+
fail e
|
52
|
+
end
|
40
53
|
end
|
41
54
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
generated || super(method_name, **opt)
|
55
|
+
def self.node_name(name, opt)
|
56
|
+
symbolize_names = opt[:symbolize_names]
|
57
|
+
symbolize_names ? name.to_sym : name
|
46
58
|
end
|
47
59
|
|
48
|
-
private
|
49
|
-
Text2Node = {
|
50
|
-
text: Yasuri::TextNode,
|
51
|
-
struct: Yasuri::StructNode,
|
52
|
-
links: Yasuri::LinksNode,
|
53
|
-
pages: Yasuri::PaginateNode,
|
54
|
-
map: Yasuri::MapNode
|
55
|
-
}
|
60
|
+
# private
|
56
61
|
|
57
62
|
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
|
58
63
|
raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
|
59
64
|
|
60
|
-
node_prefixes = Text2Node.keys.freeze
|
61
65
|
child_nodes = []
|
62
66
|
opt = {}
|
63
67
|
path = nil
|
@@ -65,27 +69,7 @@ module Yasuri
|
|
65
69
|
if node_hash.is_a?(String)
|
66
70
|
path = node_hash
|
67
71
|
else
|
68
|
-
|
69
|
-
# is node?
|
70
|
-
node_regexps = Text2Node.keys.map do |node_type_sym|
|
71
|
-
/^(#{node_type_sym.to_s})_(.+)$/
|
72
|
-
end
|
73
|
-
node_regexp = node_regexps.find do |node_regexp|
|
74
|
-
key =~ node_regexp
|
75
|
-
end
|
76
|
-
|
77
|
-
case key
|
78
|
-
when node_regexp
|
79
|
-
node_type_sym = $1.to_sym
|
80
|
-
child_node_name = $2
|
81
|
-
child_node_type = Text2Node[node_type_sym]
|
82
|
-
child_nodes << self.hash2node(value, child_node_name, child_node_type)
|
83
|
-
when :path
|
84
|
-
path = value
|
85
|
-
else
|
86
|
-
opt[key] = value
|
87
|
-
end
|
88
|
-
end
|
72
|
+
child_nodes, opt, path = self.hash2child_node(node_hash)
|
89
73
|
end
|
90
74
|
|
91
75
|
# If only single node under root, return only the node.
|
@@ -100,6 +84,42 @@ module Yasuri
|
|
100
84
|
node
|
101
85
|
end
|
102
86
|
|
87
|
+
Text2Node = {
|
88
|
+
text: Yasuri::TextNode,
|
89
|
+
struct: Yasuri::StructNode,
|
90
|
+
links: Yasuri::LinksNode,
|
91
|
+
pages: Yasuri::PaginateNode,
|
92
|
+
map: Yasuri::MapNode
|
93
|
+
}
|
94
|
+
|
95
|
+
NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
|
96
|
+
|
97
|
+
def self.hash2child_node(node_hash)
|
98
|
+
child_nodes = []
|
99
|
+
opt = {}
|
100
|
+
path = nil
|
101
|
+
|
102
|
+
node_hash.each do |key, value|
|
103
|
+
# is node?
|
104
|
+
|
105
|
+
node_regexp = NodeRegexps.find { |r| key =~ r }
|
106
|
+
|
107
|
+
case key
|
108
|
+
when node_regexp
|
109
|
+
node_type_sym = $1.to_sym
|
110
|
+
child_node_name = $2
|
111
|
+
child_node_type = Text2Node[node_type_sym]
|
112
|
+
child_nodes << self.hash2node(value, child_node_name, child_node_type)
|
113
|
+
when :path
|
114
|
+
path = value
|
115
|
+
else
|
116
|
+
opt[key] = value
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
[child_nodes, opt, path]
|
121
|
+
end
|
122
|
+
|
103
123
|
def self.node2hash(node)
|
104
124
|
return node.to_h if node.instance_of?(Yasuri::MapNode)
|
105
125
|
|
@@ -108,26 +128,13 @@ module Yasuri
|
|
108
128
|
}
|
109
129
|
end
|
110
130
|
|
111
|
-
def self.
|
112
|
-
|
113
|
-
|
131
|
+
def self.method_missing(method_name, pattern=nil, **opt, &block)
|
132
|
+
generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
|
133
|
+
generated || super(method_name, **opt)
|
114
134
|
end
|
115
135
|
|
116
|
-
|
117
|
-
|
118
|
-
interval_ms = DefaultInterval_ms)
|
119
|
-
|
120
|
-
begin
|
121
|
-
Kernel.sleep(interval_ms * 0.001)
|
122
|
-
return yield() if block_given?
|
123
|
-
rescue => e
|
124
|
-
if retry_count > 0
|
125
|
-
retry_count -= 1
|
126
|
-
retry
|
127
|
-
end
|
128
|
-
fail e
|
129
|
-
end
|
130
|
-
end
|
136
|
+
private_constant :Text2Node, :NodeRegexps
|
137
|
+
private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
|
131
138
|
end
|
132
139
|
|
133
140
|
class Hash
|
data/lib/yasuri/yasuri_cli.rb
CHANGED
@@ -7,50 +7,30 @@ module Yasuri
|
|
7
7
|
package_name "yasuri"
|
8
8
|
|
9
9
|
default_command :scrape
|
10
|
-
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
|
10
|
+
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
|
11
|
+
"Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
11
12
|
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
12
13
|
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
13
14
|
option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
|
14
15
|
def scrape(uri)
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
|
21
|
-
$stderr.puts "ERROR: --file option require not empty argument."
|
22
|
-
return -1
|
23
|
-
end
|
24
|
-
if options[:json]&.empty? or options[:json] == "json"
|
25
|
-
$stderr.puts "ERROR: --json option require not empty argument."
|
16
|
+
begin
|
17
|
+
test_arguments(options)
|
18
|
+
rescue => e
|
19
|
+
$stderr.puts e.message
|
26
20
|
return -1
|
27
21
|
end
|
28
22
|
|
29
23
|
interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
|
24
|
+
file_path = options[:file]
|
25
|
+
json_string = options[:json]
|
30
26
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
Yasuri.yaml2tree(src)
|
39
|
-
rescue => e
|
40
|
-
$stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
|
41
|
-
return -1
|
42
|
-
end
|
43
|
-
end
|
44
|
-
else
|
45
|
-
begin
|
46
|
-
Yasuri.json2tree(options[:json])
|
47
|
-
rescue => e
|
48
|
-
$stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
49
|
-
return -1
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
result = tree.scrape(uri, interval_ms: interval_ms)
|
27
|
+
begin
|
28
|
+
tree = make_tree(file_path, json_string)
|
29
|
+
result = tree.scrape(uri, interval_ms: interval_ms)
|
30
|
+
rescue => e
|
31
|
+
$stderr.puts e.message
|
32
|
+
return -1
|
33
|
+
end
|
54
34
|
|
55
35
|
if result.instance_of?(String)
|
56
36
|
puts result
|
@@ -60,5 +40,39 @@ module Yasuri
|
|
60
40
|
|
61
41
|
return 0
|
62
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def test_arguments(options)
|
47
|
+
too_many_options = [options[:file], options[:json]].compact.count != 1
|
48
|
+
raise "ERROR: Only one of `--file` or `--json` option should be specified." if too_many_options
|
49
|
+
|
50
|
+
empty_file_argument = options[:file]&.empty? || options[:file] == "file" || options[:json]&.empty?
|
51
|
+
raise "ERROR: --file option require not empty argument." if empty_file_argument
|
52
|
+
|
53
|
+
empty_json_string_argument = options[:json]&.empty? || options[:json] == "json"
|
54
|
+
raise "ERROR: --json option require not empty argument." if empty_json_string_argument
|
55
|
+
end
|
56
|
+
|
57
|
+
def make_tree(file_path, json_string)
|
58
|
+
if file_path
|
59
|
+
begin
|
60
|
+
src = File.read(file_path)
|
61
|
+
make_tree_from_file(src)
|
62
|
+
rescue => e
|
63
|
+
raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
|
64
|
+
end
|
65
|
+
else
|
66
|
+
begin
|
67
|
+
Yasuri.json2tree(json_string)
|
68
|
+
rescue => e
|
69
|
+
raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def make_tree_from_file(src)
|
75
|
+
Yasuri.json2tree(src) rescue Yasuri.yaml2tree(src)
|
76
|
+
end
|
63
77
|
end
|
64
78
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -22,11 +20,11 @@ module Yasuri
|
|
22
20
|
end
|
23
21
|
|
24
22
|
Hash[child_results_kv]
|
25
|
-
end
|
23
|
+
end
|
26
24
|
end
|
27
25
|
|
28
26
|
def node_type_str
|
29
27
|
"links".freeze
|
30
28
|
end
|
31
|
-
end
|
32
|
-
end
|
29
|
+
end
|
30
|
+
end
|
@@ -10,7 +10,7 @@ module Yasuri
|
|
10
10
|
@opt = opt
|
11
11
|
end
|
12
12
|
|
13
|
-
def inject(agent, page, opt = {},
|
13
|
+
def inject(agent, page, opt = {}, _element = page)
|
14
14
|
child_results_kv = @children.map do |node|
|
15
15
|
[node.name, node.inject(agent, page, opt)]
|
16
16
|
end
|
@@ -19,7 +19,7 @@ module Yasuri
|
|
19
19
|
|
20
20
|
def to_h
|
21
21
|
node_hash = {}
|
22
|
-
self.opts.each{|k, v| node_hash[k] = v
|
22
|
+
self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
|
23
23
|
|
24
24
|
children.each do |child|
|
25
25
|
child_node_name = "#{child.node_type_str}_#{child.name}"
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -1,27 +1,24 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
module Node
|
8
6
|
attr_reader :url, :xpath, :name, :children
|
9
7
|
|
10
|
-
def initialize(xpath, name, children = [], **
|
8
|
+
def initialize(xpath, name, children = [], **_opt)
|
11
9
|
@xpath, @name, @children = xpath, name, children
|
12
10
|
end
|
13
11
|
|
14
12
|
def scrape(uri, opt = {})
|
15
|
-
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
16
|
-
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
17
|
-
|
18
13
|
agent = Mechanize.new
|
19
|
-
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
20
14
|
scrape_with_agent(uri, agent, opt)
|
21
15
|
end
|
22
16
|
|
23
17
|
def scrape_with_agent(uri, agent, opt = {})
|
24
|
-
|
18
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
19
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
20
|
+
|
21
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
25
22
|
inject(agent, page, opt)
|
26
23
|
end
|
27
24
|
|