yasuri 3.3.0 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -1
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/README.md +4 -2
- data/Rakefile +1 -1
- data/examples/example.rb +0 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +68 -61
- data/lib/yasuri/yasuri_cli.rb +49 -35
- data/lib/yasuri/yasuri_links_node.rb +3 -5
- data/lib/yasuri/yasuri_map_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +5 -8
- data/lib/yasuri/yasuri_node_generator.rb +1 -3
- data/lib/yasuri/yasuri_paginate_node.rb +22 -18
- data/lib/yasuri/yasuri_struct_node.rb +1 -3
- data/lib/yasuri/yasuri_text_node.rb +4 -6
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +0 -2
- data/spec/yasuri_cli_spec.rb +79 -61
- data/spec/yasuri_links_node_spec.rb +72 -62
- data/spec/yasuri_map_spec.rb +10 -14
- data/spec/yasuri_paginate_node_spec.rb +89 -90
- data/spec/yasuri_spec.rb +15 -24
- data/spec/yasuri_struct_node_spec.rb +120 -96
- data/spec/yasuri_text_node_spec.rb +22 -31
- data/yasuri.gemspec +29 -24
- metadata +67 -11
- data/spec/yasuri_node_spec.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ec1d3a8cc766976c1d1329448a52df1696ccb50c41bf7c714b28cd265470d54
|
4
|
+
data.tar.gz: a87a1cd109c0e3dd8d820d3f35e34ef3096ef493d53d9c6bab0a79f1ccd7df9a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f7787bfdc549e70b9e5a5ae45f84f3a0316142deea3c75f17d9781c8140e597d41ccbfd1679e8186799f76e771a24a1d6129402c818f5d6e3ea0b94128a2185
|
7
|
+
data.tar.gz: 61e430d7dae6fda7c28a9456f1751206cfcd01321ff71feeeba7447606a1eb10d7270dda5c8947a4cc8edb3dae60d6d57fc79f127e61920249fcab0ca47f4e37
|
data/.github/workflows/ruby.yml
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
|
2
|
+
# inherit_from: .rubocop_todo.yml
|
3
|
+
|
4
|
+
inherit_mode:
|
5
|
+
merge:
|
6
|
+
- Exclude
|
7
|
+
|
8
|
+
require:
|
9
|
+
- rubocop-performance
|
10
|
+
- rubocop-rspec
|
11
|
+
- rubocop-rubycw
|
12
|
+
|
13
|
+
AllCops:
|
14
|
+
DisabledByDefault: true
|
15
|
+
DisplayCopNames: true
|
16
|
+
Exclude:
|
17
|
+
- 'gems/**/*'
|
18
|
+
- 'pkg/**/*'
|
19
|
+
- 'coverage/**/*'
|
20
|
+
- 'exe/**/*'
|
21
|
+
|
22
|
+
NewCops: enable
|
23
|
+
|
24
|
+
Bundler:
|
25
|
+
Enabled: true
|
26
|
+
|
27
|
+
Gemspec:
|
28
|
+
Enabled: true
|
29
|
+
|
30
|
+
Lint:
|
31
|
+
Enabled: true
|
32
|
+
|
33
|
+
Performance:
|
34
|
+
Enabled: true
|
35
|
+
|
36
|
+
Rubycw:
|
37
|
+
Enabled: true
|
38
|
+
|
39
|
+
Security:
|
40
|
+
Enabled: true
|
41
|
+
|
42
|
+
Style/HashSyntax:
|
43
|
+
EnforcedStyle: ruby19
|
44
|
+
Style/HashEachMethods:
|
45
|
+
Enabled: true
|
46
|
+
Style/HashTransformKeys:
|
47
|
+
Enabled: true
|
48
|
+
Style/HashTransformValues:
|
49
|
+
Enabled: true
|
data/.rubocop_todo.yml
ADDED
File without changes
|
data/README.md
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
[](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
|
3
3
|
[](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
|
4
4
|
|
5
|
-
Yasuri (鑢) is
|
5
|
+
Yasuri (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
|
6
|
+
It performs scraping by simply describing the expected result in a simple declarative notation.
|
6
7
|
|
7
|
-
Yasuri
|
8
|
+
Yasuri makes it easy to write common scraping operations.
|
9
|
+
For example, the following processes can be easily implemented.
|
8
10
|
|
9
11
|
For example,
|
10
12
|
|
data/Rakefile
CHANGED
data/examples/example.rb
CHANGED
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -1,6 +1,3 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
1
|
|
5
2
|
require 'mechanize'
|
6
3
|
require 'json'
|
@@ -23,41 +20,48 @@ module Yasuri
|
|
23
20
|
raise RuntimeError if json_string.nil? or json_string.empty?
|
24
21
|
|
25
22
|
node_hash = JSON.parse(json_string, {symbolize_names: true})
|
26
|
-
|
23
|
+
self.hash2node(node_hash)
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.yaml2tree(yaml_string)
|
27
|
+
raise RuntimeError if yaml_string.nil? or yaml_string.empty?
|
28
|
+
|
29
|
+
node_hash = YAML.safe_load(yaml_string, [Symbol], symbolize_names: true)
|
30
|
+
self.hash2node(node_hash.deep_symbolize_keys)
|
27
31
|
end
|
28
32
|
|
29
33
|
def self.tree2json(node)
|
30
34
|
raise RuntimeError if node.nil?
|
31
35
|
|
32
|
-
|
36
|
+
self.node2hash(node).to_json
|
33
37
|
end
|
34
38
|
|
35
|
-
def self.
|
36
|
-
|
39
|
+
def self.with_retry(
|
40
|
+
retry_count = DefaultRetryCount,
|
41
|
+
interval_ms = DefaultInterval_ms)
|
37
42
|
|
38
|
-
|
39
|
-
|
43
|
+
begin
|
44
|
+
Kernel.sleep(interval_ms * 0.001)
|
45
|
+
return yield() if block_given?
|
46
|
+
rescue => e
|
47
|
+
if retry_count > 0
|
48
|
+
retry_count -= 1
|
49
|
+
retry
|
50
|
+
end
|
51
|
+
fail e
|
52
|
+
end
|
40
53
|
end
|
41
54
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
generated || super(method_name, **opt)
|
55
|
+
def self.node_name(name, opt)
|
56
|
+
symbolize_names = opt[:symbolize_names]
|
57
|
+
symbolize_names ? name.to_sym : name
|
46
58
|
end
|
47
59
|
|
48
|
-
private
|
49
|
-
Text2Node = {
|
50
|
-
text: Yasuri::TextNode,
|
51
|
-
struct: Yasuri::StructNode,
|
52
|
-
links: Yasuri::LinksNode,
|
53
|
-
pages: Yasuri::PaginateNode,
|
54
|
-
map: Yasuri::MapNode
|
55
|
-
}
|
60
|
+
# private
|
56
61
|
|
57
62
|
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
|
58
63
|
raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
|
59
64
|
|
60
|
-
node_prefixes = Text2Node.keys.freeze
|
61
65
|
child_nodes = []
|
62
66
|
opt = {}
|
63
67
|
path = nil
|
@@ -65,27 +69,7 @@ module Yasuri
|
|
65
69
|
if node_hash.is_a?(String)
|
66
70
|
path = node_hash
|
67
71
|
else
|
68
|
-
|
69
|
-
# is node?
|
70
|
-
node_regexps = Text2Node.keys.map do |node_type_sym|
|
71
|
-
/^(#{node_type_sym.to_s})_(.+)$/
|
72
|
-
end
|
73
|
-
node_regexp = node_regexps.find do |node_regexp|
|
74
|
-
key =~ node_regexp
|
75
|
-
end
|
76
|
-
|
77
|
-
case key
|
78
|
-
when node_regexp
|
79
|
-
node_type_sym = $1.to_sym
|
80
|
-
child_node_name = $2
|
81
|
-
child_node_type = Text2Node[node_type_sym]
|
82
|
-
child_nodes << self.hash2node(value, child_node_name, child_node_type)
|
83
|
-
when :path
|
84
|
-
path = value
|
85
|
-
else
|
86
|
-
opt[key] = value
|
87
|
-
end
|
88
|
-
end
|
72
|
+
child_nodes, opt, path = self.hash2child_node(node_hash)
|
89
73
|
end
|
90
74
|
|
91
75
|
# If only single node under root, return only the node.
|
@@ -100,6 +84,42 @@ module Yasuri
|
|
100
84
|
node
|
101
85
|
end
|
102
86
|
|
87
|
+
Text2Node = {
|
88
|
+
text: Yasuri::TextNode,
|
89
|
+
struct: Yasuri::StructNode,
|
90
|
+
links: Yasuri::LinksNode,
|
91
|
+
pages: Yasuri::PaginateNode,
|
92
|
+
map: Yasuri::MapNode
|
93
|
+
}
|
94
|
+
|
95
|
+
NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
|
96
|
+
|
97
|
+
def self.hash2child_node(node_hash)
|
98
|
+
child_nodes = []
|
99
|
+
opt = {}
|
100
|
+
path = nil
|
101
|
+
|
102
|
+
node_hash.each do |key, value|
|
103
|
+
# is node?
|
104
|
+
|
105
|
+
node_regexp = NodeRegexps.find { |r| key =~ r }
|
106
|
+
|
107
|
+
case key
|
108
|
+
when node_regexp
|
109
|
+
node_type_sym = $1.to_sym
|
110
|
+
child_node_name = $2
|
111
|
+
child_node_type = Text2Node[node_type_sym]
|
112
|
+
child_nodes << self.hash2node(value, child_node_name, child_node_type)
|
113
|
+
when :path
|
114
|
+
path = value
|
115
|
+
else
|
116
|
+
opt[key] = value
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
[child_nodes, opt, path]
|
121
|
+
end
|
122
|
+
|
103
123
|
def self.node2hash(node)
|
104
124
|
return node.to_h if node.instance_of?(Yasuri::MapNode)
|
105
125
|
|
@@ -108,26 +128,13 @@ module Yasuri
|
|
108
128
|
}
|
109
129
|
end
|
110
130
|
|
111
|
-
def self.
|
112
|
-
|
113
|
-
|
131
|
+
def self.method_missing(method_name, pattern=nil, **opt, &block)
|
132
|
+
generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
|
133
|
+
generated || super(method_name, **opt)
|
114
134
|
end
|
115
135
|
|
116
|
-
|
117
|
-
|
118
|
-
interval_ms = DefaultInterval_ms)
|
119
|
-
|
120
|
-
begin
|
121
|
-
Kernel.sleep(interval_ms * 0.001)
|
122
|
-
return yield() if block_given?
|
123
|
-
rescue => e
|
124
|
-
if retry_count > 0
|
125
|
-
retry_count -= 1
|
126
|
-
retry
|
127
|
-
end
|
128
|
-
fail e
|
129
|
-
end
|
130
|
-
end
|
136
|
+
private_constant :Text2Node, :NodeRegexps
|
137
|
+
private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
|
131
138
|
end
|
132
139
|
|
133
140
|
class Hash
|
data/lib/yasuri/yasuri_cli.rb
CHANGED
@@ -7,50 +7,30 @@ module Yasuri
|
|
7
7
|
package_name "yasuri"
|
8
8
|
|
9
9
|
default_command :scrape
|
10
|
-
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
|
10
|
+
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
|
11
|
+
"Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
11
12
|
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
12
13
|
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
13
14
|
option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
|
14
15
|
def scrape(uri)
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
|
21
|
-
$stderr.puts "ERROR: --file option require not empty argument."
|
22
|
-
return -1
|
23
|
-
end
|
24
|
-
if options[:json]&.empty? or options[:json] == "json"
|
25
|
-
$stderr.puts "ERROR: --json option require not empty argument."
|
16
|
+
begin
|
17
|
+
test_arguments(options)
|
18
|
+
rescue => e
|
19
|
+
$stderr.puts e.message
|
26
20
|
return -1
|
27
21
|
end
|
28
22
|
|
29
23
|
interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
|
24
|
+
file_path = options[:file]
|
25
|
+
json_string = options[:json]
|
30
26
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
Yasuri.yaml2tree(src)
|
39
|
-
rescue => e
|
40
|
-
$stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
|
41
|
-
return -1
|
42
|
-
end
|
43
|
-
end
|
44
|
-
else
|
45
|
-
begin
|
46
|
-
Yasuri.json2tree(options[:json])
|
47
|
-
rescue => e
|
48
|
-
$stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
49
|
-
return -1
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
result = tree.scrape(uri, interval_ms: interval_ms)
|
27
|
+
begin
|
28
|
+
tree = make_tree(file_path, json_string)
|
29
|
+
result = tree.scrape(uri, interval_ms: interval_ms)
|
30
|
+
rescue => e
|
31
|
+
$stderr.puts e.message
|
32
|
+
return -1
|
33
|
+
end
|
54
34
|
|
55
35
|
if result.instance_of?(String)
|
56
36
|
puts result
|
@@ -60,5 +40,39 @@ module Yasuri
|
|
60
40
|
|
61
41
|
return 0
|
62
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def test_arguments(options)
|
47
|
+
too_many_options = [options[:file], options[:json]].compact.count != 1
|
48
|
+
raise "ERROR: Only one of `--file` or `--json` option should be specified." if too_many_options
|
49
|
+
|
50
|
+
empty_file_argument = options[:file]&.empty? || options[:file] == "file" || options[:json]&.empty?
|
51
|
+
raise "ERROR: --file option require not empty argument." if empty_file_argument
|
52
|
+
|
53
|
+
empty_json_string_argument = options[:json]&.empty? || options[:json] == "json"
|
54
|
+
raise "ERROR: --json option require not empty argument." if empty_json_string_argument
|
55
|
+
end
|
56
|
+
|
57
|
+
def make_tree(file_path, json_string)
|
58
|
+
if file_path
|
59
|
+
begin
|
60
|
+
src = File.read(file_path)
|
61
|
+
make_tree_from_file(src)
|
62
|
+
rescue => e
|
63
|
+
raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
|
64
|
+
end
|
65
|
+
else
|
66
|
+
begin
|
67
|
+
Yasuri.json2tree(json_string)
|
68
|
+
rescue => e
|
69
|
+
raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def make_tree_from_file(src)
|
75
|
+
Yasuri.json2tree(src) rescue Yasuri.yaml2tree(src)
|
76
|
+
end
|
63
77
|
end
|
64
78
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
@@ -22,11 +20,11 @@ module Yasuri
|
|
22
20
|
end
|
23
21
|
|
24
22
|
Hash[child_results_kv]
|
25
|
-
end
|
23
|
+
end
|
26
24
|
end
|
27
25
|
|
28
26
|
def node_type_str
|
29
27
|
"links".freeze
|
30
28
|
end
|
31
|
-
end
|
32
|
-
end
|
29
|
+
end
|
30
|
+
end
|
@@ -10,7 +10,7 @@ module Yasuri
|
|
10
10
|
@opt = opt
|
11
11
|
end
|
12
12
|
|
13
|
-
def inject(agent, page, opt = {},
|
13
|
+
def inject(agent, page, opt = {}, _element = page)
|
14
14
|
child_results_kv = @children.map do |node|
|
15
15
|
[node.name, node.inject(agent, page, opt)]
|
16
16
|
end
|
@@ -19,7 +19,7 @@ module Yasuri
|
|
19
19
|
|
20
20
|
def to_h
|
21
21
|
node_hash = {}
|
22
|
-
self.opts.each{|k, v| node_hash[k] = v
|
22
|
+
self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
|
23
23
|
|
24
24
|
children.each do |child|
|
25
25
|
child_node_name = "#{child.node_type_str}_#{child.name}"
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -1,27 +1,24 @@
|
|
1
1
|
|
2
|
-
# Author:: TAC (tac@tac42.net)
|
3
|
-
|
4
2
|
require_relative 'yasuri_node'
|
5
3
|
|
6
4
|
module Yasuri
|
7
5
|
module Node
|
8
6
|
attr_reader :url, :xpath, :name, :children
|
9
7
|
|
10
|
-
def initialize(xpath, name, children = [], **
|
8
|
+
def initialize(xpath, name, children = [], **_opt)
|
11
9
|
@xpath, @name, @children = xpath, name, children
|
12
10
|
end
|
13
11
|
|
14
12
|
def scrape(uri, opt = {})
|
15
|
-
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
16
|
-
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
17
|
-
|
18
13
|
agent = Mechanize.new
|
19
|
-
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
20
14
|
scrape_with_agent(uri, agent, opt)
|
21
15
|
end
|
22
16
|
|
23
17
|
def scrape_with_agent(uri, agent, opt = {})
|
24
|
-
|
18
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
19
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
20
|
+
|
21
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
25
22
|
inject(agent, page, opt)
|
26
23
|
end
|
27
24
|
|