yasuri 3.3.0 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a7bf438a08fc83fec7e78cb5543577c98f6cc98b4f5fae7b0dd969f2049c0531
4
- data.tar.gz: e399c6b57589b7d8ba2e8eff7a1d204fa7f8e676f82f631057e19a9377333060
3
+ metadata.gz: 0ec1d3a8cc766976c1d1329448a52df1696ccb50c41bf7c714b28cd265470d54
4
+ data.tar.gz: a87a1cd109c0e3dd8d820d3f35e34ef3096ef493d53d9c6bab0a79f1ccd7df9a
5
5
  SHA512:
6
- metadata.gz: 56f39994972657712cb7d95e5ceaadefca8de41e06c2cd4759363b496d7c8531fad7517f9df99bf2446c144f01c5cd82cbc94146c432d6b5b552f092b975ecd7
7
- data.tar.gz: cf74a25615187ecbe5f8ca5f2072679fa9cc1902dfa3bf2190b87e11104f332688cdaee16f4be6cb00f9ed63fa18f2ec8f27cf32b0b27389f81d98229fa212e6
6
+ metadata.gz: 5f7787bfdc549e70b9e5a5ae45f84f3a0316142deea3c75f17d9781c8140e597d41ccbfd1679e8186799f76e771a24a1d6129402c818f5d6e3ea0b94128a2185
7
+ data.tar.gz: 61e430d7dae6fda7c28a9456f1751206cfcd01321ff71feeeba7447606a1eb10d7270dda5c8947a4cc8edb3dae60d6d57fc79f127e61920249fcab0ca47f4e37
@@ -19,7 +19,7 @@ jobs:
19
19
  runs-on: ubuntu-latest
20
20
  strategy:
21
21
  matrix:
22
- ruby-version: ['2.6', '2.7', '3.0']
22
+ ruby-version: ['2.7', '3.0']
23
23
 
24
24
  steps:
25
25
  - uses: actions/checkout@v2
data/.rubocop.yml ADDED
@@ -0,0 +1,49 @@
1
+
2
+ # inherit_from: .rubocop_todo.yml
3
+
4
+ inherit_mode:
5
+ merge:
6
+ - Exclude
7
+
8
+ require:
9
+ - rubocop-performance
10
+ - rubocop-rspec
11
+ - rubocop-rubycw
12
+
13
+ AllCops:
14
+ DisabledByDefault: true
15
+ DisplayCopNames: true
16
+ Exclude:
17
+ - 'gems/**/*'
18
+ - 'pkg/**/*'
19
+ - 'coverage/**/*'
20
+ - 'exe/**/*'
21
+
22
+ NewCops: enable
23
+
24
+ Bundler:
25
+ Enabled: true
26
+
27
+ Gemspec:
28
+ Enabled: true
29
+
30
+ Lint:
31
+ Enabled: true
32
+
33
+ Performance:
34
+ Enabled: true
35
+
36
+ Rubycw:
37
+ Enabled: true
38
+
39
+ Security:
40
+ Enabled: true
41
+
42
+ Style/HashSyntax:
43
+ EnforcedStyle: ruby19
44
+ Style/HashEachMethods:
45
+ Enabled: true
46
+ Style/HashTransformKeys:
47
+ Enabled: true
48
+ Style/HashTransformValues:
49
+ Enabled: true
data/.rubocop_todo.yml ADDED
File without changes
data/README.md CHANGED
@@ -2,9 +2,11 @@
2
2
  [![Build Status](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml/badge.svg)](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
3
3
  [![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Maintainability](https://api.codeclimate.com/v1/badges/c29480fea1305afe999f/maintainability)](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
4
4
 
5
- Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)", and CLI tool using it.
5
+ Yasuri (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
6
+ It performs scraping by simply describing the expected result in a simple declarative notation.
6
7
 
7
- Yasuri can reduce frequently processes in Scraping.
8
+ Yasuri makes it easy to write common scraping operations.
9
+ For example, the following processes can be easily implemented.
8
10
 
9
11
  For example,
10
12
 
data/Rakefile CHANGED
@@ -3,5 +3,5 @@ require "rspec/core/rake_task"
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
7
7
 
data/examples/example.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- # Author:: TAC (tac@tac42.net)
3
2
 
4
3
  require 'yasuri'
5
4
  uri = "https://github.com/tac0x2a?tab=repositories"
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "3.3.0"
2
+ VERSION = "3.3.1"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -1,6 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # Author:: TAC (tac@tac42.net)
4
1
 
5
2
  require 'mechanize'
6
3
  require 'json'
@@ -23,41 +20,48 @@ module Yasuri
23
20
  raise RuntimeError if json_string.nil? or json_string.empty?
24
21
 
25
22
  node_hash = JSON.parse(json_string, {symbolize_names: true})
26
- Yasuri.hash2node(node_hash)
23
+ self.hash2node(node_hash)
24
+ end
25
+
26
+ def self.yaml2tree(yaml_string)
27
+ raise RuntimeError if yaml_string.nil? or yaml_string.empty?
28
+
29
+ node_hash = YAML.safe_load(yaml_string, [Symbol], symbolize_names: true)
30
+ self.hash2node(node_hash.deep_symbolize_keys)
27
31
  end
28
32
 
29
33
  def self.tree2json(node)
30
34
  raise RuntimeError if node.nil?
31
35
 
32
- Yasuri.node2hash(node).to_json
36
+ self.node2hash(node).to_json
33
37
  end
34
38
 
35
- def self.yaml2tree(yaml_string)
36
- raise RuntimeError if yaml_string.nil? or yaml_string.empty?
39
+ def self.with_retry(
40
+ retry_count = DefaultRetryCount,
41
+ interval_ms = DefaultInterval_ms)
37
42
 
38
- node_hash = YAML.load(yaml_string)
39
- Yasuri.hash2node(node_hash.deep_symbolize_keys)
43
+ begin
44
+ Kernel.sleep(interval_ms * 0.001)
45
+ return yield() if block_given?
46
+ rescue => e
47
+ if retry_count > 0
48
+ retry_count -= 1
49
+ retry
50
+ end
51
+ fail e
52
+ end
40
53
  end
41
54
 
42
- private
43
- def self.method_missing(method_name, pattern=nil, **opt, &block)
44
- generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
45
- generated || super(method_name, **opt)
55
+ def self.node_name(name, opt)
56
+ symbolize_names = opt[:symbolize_names]
57
+ symbolize_names ? name.to_sym : name
46
58
  end
47
59
 
48
- private
49
- Text2Node = {
50
- text: Yasuri::TextNode,
51
- struct: Yasuri::StructNode,
52
- links: Yasuri::LinksNode,
53
- pages: Yasuri::PaginateNode,
54
- map: Yasuri::MapNode
55
- }
60
+ # private
56
61
 
57
62
  def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
58
63
  raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
59
64
 
60
- node_prefixes = Text2Node.keys.freeze
61
65
  child_nodes = []
62
66
  opt = {}
63
67
  path = nil
@@ -65,27 +69,7 @@ module Yasuri
65
69
  if node_hash.is_a?(String)
66
70
  path = node_hash
67
71
  else
68
- node_hash.each do |key, value|
69
- # is node?
70
- node_regexps = Text2Node.keys.map do |node_type_sym|
71
- /^(#{node_type_sym.to_s})_(.+)$/
72
- end
73
- node_regexp = node_regexps.find do |node_regexp|
74
- key =~ node_regexp
75
- end
76
-
77
- case key
78
- when node_regexp
79
- node_type_sym = $1.to_sym
80
- child_node_name = $2
81
- child_node_type = Text2Node[node_type_sym]
82
- child_nodes << self.hash2node(value, child_node_name, child_node_type)
83
- when :path
84
- path = value
85
- else
86
- opt[key] = value
87
- end
88
- end
72
+ child_nodes, opt, path = self.hash2child_node(node_hash)
89
73
  end
90
74
 
91
75
  # If only single node under root, return only the node.
@@ -100,6 +84,42 @@ module Yasuri
100
84
  node
101
85
  end
102
86
 
87
+ Text2Node = {
88
+ text: Yasuri::TextNode,
89
+ struct: Yasuri::StructNode,
90
+ links: Yasuri::LinksNode,
91
+ pages: Yasuri::PaginateNode,
92
+ map: Yasuri::MapNode
93
+ }
94
+
95
+ NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
96
+
97
+ def self.hash2child_node(node_hash)
98
+ child_nodes = []
99
+ opt = {}
100
+ path = nil
101
+
102
+ node_hash.each do |key, value|
103
+ # is node?
104
+
105
+ node_regexp = NodeRegexps.find { |r| key =~ r }
106
+
107
+ case key
108
+ when node_regexp
109
+ node_type_sym = $1.to_sym
110
+ child_node_name = $2
111
+ child_node_type = Text2Node[node_type_sym]
112
+ child_nodes << self.hash2node(value, child_node_name, child_node_type)
113
+ when :path
114
+ path = value
115
+ else
116
+ opt[key] = value
117
+ end
118
+ end
119
+
120
+ [child_nodes, opt, path]
121
+ end
122
+
103
123
  def self.node2hash(node)
104
124
  return node.to_h if node.instance_of?(Yasuri::MapNode)
105
125
 
@@ -108,26 +128,13 @@ module Yasuri
108
128
  }
109
129
  end
110
130
 
111
- def self.node_name(name, opt)
112
- symbolize_names = opt[:symbolize_names]
113
- symbolize_names ? name.to_sym : name
131
+ def self.method_missing(method_name, pattern=nil, **opt, &block)
132
+ generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
133
+ generated || super(method_name, **opt)
114
134
  end
115
135
 
116
- def self.with_retry(
117
- retry_count = DefaultRetryCount,
118
- interval_ms = DefaultInterval_ms)
119
-
120
- begin
121
- Kernel.sleep(interval_ms * 0.001)
122
- return yield() if block_given?
123
- rescue => e
124
- if retry_count > 0
125
- retry_count -= 1
126
- retry
127
- end
128
- fail e
129
- end
130
- end
136
+ private_constant :Text2Node, :NodeRegexps
137
+ private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
131
138
  end
132
139
 
133
140
  class Hash
@@ -7,50 +7,30 @@ module Yasuri
7
7
  package_name "yasuri"
8
8
 
9
9
  default_command :scrape
10
- desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
10
+ desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
11
+ "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
11
12
  option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
12
13
  option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
13
14
  option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
14
15
  def scrape(uri)
15
- # argument validations
16
- if [options[:file], options[:json]].compact.count != 1
17
- $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
18
- return -1
19
- end
20
- if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
21
- $stderr.puts "ERROR: --file option require not empty argument."
22
- return -1
23
- end
24
- if options[:json]&.empty? or options[:json] == "json"
25
- $stderr.puts "ERROR: --json option require not empty argument."
16
+ begin
17
+ test_arguments(options)
18
+ rescue => e
19
+ $stderr.puts e.message
26
20
  return -1
27
21
  end
28
22
 
29
23
  interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
24
+ file_path = options[:file]
25
+ json_string = options[:json]
30
26
 
31
- tree = if options[:file]
32
- src = File.read(options[:file])
33
-
34
- begin
35
- Yasuri.json2tree(src)
36
- rescue
37
- begin
38
- Yasuri.yaml2tree(src)
39
- rescue => e
40
- $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
41
- return -1
42
- end
43
- end
44
- else
45
- begin
46
- Yasuri.json2tree(options[:json])
47
- rescue => e
48
- $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
49
- return -1
50
- end
51
- end
52
-
53
- result = tree.scrape(uri, interval_ms: interval_ms)
27
+ begin
28
+ tree = make_tree(file_path, json_string)
29
+ result = tree.scrape(uri, interval_ms: interval_ms)
30
+ rescue => e
31
+ $stderr.puts e.message
32
+ return -1
33
+ end
54
34
 
55
35
  if result.instance_of?(String)
56
36
  puts result
@@ -60,5 +40,39 @@ module Yasuri
60
40
 
61
41
  return 0
62
42
  end
43
+
44
+ private
45
+
46
+ def test_arguments(options)
47
+ too_many_options = [options[:file], options[:json]].compact.count != 1
48
+ raise "ERROR: Only one of `--file` or `--json` option should be specified." if too_many_options
49
+
50
+ empty_file_argument = options[:file]&.empty? || options[:file] == "file" || options[:json]&.empty?
51
+ raise "ERROR: --file option require not empty argument." if empty_file_argument
52
+
53
+ empty_json_string_argument = options[:json]&.empty? || options[:json] == "json"
54
+ raise "ERROR: --json option require not empty argument." if empty_json_string_argument
55
+ end
56
+
57
+ def make_tree(file_path, json_string)
58
+ if file_path
59
+ begin
60
+ src = File.read(file_path)
61
+ make_tree_from_file(src)
62
+ rescue => e
63
+ raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
64
+ end
65
+ else
66
+ begin
67
+ Yasuri.json2tree(json_string)
68
+ rescue => e
69
+ raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
70
+ end
71
+ end
72
+ end
73
+
74
+ def make_tree_from_file(src)
75
+ Yasuri.json2tree(src) rescue Yasuri.yaml2tree(src)
76
+ end
63
77
  end
64
78
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -22,11 +20,11 @@ module Yasuri
22
20
  end
23
21
 
24
22
  Hash[child_results_kv]
25
- end # each named child node
23
+ end
26
24
  end
27
25
 
28
26
  def node_type_str
29
27
  "links".freeze
30
28
  end
31
- end # class
32
- end # module
29
+ end
30
+ end
@@ -10,7 +10,7 @@ module Yasuri
10
10
  @opt = opt
11
11
  end
12
12
 
13
- def inject(agent, page, opt = {}, element = page)
13
+ def inject(agent, page, opt = {}, _element = page)
14
14
  child_results_kv = @children.map do |node|
15
15
  [node.name, node.inject(agent, page, opt)]
16
16
  end
@@ -19,7 +19,7 @@ module Yasuri
19
19
 
20
20
  def to_h
21
21
  node_hash = {}
22
- self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
22
+ self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
23
23
 
24
24
  children.each do |child|
25
25
  child_node_name = "#{child.node_type_str}_#{child.name}"
@@ -1,27 +1,24 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  module Node
8
6
  attr_reader :url, :xpath, :name, :children
9
7
 
10
- def initialize(xpath, name, children = [], **opt)
8
+ def initialize(xpath, name, children = [], **_opt)
11
9
  @xpath, @name, @children = xpath, name, children
12
10
  end
13
11
 
14
12
  def scrape(uri, opt = {})
15
- retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
16
- interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
17
-
18
13
  agent = Mechanize.new
19
- page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
20
14
  scrape_with_agent(uri, agent, opt)
21
15
  end
22
16
 
23
17
  def scrape_with_agent(uri, agent, opt = {})
24
- page = agent.get(uri)
18
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
19
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
20
+
21
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
25
22
  inject(agent, page, opt)
26
23
  end
27
24