yasuri 3.3.0 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a7bf438a08fc83fec7e78cb5543577c98f6cc98b4f5fae7b0dd969f2049c0531
4
- data.tar.gz: e399c6b57589b7d8ba2e8eff7a1d204fa7f8e676f82f631057e19a9377333060
3
+ metadata.gz: 0ec1d3a8cc766976c1d1329448a52df1696ccb50c41bf7c714b28cd265470d54
4
+ data.tar.gz: a87a1cd109c0e3dd8d820d3f35e34ef3096ef493d53d9c6bab0a79f1ccd7df9a
5
5
  SHA512:
6
- metadata.gz: 56f39994972657712cb7d95e5ceaadefca8de41e06c2cd4759363b496d7c8531fad7517f9df99bf2446c144f01c5cd82cbc94146c432d6b5b552f092b975ecd7
7
- data.tar.gz: cf74a25615187ecbe5f8ca5f2072679fa9cc1902dfa3bf2190b87e11104f332688cdaee16f4be6cb00f9ed63fa18f2ec8f27cf32b0b27389f81d98229fa212e6
6
+ metadata.gz: 5f7787bfdc549e70b9e5a5ae45f84f3a0316142deea3c75f17d9781c8140e597d41ccbfd1679e8186799f76e771a24a1d6129402c818f5d6e3ea0b94128a2185
7
+ data.tar.gz: 61e430d7dae6fda7c28a9456f1751206cfcd01321ff71feeeba7447606a1eb10d7270dda5c8947a4cc8edb3dae60d6d57fc79f127e61920249fcab0ca47f4e37
@@ -19,7 +19,7 @@ jobs:
19
19
  runs-on: ubuntu-latest
20
20
  strategy:
21
21
  matrix:
22
- ruby-version: ['2.6', '2.7', '3.0']
22
+ ruby-version: ['2.7', '3.0']
23
23
 
24
24
  steps:
25
25
  - uses: actions/checkout@v2
data/.rubocop.yml ADDED
@@ -0,0 +1,49 @@
1
+
2
+ # inherit_from: .rubocop_todo.yml
3
+
4
+ inherit_mode:
5
+ merge:
6
+ - Exclude
7
+
8
+ require:
9
+ - rubocop-performance
10
+ - rubocop-rspec
11
+ - rubocop-rubycw
12
+
13
+ AllCops:
14
+ DisabledByDefault: true
15
+ DisplayCopNames: true
16
+ Exclude:
17
+ - 'gems/**/*'
18
+ - 'pkg/**/*'
19
+ - 'coverage/**/*'
20
+ - 'exe/**/*'
21
+
22
+ NewCops: enable
23
+
24
+ Bundler:
25
+ Enabled: true
26
+
27
+ Gemspec:
28
+ Enabled: true
29
+
30
+ Lint:
31
+ Enabled: true
32
+
33
+ Performance:
34
+ Enabled: true
35
+
36
+ Rubycw:
37
+ Enabled: true
38
+
39
+ Security:
40
+ Enabled: true
41
+
42
+ Style/HashSyntax:
43
+ EnforcedStyle: ruby19
44
+ Style/HashEachMethods:
45
+ Enabled: true
46
+ Style/HashTransformKeys:
47
+ Enabled: true
48
+ Style/HashTransformValues:
49
+ Enabled: true
data/.rubocop_todo.yml ADDED
File without changes
data/README.md CHANGED
@@ -2,9 +2,11 @@
2
2
  [![Build Status](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml/badge.svg)](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
3
3
  [![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Maintainability](https://api.codeclimate.com/v1/badges/c29480fea1305afe999f/maintainability)](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
4
4
 
5
- Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)", and CLI tool using it.
5
+ Yasuri (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
6
+ It performs scraping by simply describing the expected result in a simple declarative notation.
6
7
 
7
- Yasuri can reduce frequently processes in Scraping.
8
+ Yasuri makes it easy to write common scraping operations.
9
+ For example, the following processes can be easily implemented.
8
10
 
9
11
  For example,
10
12
 
data/Rakefile CHANGED
@@ -3,5 +3,5 @@ require "rspec/core/rake_task"
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
7
7
 
data/examples/example.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- # Author:: TAC (tac@tac42.net)
3
2
 
4
3
  require 'yasuri'
5
4
  uri = "https://github.com/tac0x2a?tab=repositories"
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "3.3.0"
2
+ VERSION = "3.3.1"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -1,6 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # Author:: TAC (tac@tac42.net)
4
1
 
5
2
  require 'mechanize'
6
3
  require 'json'
@@ -23,41 +20,48 @@ module Yasuri
23
20
  raise RuntimeError if json_string.nil? or json_string.empty?
24
21
 
25
22
  node_hash = JSON.parse(json_string, {symbolize_names: true})
26
- Yasuri.hash2node(node_hash)
23
+ self.hash2node(node_hash)
24
+ end
25
+
26
+ def self.yaml2tree(yaml_string)
27
+ raise RuntimeError if yaml_string.nil? or yaml_string.empty?
28
+
29
+ node_hash = YAML.safe_load(yaml_string, [Symbol], symbolize_names: true)
30
+ self.hash2node(node_hash.deep_symbolize_keys)
27
31
  end
28
32
 
29
33
  def self.tree2json(node)
30
34
  raise RuntimeError if node.nil?
31
35
 
32
- Yasuri.node2hash(node).to_json
36
+ self.node2hash(node).to_json
33
37
  end
34
38
 
35
- def self.yaml2tree(yaml_string)
36
- raise RuntimeError if yaml_string.nil? or yaml_string.empty?
39
+ def self.with_retry(
40
+ retry_count = DefaultRetryCount,
41
+ interval_ms = DefaultInterval_ms)
37
42
 
38
- node_hash = YAML.load(yaml_string)
39
- Yasuri.hash2node(node_hash.deep_symbolize_keys)
43
+ begin
44
+ Kernel.sleep(interval_ms * 0.001)
45
+ return yield() if block_given?
46
+ rescue => e
47
+ if retry_count > 0
48
+ retry_count -= 1
49
+ retry
50
+ end
51
+ fail e
52
+ end
40
53
  end
41
54
 
42
- private
43
- def self.method_missing(method_name, pattern=nil, **opt, &block)
44
- generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
45
- generated || super(method_name, **opt)
55
+ def self.node_name(name, opt)
56
+ symbolize_names = opt[:symbolize_names]
57
+ symbolize_names ? name.to_sym : name
46
58
  end
47
59
 
48
- private
49
- Text2Node = {
50
- text: Yasuri::TextNode,
51
- struct: Yasuri::StructNode,
52
- links: Yasuri::LinksNode,
53
- pages: Yasuri::PaginateNode,
54
- map: Yasuri::MapNode
55
- }
60
+ # private
56
61
 
57
62
  def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
58
63
  raise RuntimeError.new("") if node_name.nil? and node_hash.empty?
59
64
 
60
- node_prefixes = Text2Node.keys.freeze
61
65
  child_nodes = []
62
66
  opt = {}
63
67
  path = nil
@@ -65,27 +69,7 @@ module Yasuri
65
69
  if node_hash.is_a?(String)
66
70
  path = node_hash
67
71
  else
68
- node_hash.each do |key, value|
69
- # is node?
70
- node_regexps = Text2Node.keys.map do |node_type_sym|
71
- /^(#{node_type_sym.to_s})_(.+)$/
72
- end
73
- node_regexp = node_regexps.find do |node_regexp|
74
- key =~ node_regexp
75
- end
76
-
77
- case key
78
- when node_regexp
79
- node_type_sym = $1.to_sym
80
- child_node_name = $2
81
- child_node_type = Text2Node[node_type_sym]
82
- child_nodes << self.hash2node(value, child_node_name, child_node_type)
83
- when :path
84
- path = value
85
- else
86
- opt[key] = value
87
- end
88
- end
72
+ child_nodes, opt, path = self.hash2child_node(node_hash)
89
73
  end
90
74
 
91
75
  # If only single node under root, return only the node.
@@ -100,6 +84,42 @@ module Yasuri
100
84
  node
101
85
  end
102
86
 
87
+ Text2Node = {
88
+ text: Yasuri::TextNode,
89
+ struct: Yasuri::StructNode,
90
+ links: Yasuri::LinksNode,
91
+ pages: Yasuri::PaginateNode,
92
+ map: Yasuri::MapNode
93
+ }
94
+
95
+ NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
96
+
97
+ def self.hash2child_node(node_hash)
98
+ child_nodes = []
99
+ opt = {}
100
+ path = nil
101
+
102
+ node_hash.each do |key, value|
103
+ # is node?
104
+
105
+ node_regexp = NodeRegexps.find { |r| key =~ r }
106
+
107
+ case key
108
+ when node_regexp
109
+ node_type_sym = $1.to_sym
110
+ child_node_name = $2
111
+ child_node_type = Text2Node[node_type_sym]
112
+ child_nodes << self.hash2node(value, child_node_name, child_node_type)
113
+ when :path
114
+ path = value
115
+ else
116
+ opt[key] = value
117
+ end
118
+ end
119
+
120
+ [child_nodes, opt, path]
121
+ end
122
+
103
123
  def self.node2hash(node)
104
124
  return node.to_h if node.instance_of?(Yasuri::MapNode)
105
125
 
@@ -108,26 +128,13 @@ module Yasuri
108
128
  }
109
129
  end
110
130
 
111
- def self.node_name(name, opt)
112
- symbolize_names = opt[:symbolize_names]
113
- symbolize_names ? name.to_sym : name
131
+ def self.method_missing(method_name, pattern=nil, **opt, &block)
132
+ generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
133
+ generated || super(method_name, **opt)
114
134
  end
115
135
 
116
- def self.with_retry(
117
- retry_count = DefaultRetryCount,
118
- interval_ms = DefaultInterval_ms)
119
-
120
- begin
121
- Kernel.sleep(interval_ms * 0.001)
122
- return yield() if block_given?
123
- rescue => e
124
- if retry_count > 0
125
- retry_count -= 1
126
- retry
127
- end
128
- fail e
129
- end
130
- end
136
+ private_constant :Text2Node, :NodeRegexps
137
+ private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
131
138
  end
132
139
 
133
140
  class Hash
@@ -7,50 +7,30 @@ module Yasuri
7
7
  package_name "yasuri"
8
8
 
9
9
  default_command :scrape
10
- desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
10
+ desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
11
+ "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
11
12
  option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
12
13
  option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
13
14
  option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
14
15
  def scrape(uri)
15
- # argument validations
16
- if [options[:file], options[:json]].compact.count != 1
17
- $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
18
- return -1
19
- end
20
- if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
21
- $stderr.puts "ERROR: --file option require not empty argument."
22
- return -1
23
- end
24
- if options[:json]&.empty? or options[:json] == "json"
25
- $stderr.puts "ERROR: --json option require not empty argument."
16
+ begin
17
+ test_arguments(options)
18
+ rescue => e
19
+ $stderr.puts e.message
26
20
  return -1
27
21
  end
28
22
 
29
23
  interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
24
+ file_path = options[:file]
25
+ json_string = options[:json]
30
26
 
31
- tree = if options[:file]
32
- src = File.read(options[:file])
33
-
34
- begin
35
- Yasuri.json2tree(src)
36
- rescue
37
- begin
38
- Yasuri.yaml2tree(src)
39
- rescue => e
40
- $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
41
- return -1
42
- end
43
- end
44
- else
45
- begin
46
- Yasuri.json2tree(options[:json])
47
- rescue => e
48
- $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
49
- return -1
50
- end
51
- end
52
-
53
- result = tree.scrape(uri, interval_ms: interval_ms)
27
+ begin
28
+ tree = make_tree(file_path, json_string)
29
+ result = tree.scrape(uri, interval_ms: interval_ms)
30
+ rescue => e
31
+ $stderr.puts e.message
32
+ return -1
33
+ end
54
34
 
55
35
  if result.instance_of?(String)
56
36
  puts result
@@ -60,5 +40,39 @@ module Yasuri
60
40
 
61
41
  return 0
62
42
  end
43
+
44
+ private
45
+
46
+ def test_arguments(options)
47
+ too_many_options = [options[:file], options[:json]].compact.count != 1
48
+ raise "ERROR: Only one of `--file` or `--json` option should be specified." if too_many_options
49
+
50
+ empty_file_argument = options[:file]&.empty? || options[:file] == "file" || options[:json]&.empty?
51
+ raise "ERROR: --file option require not empty argument." if empty_file_argument
52
+
53
+ empty_json_string_argument = options[:json]&.empty? || options[:json] == "json"
54
+ raise "ERROR: --json option require not empty argument." if empty_json_string_argument
55
+ end
56
+
57
+ def make_tree(file_path, json_string)
58
+ if file_path
59
+ begin
60
+ src = File.read(file_path)
61
+ make_tree_from_file(src)
62
+ rescue => e
63
+ raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
64
+ end
65
+ else
66
+ begin
67
+ Yasuri.json2tree(json_string)
68
+ rescue => e
69
+ raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
70
+ end
71
+ end
72
+ end
73
+
74
+ def make_tree_from_file(src)
75
+ Yasuri.json2tree(src) rescue Yasuri.yaml2tree(src)
76
+ end
63
77
  end
64
78
  end
@@ -1,6 +1,4 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
@@ -22,11 +20,11 @@ module Yasuri
22
20
  end
23
21
 
24
22
  Hash[child_results_kv]
25
- end # each named child node
23
+ end
26
24
  end
27
25
 
28
26
  def node_type_str
29
27
  "links".freeze
30
28
  end
31
- end # class
32
- end # module
29
+ end
30
+ end
@@ -10,7 +10,7 @@ module Yasuri
10
10
  @opt = opt
11
11
  end
12
12
 
13
- def inject(agent, page, opt = {}, element = page)
13
+ def inject(agent, page, opt = {}, _element = page)
14
14
  child_results_kv = @children.map do |node|
15
15
  [node.name, node.inject(agent, page, opt)]
16
16
  end
@@ -19,7 +19,7 @@ module Yasuri
19
19
 
20
20
  def to_h
21
21
  node_hash = {}
22
- self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
22
+ self.opts.each { |k, v| node_hash[k] = v unless v.nil? }
23
23
 
24
24
  children.each do |child|
25
25
  child_node_name = "#{child.node_type_str}_#{child.name}"
@@ -1,27 +1,24 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  module Node
8
6
  attr_reader :url, :xpath, :name, :children
9
7
 
10
- def initialize(xpath, name, children = [], **opt)
8
+ def initialize(xpath, name, children = [], **_opt)
11
9
  @xpath, @name, @children = xpath, name, children
12
10
  end
13
11
 
14
12
  def scrape(uri, opt = {})
15
- retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
16
- interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
17
-
18
13
  agent = Mechanize.new
19
- page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
20
14
  scrape_with_agent(uri, agent, opt)
21
15
  end
22
16
 
23
17
  def scrape_with_agent(uri, agent, opt = {})
24
- page = agent.get(uri)
18
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
19
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
20
+
21
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
25
22
  inject(agent, page, opt)
26
23
  end
27
24