yasuri 3.0.0 → 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
#!/usr/bin/env ruby

require 'yasuri'

# Page the node tree below is applied to.
target_uri = "https://github.com/tac0x2a?tab=repositories"

# Option 1: build the node tree with the Ruby DSL.
# The nested nodes under `links_repo` are applied to each linked page.
root = Yasuri.map_root do
  text_title '/html/head/title'
  links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
    text_name  '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
    text_desc  '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
    text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
    text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
  end
end

# Option 2: build the same node tree from YAML (uncomment to use).
# src = <<-EOYML
# text_title: /html/head/title
# links_repo:
#   path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
#   text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
#   text_desc:
#     path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
#     proc: :strip
#   text_stars:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
#     proc: :to_i
#   text_forks:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
#     proc: :to_i
# EOYML
# root = Yasuri.yaml2tree(src)

# Scrape the page, passing a 100 ms interval for the requests.
scraped = root.scrape(target_uri, interval_ms: 100)

# Example of the scraped structure (as printed by `jj scraped`):
# {
#   "title": "tac0x2a (TAC) / Repositories · GitHub",
#   "repo": [
#     {
#       "name": "o-namazu",
#       "desc": "Oh Namazu (Catfish) in datalake",
#       "stars": 1,
#       "forks": 0
#     },
#     {
#       "name": "grebe",
#       "desc": "grebe in datalake",
#       "stars": 2,
#       "forks": 0
#     },
#     {
#       "name": "yasuri",
#       "desc": "Yasuri (鑢) is easy web scraping library.",
#       "stars": 43,
#       "forks": 1
#     },
#     {
#       "name": "dotfiles",
#       "desc": "dotfiles",
#       "stars": 0,
#       "forks": 0
#     }
#     ...
#   ]
# }

# Render the result as markdown: a heading for the page title, then one
# section per repository.
puts "# #{scraped['title']}"
scraped['repo'].each do |repo|
  puts "-----",
       "## #{repo['name']}",
       repo['desc'],
       "",
       "* Stars: #{repo['stars']}",
       "* Forks: #{repo['forks']}",
       ""
end
@@ -0,0 +1,15 @@
1
+
2
+ # yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
3
+ text_title: /html/head/title
4
+ links_repo:
5
+ path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
6
+ text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
7
+ text_desc:
8
+ path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
9
+ proc: :strip
10
+ text_stars:
11
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
12
+ proc: :to_i
13
+ text_forks:
14
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
15
+ proc: :to_i
@@ -0,0 +1,4 @@
1
+ {
2
+ "text_title": "/html/head/title",
3
+ "text_desc": "//*[@id=\"intro\"]/p"
4
+ }
@@ -0,0 +1,11 @@
1
+ # yasuri scrape "https://www.tac42.net/" -f sample.yml
2
+ links_each:
3
+ path: //*[@id="posts"]/article/header/h1/a
4
+ text_title: //*[@id="content"]/article/header/h1
5
+ text_description: /html/head/meta[12]/@content
6
+ text_date:
7
+ path: //*[@id="content"]/article/header/div/span
8
+ proc: :strip
9
+ text_length:
10
+ path: //*[@id="content"]
11
+ proc: :size
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Executable entry point: loads the gem and passes the command-line
# arguments to the Thor-based CLI class.
require "yasuri"

Yasuri::CLI.start(ARGV)
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
module Yasuri
  # Gem version. Frozen so the constant's string cannot be mutated in place.
  VERSION = "3.3.2".freeze
end
data/lib/yasuri/yasuri.rb CHANGED
@@ -1,6 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # Author:: TAC (tac@tac42.net)
4
1
 
5
2
  require 'mechanize'
6
3
  require 'json'
@@ -11,119 +8,142 @@ require_relative 'yasuri_text_node'
11
8
  require_relative 'yasuri_struct_node'
12
9
  require_relative 'yasuri_paginate_node'
13
10
  require_relative 'yasuri_links_node'
11
+ require_relative 'yasuri_map_node'
14
12
  require_relative 'yasuri_node_generator'
15
13
 
16
14
  module Yasuri
17
15
 
16
+ DefaultRetryCount = 5
17
+ DefaultInterval_ms = 0
18
+
18
19
# Builds a node tree from a JSON document written in Yasuri's tree format.
#
# @param json_string [String] JSON text; object keys are parsed as symbols
# @return [Object] the value returned by hash2node for the parsed document
# @raise [RuntimeError] when json_string is nil or empty
# @raise [JSON::ParserError] when json_string is not valid JSON
def self.json2tree(json_string)
  # A descriptive message matters: callers print e.message to the user.
  raise RuntimeError, "json string is nil or empty" if json_string.nil? || json_string.empty?

  node_hash = JSON.parse(json_string, symbolize_names: true)
  self.hash2node(node_hash)
end
26
25
 
27
26
# Builds a node tree from a YAML document written in Yasuri's tree format.
#
# @param yaml_string [String] YAML text; Symbol values (e.g. `proc: :strip`) are allowed
# @return [Object] the value returned by hash2node for the parsed document
# @raise [RuntimeError] when yaml_string is nil/empty or the document is not a mapping
def self.yaml2tree(yaml_string)
  raise RuntimeError, "yaml string is nil or empty" if yaml_string.nil? || yaml_string.empty?

  # Permitted classes must be passed by keyword: the positional form was
  # removed in Psych 4 (bundled with Ruby 3.1+) and raises ArgumentError there.
  # NOTE(review): the keyword form needs Psych >= 3.1 (Ruby 2.6+) - confirm the
  # gem's minimum supported Ruby.
  node_hash = YAML.safe_load(yaml_string, permitted_classes: [Symbol], symbolize_names: true)
  raise RuntimeError, "yaml document must be a mapping of node definitions" unless node_hash.is_a?(Hash)

  self.hash2node(node_hash.deep_symbolize_keys)
end
32
+
33
# Serializes a node tree to a JSON string.
#
# @param node [Object] root node of the tree; converted with node2hash
# @return [String] JSON text of the hash produced by node2hash
# @raise [RuntimeError] when node is nil
def self.tree2json(node)
  # Carry a message so callers that print e.message show something useful.
  raise RuntimeError, "node is nil" if node.nil?

  self.node2hash(node).to_json
end
32
38
 
33
- root_key, root = yaml.keys.first, yaml.values.first
34
- hash = Yasuri.yaml2tree_sub(root_key, root)
39
# Runs the given block, re-running it when it raises a StandardError.
#
# Every attempt (including the first) is preceded by a sleep of interval_ms
# milliseconds. After retry_count retries the last error is raised.
#
# @param retry_count [Integer] number of retries after the first attempt
# @param interval_ms [Numeric] sleep before each attempt, in milliseconds
# @return [Object, nil] the block's value, or nil when no block is given
def self.with_retry(retry_count = DefaultRetryCount, interval_ms = DefaultInterval_ms)
  Kernel.sleep(interval_ms * 0.001)
  yield if block_given?
rescue => error
  # Out of retries: surface the error that caused the last failure.
  raise error unless retry_count > 0

  retry_count -= 1
  retry
end
35
54
 
36
- Yasuri.hash2node(hash)
55
# Returns the key under which a node's result is stored.
#
# @param name [#to_sym] the node's name
# @param opt [Hash] options; a truthy :symbolize_names selects Symbol keys
# @return [Object] name.to_sym when :symbolize_names is truthy, otherwise name itself
def self.node_name(name, opt)
  return name unless opt[:symbolize_names]

  name.to_sym
end
38
59
 
39
- private
40
- def self.yaml2tree_sub(name, body)
41
- return nil if name.nil? or body.nil?
60
+ # private
42
61
 
43
- new_body = Hash[:name, name]
44
- body.each{|k,v| new_body[k.to_sym] = v}
45
- body = new_body
62
# Converts one level of a parsed tree definition into a node object.
#
# @param node_hash [Hash, String] a definition hash, or a bare String used as the node's path
# @param node_name [String, nil] name of the node being built; nil for the root
# @param node_type_class [Class, nil] class to instantiate; nil builds a Yasuri::MapNode
# @return [Object] the built node; for a root with exactly one child, that child
# @raise [RuntimeError] when the definition is nil, or is empty at the root
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  # Fail with a readable message instead of NoMethodError / an empty message.
  raise RuntimeError, "node definition is missing" if node_hash.nil?
  raise RuntimeError, "node tree definition is empty" if node_name.nil? && node_hash.empty?

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    # Shorthand form: the value is just the path, with no children or options.
    path = node_hash
  else
    child_nodes, opt, path = self.hash2child_node(node_hash)
  end

  # If only a single node is under the root, return only that node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  if node_type_class.nil?
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
61
86
 
62
- private
63
87
  Text2Node = {
64
88
  text: Yasuri::TextNode,
65
89
  struct: Yasuri::StructNode,
66
90
  links: Yasuri::LinksNode,
67
- pages: Yasuri::PaginateNode
91
+ pages: Yasuri::PaginateNode,
92
+ map: Yasuri::MapNode
68
93
  }
69
- Node2Text = Text2Node.invert
70
-
71
- ReservedKeys = %i|node name path children|
72
- def self.hash2node(node_h)
73
- node, name, path, children = ReservedKeys.map do |key|
74
- node_h[key]
75
- end
76
- children ||= []
77
94
 
78
- fail "Not found 'node' value in map" if node.nil?
79
- fail "Not found 'name' value in map" if name.nil?
80
- fail "Not found 'path' value in map" if path.nil?
95
+ NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
81
96
 
82
- childnodes = children.map{|c| Yasuri.hash2node(c) }
83
- ReservedKeys.each{|key| node_h.delete(key)}
84
- opt = node_h
97
# Splits one definition hash into child nodes, options and the node's path.
#
# Keys matching one of NodeRegexps ("<type>_<name>") become child nodes built
# through hash2node, the :path key supplies the path, and every other key is
# collected as an option.
#
# @param node_hash [Hash] definition of a single node
# @return [Array] a triple [child_nodes, opt, path]
def self.hash2child_node(node_hash)
  path = nil
  opt = {}
  child_nodes = []

  node_hash.each do |key, value|
    node_regexp = NodeRegexps.find { |re| key =~ re }

    case key
    when node_regexp
      # The `when` comparison re-runs the match, so the captures are current.
      type_name, child_name = Regexp.last_match.captures
      child_nodes << self.hash2node(value, child_name, Text2Node[type_name.to_sym])
    when :path
      path = value
    else
      opt[key] = value
    end
  end

  [child_nodes, opt, path]
end
104
122
 
105
- node.opts.each do |key,value|
106
- json[key] = value if not value.nil?
107
- end
123
# Converts a node into the hash form used for serialization.
#
# @param node [Object] a node responding to to_h (and, unless it is a
#   Yasuri::MapNode, to node_type_str and name)
# @return [Hash] node.to_h for a MapNode; otherwise a one-entry hash keyed
#   "<node_type_str>_<name>" whose value is node.to_h
def self.node2hash(node)
  if node.instance_of?(Yasuri::MapNode)
    node.to_h
  else
    key = "#{node.node_type_str}_#{node.name}"
    { key => node.to_h }
  end
end
111
130
 
112
- def self.NodeName(name, opt)
113
- symbolize_names = opt[:symbolize_names]
114
- symbolize_names ? name.to_sym : name
131
# Lets undefined module-level calls act as the node-building DSL: the call is
# passed to Yasuri::NodeGenerator.gen, and when that returns nil or false the
# default method_missing handling is used instead.
#
# @param method_name [Symbol] name of the missing method
# @param pattern [Object, nil] first positional argument of the call
# @param opt [Hash] keyword options of the call
# @return [Object] the truthy value produced by the generator
def self.method_missing(method_name, pattern=nil, **opt, &block)
  node = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  return node if node

  super(method_name, **opt)
end
116
135
 
117
- def self.with_retry(retry_count = 5)
118
- begin
119
- return yield() if block_given?
120
- rescue => e
121
- if retry_count > 0
122
- pp "retry #{retry_count}"
123
- retry_count -= 1
124
- retry
136
+ private_constant :Text2Node, :NodeRegexps
137
+ private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
138
+ end
139
+
140
class Hash
  # Defined only when no implementation is already loaded (e.g. from
  # ActiveSupport), so a richer existing version is never overridden.
  unless method_defined?(:deep_symbolize_keys)
    # Returns a new Hash whose keys are converted to Symbols, recursing into
    # values that are themselves Hashes. Keys that do not respond to to_sym
    # (such as Integers) are kept unchanged instead of raising NoMethodError.
    #
    # @return [Hash] a new hash; the receiver is not modified
    def deep_symbolize_keys
      each_with_object({}) do |(key, value), result|
        new_key = key.respond_to?(:to_sym) ? key.to_sym : key
        result[new_key] = value.is_a?(Hash) ? value.deep_symbolize_keys : value
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+
5
+ module Yasuri
6
+ class CLI < Thor
7
+ package_name "yasuri"
8
+
9
+ default_command :scrape
10
+ desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
11
+ "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
12
+ option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
13
+ option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
14
+ option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
15
# Validates the CLI options, builds the node tree, scrapes the given URI and
# prints the result: a String result is printed as-is, anything else as JSON.
# Errors from validation, tree building or scraping are printed to stderr.
#
# @param uri [String] address to scrape
# @return [Integer] 0 on success, -1 when an error was reported
def scrape(uri)
  result =
    begin
      test_arguments(options)
      interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
      tree = make_tree(options[:file], options[:json])
      tree.scrape(uri, interval_ms: interval_ms)
    rescue => e
      $stderr.puts e.message
      return -1
    end

  if result.instance_of?(String)
    puts result
  else
    j result
  end

  0
end
43
+
44
+ private
45
+
46
# Validates the tree-source options given on the command line.
#
# Exactly one of :file / :json must be present, and its value must be neither
# empty nor equal to the option's own name ("file" / "json").
# NOTE(review): the name comparison presumably catches a flag given without a
# value - confirm against Thor's parsing of string options.
#
# @param options [Hash] parsed options; reads :file and :json
# @return [nil] when the options are valid
# @raise [RuntimeError] with a user-facing message when they are not
def test_arguments(options)
  file_path = options[:file]
  json_string = options[:json]

  unless [file_path, json_string].compact.count == 1
    raise "ERROR: Only one of `--file` or `--json` option should be specified."
  end

  # Each check inspects only its own option, so an empty --json value is
  # reported as a --json problem rather than a --file one.
  if file_path&.empty? || file_path == "file"
    raise "ERROR: --file option require not empty argument."
  end

  if json_string&.empty? || json_string == "json"
    raise "ERROR: --json option require not empty argument."
  end
end
56
+
57
# Builds the node tree from a tree file when file_path is given, otherwise
# from the JSON string. Any failure is re-raised as a RuntimeError whose
# message names the source and includes the original error's message.
#
# @param file_path [String, nil] path of a tree file
# @param json_string [String, nil] tree definition as JSON; used when file_path is not given
# @return [Object] the tree built by make_tree_from_file or Yasuri.json2tree
# @raise [RuntimeError] when reading or converting the definition fails
def make_tree(file_path, json_string)
  unless file_path
    begin
      return Yasuri.json2tree(json_string)
    rescue => error
      raise "ERROR: Failed to convert json to yasuri tree. #{error.message}"
    end
  end

  begin
    make_tree_from_file(File.read(file_path))
  rescue => error
    raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{error.message}"
  end
end
73
+
74
# Converts the contents of a tree file into a node tree: the text is tried as
# JSON first, and if that raises a StandardError it is converted as YAML.
#
# @param src [String] contents of the tree file
# @return [Object] the tree returned by Yasuri.json2tree or Yasuri.yaml2tree
def make_tree_from_file(src)
  Yasuri.json2tree(src)
rescue StandardError
  Yasuri.yaml2tree(src)
end
77
+ end
78
+ end
@@ -1,26 +1,30 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  class LinksNode
8
6
  include Node
7
+
9
8
# Follows every link matched by this node's xpath and applies the child nodes
# to each linked page.
#
# @param agent [Object] agent passed to Mechanize::Page::Link and the child nodes
# @param page [Object] page the links belong to
# @param opt [Hash] options; reads :retry_count and :interval_ms (falling back
#   to the Yasuri defaults) and is passed on to the children and node_name
# @param element [Object] element searched for links; defaults to page
# @return [Array<Hash>] one hash per link, mapping each child's name to its result
def inject(agent, page, opt = {}, element = page)
  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
  interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms

  (element.search(@xpath) || []).map do |link|
    anchor = Mechanize::Page::Link.new(link, agent, page)
    child_page = Yasuri.with_retry(retry_count, interval_ms) { anchor.click }

    @children.each_with_object({}) do |child, scraped|
      scraped[Yasuri.node_name(child.name, opt)] = child.inject(agent, child_page, opt)
    end
  end
end
25
+
26
# Type label of this node.
#
# @return [String] the frozen string "links"
def node_type_str
  'links'.freeze
end
25
29
  end
26
30
  end