yasuri 2.0.13 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,78 @@
1
#!/usr/bin/env ruby

# Example: scrape a GitHub profile's repository list with Yasuri and print
# the result as Markdown.

require 'yasuri'
profile_uri = "https://github.com/tac0x2a?tab=repositories"

# Build the node tree with the Ruby DSL.
tree = Yasuri.map_root do
  text_title '/html/head/title'
  links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
    text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
    text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
    text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
    text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
  end
end

# Alternative: build the same node tree from YAML.
# src = <<-EOYML
# text_title: /html/head/title
# links_repo:
#   path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
#   text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
#   text_desc:
#     path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
#     proc: :strip
#   text_stars:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
#     proc: :to_i
#   text_forks:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
#     proc: :to_i
# EOYML
# tree = Yasuri.yaml2tree(src)

# Scrape the profile page, passing a 100 ms interval to the scraper.
scraped = tree.scrape(profile_uri, interval_ms: 100)
# Sample of what `jj scraped` prints:
# {
#   "title": "tac0x2a (TAC) / Repositories · GitHub",
#   "repo": [
#     {
#       "name": "o-namazu",
#       "desc": "Oh Namazu (Catfish) in datalake",
#       "stars": 1,
#       "forks": 0
#     },
#     {
#       "name": "grebe",
#       "desc": "grebe in datalake",
#       "stars": 2,
#       "forks": 0
#     },
#     {
#       "name": "yasuri",
#       "desc": "Yasuri (鑢) is easy web scraping library.",
#       "stars": 43,
#       "forks": 1
#     },
#     {
#       "name": "dotfiles",
#       "desc": "dotfiles",
#       "stars": 0,
#       "forks": 0
#     }
#     ...
#   ]
# }

# Render the scraped data as Markdown: one section per repository.
puts "# #{scraped['title']}"
scraped['repo'].each do |repo|
  puts "-----",
       "## #{repo['name']}",
       repo['desc'],
       "",
       "* Stars: #{repo['stars']}",
       "* Forks: #{repo['forks']}",
       ""
end
@@ -0,0 +1,15 @@
1
+
2
+ # yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
3
+ text_title: /html/head/title
4
+ links_repo:
5
+ path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
6
+ text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
7
+ text_desc:
8
+ path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
9
+ proc: :strip
10
+ text_stars:
11
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
12
+ proc: :to_i
13
+ text_forks:
14
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
15
+ proc: :to_i
@@ -0,0 +1,4 @@
1
+ {
2
+ "text_title": "/html/head/title",
3
+ "text_desc": "//*[@id=\"intro\"]/p"
4
+ }
@@ -0,0 +1,11 @@
1
+ # yasuri scrape "https://www.tac42.net/" -f sample.yml
2
+ links_each:
3
+ path: //*[@id="posts"]/article/header/h1/a
4
+ text_title: //*[@id="content"]/article/header/h1
5
+ text_description: /html/head/meta[12]/@content
6
+ text_date:
7
+ path: //*[@id="content"]/article/header/div/span
8
+ proc: :strip
9
+ text_length:
10
+ path: //*[@id="content"]
11
+ proc: :size
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "yasuri"
4
+
5
+ Yasuri::CLI.start
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.13"
2
+ VERSION = "3.3.1"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -1,6 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # Author:: TAC (tac@tac42.net)
4
1
 
5
2
  require 'mechanize'
6
3
  require 'json'
@@ -11,118 +8,142 @@ require_relative 'yasuri_text_node'
11
8
  require_relative 'yasuri_struct_node'
12
9
  require_relative 'yasuri_paginate_node'
13
10
  require_relative 'yasuri_links_node'
11
+ require_relative 'yasuri_map_node'
14
12
  require_relative 'yasuri_node_generator'
15
13
 
16
14
  module Yasuri
17
15
 
16
+ DefaultRetryCount = 5
17
+ DefaultInterval_ms = 0
18
+
18
19
  def self.json2tree(json_string)
19
- json = JSON.parse(json_string, {symbolize_names: true})
20
- Yasuri.hash2node(json)
21
- end
20
+ raise RuntimeError if json_string.nil? or json_string.empty?
22
21
 
23
- def self.tree2json(node)
24
- Yasuri.node2hash(node).to_json
22
+ node_hash = JSON.parse(json_string, {symbolize_names: true})
23
+ self.hash2node(node_hash)
25
24
  end
26
25
 
27
26
  def self.yaml2tree(yaml_string)
28
27
  raise RuntimeError if yaml_string.nil? or yaml_string.empty?
29
28
 
30
- yaml = YAML.load(yaml_string)
31
- raise RuntimeError if yaml.keys.size < 1
29
+ node_hash = YAML.safe_load(yaml_string, [Symbol], symbolize_names: true)
30
+ self.hash2node(node_hash.deep_symbolize_keys)
31
+ end
32
32
 
33
- root_key, root = yaml.keys.first, yaml.values.first
34
- hash = Yasuri.yaml2tree_sub(root_key, root)
33
# Serializes a node tree to a JSON string.
#
# @param node the root node to serialize
# @return [String] JSON for the hash produced by node2hash
# @raise [RuntimeError] when node is nil
def self.tree2json(node)
  raise RuntimeError if node.nil?

  tree_hash = node2hash(node)
  tree_hash.to_json
end
38
38
 
39
- private
40
- def self.yaml2tree_sub(name, body)
41
- return nil if name.nil? or body.nil?
39
# Runs the given block, retrying it when it raises a StandardError.
#
# Every attempt, including the first, is preceded by a sleep of
# interval_ms milliseconds. After retry_count retries have been used up,
# the last error is raised to the caller.
#
# @param retry_count [Integer] number of retries allowed after the first attempt
# @param interval_ms [Numeric] pause before each attempt, in milliseconds
# @return the block's value, or nil when no block is given
def self.with_retry(retry_count = DefaultRetryCount, interval_ms = DefaultInterval_ms)
  retries_left = retry_count

  begin
    Kernel.sleep(interval_ms * 0.001)
    yield if block_given?
  rescue => e
    # Out of retries: hand the last failure to the caller.
    raise e unless retries_left > 0

    retries_left -= 1
    retry
  end
end
46
54
 
47
- return body if body[:children].nil?
55
# Returns the key under which a node's result is stored.
#
# @param name the node name
# @param opt [Hash] options; a truthy :symbolize_names converts name with to_sym
# @return name unchanged, or name.to_sym when opt[:symbolize_names] is truthy
def self.node_name(name, opt)
  return name unless opt[:symbolize_names]

  name.to_sym
end
48
59
 
49
- body[:children] = body[:children].map do |c|
50
- k, b = c.keys.first, c.values.first
51
- Yasuri.yaml2tree_sub(k, b)
60
+ # private
61
+
62
# Converts a tree description (or one sub-tree of it) into a node.
#
# @param node_hash [Hash, String] a Hash of child definitions/options, or a
#   String, which is used as the node's path
# @param node_name name of the node being built; nil marks the root
# @param node_type_class node class to instantiate; nil builds a MapNode
# @return the built node, or the single child node when the root has exactly one child
# @raise [RuntimeError] when called for the root with an empty node_hash
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  root = node_name.nil?
  raise RuntimeError.new("") if root && node_hash.empty?

  child_nodes, opt, path =
    if node_hash.is_a?(String)
      [[], {}, node_hash]
    else
      hash2child_node(node_hash)
    end

  # A root holding exactly one node collapses to that node.
  return child_nodes.first if root && child_nodes.size == 1
  return Yasuri::MapNode.new(node_name, child_nodes, **opt) if node_type_class.nil?

  node_type_class.new(path, node_name, child_nodes, **opt)
end
61
86
 
62
- private
63
87
  Text2Node = {
64
88
  text: Yasuri::TextNode,
65
89
  struct: Yasuri::StructNode,
66
90
  links: Yasuri::LinksNode,
67
- pages: Yasuri::PaginateNode
91
+ pages: Yasuri::PaginateNode,
92
+ map: Yasuri::MapNode
68
93
  }
69
- Node2Text = Text2Node.invert
70
-
71
- ReservedKeys = %i|node name path children|
72
- def self.hash2node(node_h)
73
- node, name, path, children = ReservedKeys.map do |key|
74
- node_h[key]
75
- end
76
- children ||= []
77
94
 
78
- fail "Not found 'node' value in map" if node.nil?
79
- fail "Not found 'name' value in map" if name.nil?
80
- fail "Not found 'path' value in map" if path.nil?
95
+ NodeRegexps = Text2Node.keys.map { |node_type_sym| /^(#{node_type_sym})_(.+)$/ }
81
96
 
82
- childnodes = children.map{|c| Yasuri.hash2node(c) }
83
- ReservedKeys.each{|key| node_h.delete(key)}
84
- opt = node_h
97
# Splits one level of a tree description into child nodes, options and path.
#
# Keys matching one of NodeRegexps ("<type>_<name>") become child nodes built
# by hash2node; the :path key sets the path; every other key is kept as an option.
#
# @param node_hash [Hash] one level of the tree description
# @return [Array] a three-element array: [child_nodes, opt, path]
def self.hash2child_node(node_hash)
  child_nodes = []
  opt = {}
  path = nil

  node_hash.each do |key, value|
    node_regexp = NodeRegexps.find { |candidate| key =~ candidate }

    if node_regexp === key
      # The match above captured the node type and the node name.
      node_type_sym = Regexp.last_match(1).to_sym
      child_node_name = Regexp.last_match(2)
      child_nodes << hash2node(value, child_node_name, Text2Node[node_type_sym])
    elsif :path === key
      path = value
    else
      opt[key] = value
    end
  end

  [child_nodes, opt, path]
end
104
122
 
105
- node.opts.each do |key,value|
106
- json[key] = value if not value.nil?
107
- end
123
# Converts a node into its hash representation.
#
# @param node the node to convert
# @return [Hash] node.to_h for a MapNode; otherwise a one-entry hash keyed
#   "<node_type_str>_<name>" whose value is node.to_h
def self.node2hash(node)
  if node.instance_of?(Yasuri::MapNode)
    node.to_h
  else
    key = "#{node.node_type_str}_#{node.name}"
    { key => node.to_h }
  end
end
111
130
 
112
- def self.NodeName(name, symbolize_names:false)
113
- symbolize_names ? name.to_sym : name
131
# Lets node-building calls be made directly on the module.
#
# The call is offered to Yasuri::NodeGenerator.gen; when that returns a
# falsy value it falls through to the default method_missing handling.
def self.method_missing(method_name, pattern=nil, **opt, &block)
  node = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  return node if node

  super(method_name, **opt)
end
115
135
 
116
- def self.with_retry(retry_count = 5)
117
- begin
118
- return yield() if block_given?
119
- rescue => e
120
- if retry_count > 0
121
- pp "retry #{retry_count}"
122
- retry_count -= 1
123
- retry
136
+ private_constant :Text2Node, :NodeRegexps
137
+ private_class_method :method_missing, :hash2child_node, :hash2node, :node2hash
138
+ end
139
+
140
class Hash
  # Returns a new Hash whose keys are converted with to_sym, recursing into
  # values that are Hashes. Hashes nested inside other containers (e.g.
  # Arrays) are left untouched.
  def deep_symbolize_keys
    each_with_object({}) do |(key, value), symbolized|
      symbolized[key.to_sym] = value.kind_of?(Hash) ? value.deep_symbolize_keys : value
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+
5
module Yasuri
  # Thor-based command-line front end.
  #
  # `scrape` is the default command. It builds a node tree from either a tree
  # file (`--file`, JSON or YAML) or an inline JSON string (`--json`), scrapes
  # the given URI with that tree and prints the result to stdout.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]",
         "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
    # Scrapes uri with the tree given by --file or --json.
    #
    # A String result is printed as-is; any other result is printed as JSON.
    #
    # @param uri [String] address of the page to scrape
    # @return [Integer] 0 on success; -1 when argument validation, tree
    #   construction or scraping fails (the error message goes to stderr)
    def scrape(uri)
      begin
        test_arguments(options)

        interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
        tree = make_tree(options[:file], options[:json])
        result = tree.scrape(uri, interval_ms: interval_ms)
      rescue => e
        $stderr.puts e.message
        return -1
      end

      if result.instance_of?(String)
        puts result
      else
        j result
      end

      0
    end

    private

    # Validates the --file / --json combination.
    #
    # @param options [Hash] parsed command-line options
    # @raise [RuntimeError] unless exactly one of --file / --json is given,
    #   or when the given one has an empty value
    def test_arguments(options)
      file_path = options[:file]
      json_string = options[:json]

      exactly_one_source = [file_path, json_string].compact.count == 1
      raise "ERROR: Only one of `--file` or `--json` option should be specified." unless exactly_one_source

      # NOTE(review): the comparisons with "file" / "json" presumably catch
      # Thor substituting the option name when a flag is passed without a
      # value — confirm against Thor's option parsing.
      empty_file_argument = file_path&.empty? || file_path == "file"
      raise "ERROR: --file option require not empty argument." if empty_file_argument

      # An empty --json value is reported here, not as a --file problem.
      empty_json_string_argument = json_string&.empty? || json_string == "json"
      raise "ERROR: --json option require not empty argument." if empty_json_string_argument
    end

    # Builds the node tree from the tree file when file_path is given,
    # otherwise from the inline JSON string.
    #
    # @param file_path [String, nil] path to a JSON or YAML tree file
    # @param json_string [String, nil] inline JSON tree
    # @raise [RuntimeError] with an "ERROR: ..." message when the source
    #   cannot be read or converted
    def make_tree(file_path, json_string)
      if file_path
        begin
          src = File.read(file_path)
          make_tree_from_file(src)
        rescue => e
          raise "ERROR: Failed to convert to yasuri tree `#{file_path}`. #{e.message}"
        end
      else
        begin
          Yasuri.json2tree(json_string)
        rescue => e
          raise "ERROR: Failed to convert json to yasuri tree. #{e.message}"
        end
      end
    end

    # Interprets src as JSON first and falls back to YAML when that fails.
    #
    # @param src [String] contents of the tree file
    def make_tree_from_file(src)
      Yasuri.json2tree(src)
    rescue StandardError
      Yasuri.yaml2tree(src)
    end
  end
end
@@ -1,26 +1,30 @@
1
1
 
2
- # Author:: TAC (tac@tac42.net)
3
-
4
2
  require_relative 'yasuri_node'
5
3
 
6
4
  module Yasuri
7
5
  class LinksNode
8
6
  include Node
7
+
9
8
  def inject(agent, page, opt = {}, element = page)
10
- retry_count = opt[:retry_count] || 5
9
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
10
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
11
11
 
12
12
  links = element.search(@xpath) || [] # links expected
13
13
  links.map do |link|
14
14
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
- child_page = Yasuri.with_retry(retry_count) { link_button.click }
15
+ child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
16
16
 
17
17
  child_results_kv = @children.map do |child_node|
18
- child_name = Yasuri.NodeName(child_node.name, opt)
18
+ child_name = Yasuri.node_name(child_node.name, opt)
19
19
  [child_name, child_node.inject(agent, child_page, opt)]
20
20
  end
21
21
 
22
22
  Hash[child_results_kv]
23
- end # each named child node
23
+ end
24
+ end
25
+
26
+ def node_type_str
27
+ "links".freeze
24
28
  end
25
29
  end
26
30
  end