yasuri 2.0.12 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
#!/usr/bin/env ruby
# Author:: TAC (tac@tac42.net)
#
# Example: scrape the repository list of a GitHub user with yasuri and
# print the scraped tree as markdown.

require 'yasuri'
uri = "https://github.com/tac0x2a?tab=repositories"

# Build the node tree with the DSL.
root = Yasuri.map_root do
  text_title '/html/head/title'
  links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
    text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
    text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
    text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
    text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
  end
end

# Alternative: build the same node tree from YAML.
# src = <<-EOYML
# text_title: /html/head/title
# links_repo:
#   path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
#   text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
#   text_desc:
#     path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
#     proc: :strip
#   text_stars:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
#     proc: :to_i
#   text_forks:
#     path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
#     proc: :to_i
# EOYML
# root = Yasuri.yaml2tree(src)

contents = root.scrape(uri, interval_ms: 100)
# Sample of the scraped structure (as shown by `jj contents`):
# {
#   "title": "tac0x2a (TAC) / Repositories · GitHub",
#   "repo": [
#     {
#       "name": "o-namazu",
#       "desc": "Oh Namazu (Catfish) in datalake",
#       "stars": 1,
#       "forks": 0
#     },
#     {
#       "name": "grebe",
#       "desc": "grebe in datalake",
#       "stars": 2,
#       "forks": 0
#     },
#     {
#       "name": "yasuri",
#       "desc": "Yasuri (鑢) is easy web scraping library.",
#       "stars": 43,
#       "forks": 1
#     },
#     {
#       "name": "dotfiles",
#       "desc": "dotfiles",
#       "stars": 0,
#       "forks": 0
#     }
#     ...
#   ]
# }

# Output as markdown: one section per repository.
puts "# #{contents['title']}"
contents['repo'].each do |repo|
  puts "-----", "## #{repo['name']}"
  puts repo['desc']
  puts "", "* Stars: #{repo['stars']}", "* Forks: #{repo['forks']}", ""
end
@@ -0,0 +1,15 @@
1
+
2
+ # yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
3
+ text_title: /html/head/title
4
+ links_repo:
5
+ path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
6
+ text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
7
+ text_desc:
8
+ path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
9
+ proc: :strip
10
+ text_stars:
11
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
12
+ proc: :to_i
13
+ text_forks:
14
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
15
+ proc: :to_i
@@ -0,0 +1,4 @@
1
+ {
2
+ "text_title": "/html/head/title",
3
+ "text_desc": "//*[@id=\"intro\"]/p"
4
+ }
@@ -0,0 +1,11 @@
1
+ # yasuri scrape "https://www.tac42.net/" -f sample.yml
2
+ links_each:
3
+ path: //*[@id="posts"]/article/header/h1/a
4
+ text_title: //*[@id="content"]/article/header/h1
5
+ text_description: /html/head/meta[12]/@content
6
+ text_date:
7
+ path: //*[@id="content"]/article/header/div/span
8
+ proc: :strip
9
+ text_length:
10
+ path: //*[@id="content"]
11
+ proc: :size
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "yasuri"
4
+
5
+ Yasuri::CLI.start
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.12"
2
+ VERSION = "3.3.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -4,28 +4,45 @@
4
4
 
5
5
  require 'mechanize'
6
6
  require 'json'
7
+ require 'yaml'
7
8
 
8
9
  require_relative 'yasuri_node'
9
10
  require_relative 'yasuri_text_node'
10
11
  require_relative 'yasuri_struct_node'
11
12
  require_relative 'yasuri_paginate_node'
12
13
  require_relative 'yasuri_links_node'
14
+ require_relative 'yasuri_map_node'
13
15
  require_relative 'yasuri_node_generator'
14
16
 
15
17
  module Yasuri
16
18
 
19
+ DefaultRetryCount = 5
20
+ DefaultInterval_ms = 0
21
+
17
22
  def self.json2tree(json_string)
18
- json = JSON.parse(json_string, {symbolize_names: true})
19
- Yasuri.hash2node(json)
23
+ raise RuntimeError if json_string.nil? or json_string.empty?
24
+
25
+ node_hash = JSON.parse(json_string, {symbolize_names: true})
26
+ Yasuri.hash2node(node_hash)
20
27
  end
21
28
 
22
29
  def self.tree2json(node)
30
+ raise RuntimeError if node.nil?
31
+
23
32
  Yasuri.node2hash(node).to_json
24
33
  end
25
34
 
26
- def self.method_missing(name, *args, &block)
27
- generated = Yasuri::NodeGenerator.gen(name, *args, &block)
28
- generated || super(name, args)
35
# Builds a node tree from a YAML string in Yasuri's tree format.
#
# The parsed document must be a mapping; its keys are symbolized
# recursively and the result is handed to Yasuri.hash2node.
#
# @param yaml_string [String] YAML text describing the node tree
# @return the node built by Yasuri.hash2node
# @raise [RuntimeError] if +yaml_string+ is nil or empty, or if the
#   document is not a mapping (e.g. comments only, a list, a scalar)
def self.yaml2tree(yaml_string)
  if yaml_string.nil? || yaml_string.empty?
    raise RuntimeError, "yaml_string must be a non-empty YAML string"
  end

  # NOTE(review): how safe YAML.load is depends on the installed Psych
  # version. If tree files may come from untrusted sources, consider
  # YAML.safe_load(yaml_string, permitted_classes: [Symbol]); Symbol has
  # to stay permitted because tree definitions use values like `proc: :strip`.
  node_hash = YAML.load(yaml_string)

  # Without this check a non-mapping document fails later with a
  # NoMethodError on deep_symbolize_keys, which hides the real problem.
  unless node_hash.is_a?(Hash)
    raise RuntimeError, "YAML document must be a mapping of node definitions"
  end

  Yasuri.hash2node(node_hash.deep_symbolize_keys)
end
41
+
42
+ private
43
# Lets node-building calls be made directly on the Yasuri module.
#
# The call is first offered to Yasuri::NodeGenerator.gen; when that
# yields nothing (nil/false) the call falls through to the default
# method_missing handling.
def self.method_missing(method_name, pattern=nil, **opt, &block)
  node = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  return node if node

  super(method_name, **opt)
end
30
47
 
31
48
  private
@@ -33,61 +50,78 @@ module Yasuri
33
50
  text: Yasuri::TextNode,
34
51
  struct: Yasuri::StructNode,
35
52
  links: Yasuri::LinksNode,
36
- pages: Yasuri::PaginateNode
53
+ pages: Yasuri::PaginateNode,
54
+ map: Yasuri::MapNode
37
55
  }
38
- Node2Text = Text2Node.invert
39
56
 
40
- ReservedKeys = %i|node name path children|
41
- def self.hash2node(node_h)
42
- node, name, path, children = ReservedKeys.map do |key|
43
- node_h[key]
57
# Recursively converts a parsed tree description into Yasuri nodes.
#
# In a mapping, each key is handled as follows:
# * "<type>_<name>", where <type> is a key of Text2Node, becomes a child
#   node of that type named <name>;
# * :path supplies the path of the node being built;
# * any other key is passed through as an option to the node constructor.
# A String description is shorthand for a node that only has a path.
#
# @param node_hash [Hash, String] description of the node (or of the root)
# @param node_name [String, nil] name of the node being built; nil for the root
# @param node_type_class [Class, nil] class to instantiate; nil builds a
#   Yasuri::MapNode
# @return the built node; for a root with exactly one child, that child
# @raise [RuntimeError] if the root description is nil or empty, or if a
#   description is neither a String nor enumerable
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  if node_name.nil? && (node_hash.nil? || node_hash.empty?)
    raise RuntimeError, "node tree definition must not be empty"
  end

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    path = node_hash
  else
    unless node_hash.respond_to?(:each)
      raise RuntimeError, "definition of node '#{node_name}' must be a path string or a mapping"
    end

    # One matcher for every known node type, built once per call rather
    # than once per key.
    node_key_regexp = /^(#{Text2Node.keys.join('|')})_(.+)$/

    node_hash.each do |key, value|
      matched = node_key_regexp.match(key.to_s)

      if matched
        child_node_type = Text2Node[matched[1].to_sym]
        child_nodes << self.hash2node(value, matched[2], child_node_type)
      elsif key == :path
        path = value
      else
        opt[key] = value
      end
    end
  end

  # If only single node under root, return only the node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  if node_type_class.nil?
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
59
102
 
60
103
  def self.node2hash(node)
61
- json = JSON.parse("{}")
62
- return json if node.nil?
63
-
64
- klass = node.class
65
- klass_str = Node2Text[klass]
104
+ return node.to_h if node.instance_of?(Yasuri::MapNode)
66
105
 
67
- json["node"] = klass_str
68
- json["name"] = node.name
69
- json["path"] = node.xpath
70
-
71
- children = node.children.map{|c| Yasuri.node2hash(c)}
72
- json["children"] = children if not children.empty?
73
-
74
- node.opts.each do |key,value|
75
- json[key] = value if not value.nil?
76
- end
77
-
78
- json
106
+ {
107
+ "#{node.node_type_str}_#{node.name}" => node.to_h
108
+ }
79
109
  end
80
110
 
81
- def self.NodeName(name, symbolize_names:false)
111
# Returns +name+ as a Symbol when opt[:symbolize_names] is truthy,
# otherwise returns +name+ unchanged.
def self.node_name(name, opt)
  return name.to_sym if opt[:symbolize_names]

  name
end
84
115
 
85
- def self.with_retry(retry_count = 5)
116
+ def self.with_retry(
117
+ retry_count = DefaultRetryCount,
118
+ interval_ms = DefaultInterval_ms)
119
+
86
120
  begin
121
+ Kernel.sleep(interval_ms * 0.001)
87
122
  return yield() if block_given?
88
123
  rescue => e
89
124
  if retry_count > 0
90
- pp "retry #{retry_count}"
91
125
  retry_count -= 1
92
126
  retry
93
127
  end
@@ -95,3 +129,14 @@ module Yasuri
95
129
  end
96
130
  end
97
131
  end
132
+
133
# Core extension used by Yasuri.yaml2tree to normalize parsed YAML keys.
class Hash
  # Returns a new Hash whose keys are converted to symbols, recursing into
  # nested hashes and into hashes contained in (possibly nested) arrays.
  # Keys that cannot be converted (they do not respond to to_sym) are kept
  # as they are. The receiver is not modified.
  #
  # @return [Hash] a copy with symbolized keys
  def deep_symbolize_keys
    convert = lambda do |value|
      case value
      when Hash  then value.deep_symbolize_keys
      when Array then value.map(&convert)
      else value
      end
    end

    each_with_object({}) do |(key, value), result|
      sym_key = key.respond_to?(:to_sym) ? key.to_sym : key
      result[sym_key] = convert.call(value)
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+
5
+ module Yasuri
6
+ class CLI < Thor
7
+ package_name "yasuri"
8
+
9
+ default_command :scrape
10
+ desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
11
+ option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
12
+ option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
13
+ option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
14
+ def scrape(uri)
15
+ # argument validations
16
+ if [options[:file], options[:json]].compact.count != 1
17
+ $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
18
+ return -1
19
+ end
20
+ if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
21
+ $stderr.puts "ERROR: --file option require not empty argument."
22
+ return -1
23
+ end
24
+ if options[:json]&.empty? or options[:json] == "json"
25
+ $stderr.puts "ERROR: --json option require not empty argument."
26
+ return -1
27
+ end
28
+
29
+ interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
30
+
31
+ tree = if options[:file]
32
+ src = File.read(options[:file])
33
+
34
+ begin
35
+ Yasuri.json2tree(src)
36
+ rescue
37
+ begin
38
+ Yasuri.yaml2tree(src)
39
+ rescue => e
40
+ $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
41
+ return -1
42
+ end
43
+ end
44
+ else
45
+ begin
46
+ Yasuri.json2tree(options[:json])
47
+ rescue => e
48
+ $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
49
+ return -1
50
+ end
51
+ end
52
+
53
+ result = tree.scrape(uri, interval_ms: interval_ms)
54
+
55
+ if result.instance_of?(String)
56
+ puts result
57
+ else
58
+ j result
59
+ end
60
+
61
+ return 0
62
+ end
63
+ end
64
+ end
@@ -6,21 +6,27 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
+
9
10
  def inject(agent, page, opt = {}, element = page)
10
- retry_count = opt[:retry_count] || 5
11
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
12
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
11
13
 
12
14
  links = element.search(@xpath) || [] # links expected
13
15
  links.map do |link|
14
16
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
- child_page = Yasuri.with_retry(retry_count) { link_button.click }
17
+ child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
16
18
 
17
19
  child_results_kv = @children.map do |child_node|
18
- child_name = Yasuri.NodeName(child_node.name, opt)
20
+ child_name = Yasuri.node_name(child_node.name, opt)
19
21
  [child_name, child_node.inject(agent, child_page, opt)]
20
22
  end
21
23
 
22
24
  Hash[child_results_kv]
23
25
  end # each named child node
24
26
  end
25
- end
26
- end
27
+
28
+ def node_type_str
29
+ "links".freeze
30
+ end
31
+ end # class
32
+ end # module
@@ -0,0 +1,40 @@
1
+
2
module Yasuri
  # Node that groups several child nodes and applies all of them to the
  # same page, returning their results keyed by child name.
  class MapNode
    include Node
    attr_reader :name, :children

    # @param name name of this node
    # @param children child nodes applied by #inject
    # @param opt extra options; stored in @opt but not otherwise used by
    #   this class
    def initialize(name, children, **opt)
      @name = name
      @children = children
      @opt = opt
    end

    # Applies every child node to +page+.
    #
    # Keys go through Yasuri.node_name so that opt[:symbolize_names] is
    # honoured here the same way LinksNode#inject honours it; without the
    # option the child names are used unchanged.
    #
    # @return [Hash] child name => child result
    def inject(agent, page, opt = {}, element = page)
      @children.each_with_object({}) do |node, results|
        results[Yasuri.node_name(node.name, opt)] = node.inject(agent, page, opt)
      end
    end

    # Hash description of this node: its non-nil options followed by one
    # "<type>_<name>" entry per child.
    def to_h
      node_hash = {}
      opts.each { |key, value| node_hash[key] = value unless value.nil? }

      children.each do |child|
        node_hash["#{child.node_type_str}_#{child.name}"] = child.to_h
      end

      node_hash
    end

    # Always empty: the options given to the constructor are not exposed.
    def opts
      {}
    end

    # Type prefix used when this node is written as a "<type>_<name>" key.
    def node_type_str
      "map".freeze
    end
  end
end