yasuri 2.0.12 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ # Author:: TAC (tac@tac42.net)
3
+
4
+ require 'yasuri'
5
+ uri = "https://github.com/tac0x2a?tab=repositories"
6
+
7
+ # Construct the node tree with the DSL
8
+ root = Yasuri.map_root do
9
+ text_title '/html/head/title'
10
+ links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
11
+ text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
12
+ text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
13
+ text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
14
+ text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
15
+ end
16
+ end
17
+
18
+ # Construct the node tree from YAML
19
+ # src = <<-EOYML
20
+ # text_title: /html/head/title
21
+ # links_repo:
22
+ # path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
23
+ # text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
24
+ # text_desc:
25
+ # path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
26
+ # proc: :strip
27
+ # text_stars:
28
+ # path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
29
+ # proc: :to_i
30
+ # text_forks:
31
+ # path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
32
+ # proc: :to_i
33
+ # EOYML
34
+ # root = Yasuri.yaml2tree(src)
35
+
36
+ contents = root.scrape(uri, interval_ms: 100)
37
+ # jj contents
38
+ # {
39
+ # "title": "tac0x2a (TAC) / Repositories · GitHub",
40
+ # "repo": [
41
+ # {
42
+ # "name": "o-namazu",
43
+ # "desc": "Oh Namazu (Catfish) in datalake",
44
+ # "stars": 1,
45
+ # "forks": 0
46
+ # },
47
+ # {
48
+ # "name": "grebe",
49
+ # "desc": "grebe in datalake",
50
+ # "stars": 2,
51
+ # "forks": 0
52
+ # },
53
+ # {
54
+ # "name": "yasuri",
55
+ # "desc": "Yasuri (鑢) is easy web scraping library.",
56
+ # "stars": 43,
57
+ # "forks": 1
58
+ # },
59
+ # {
60
+ # "name": "dotfiles",
61
+ # "desc": "dotfiles",
62
+ # "stars": 0,
63
+ # "forks": 0
64
+ # }
65
+ # ...
66
+ # ]
67
+ # }
68
+
69
+ # Output as markdown
70
+ puts "# #{contents['title']}"
71
+ contents['repo'].each do |h|
72
+ puts "-----"
73
+ puts "## #{h['name']}"
74
+ puts h['desc']
75
+ puts ""
76
+ puts "* Stars: #{h['stars']}"
77
+ puts "* Forks: #{h['forks']}"
78
+ puts ""
79
+ end
@@ -0,0 +1,15 @@
1
+
2
+ # yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
3
+ text_title: /html/head/title
4
+ links_repo:
5
+ path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
6
+ text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
7
+ text_desc:
8
+ path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
9
+ proc: :strip
10
+ text_stars:
11
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
12
+ proc: :to_i
13
+ text_forks:
14
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
15
+ proc: :to_i
@@ -0,0 +1,4 @@
1
+ {
2
+ "text_title": "/html/head/title",
3
+ "text_desc": "//*[@id=\"intro\"]/p"
4
+ }
@@ -0,0 +1,11 @@
1
+ # yasuri scrape "https://www.tac42.net/" -f sample.yml
2
+ links_each:
3
+ path: //*[@id="posts"]/article/header/h1/a
4
+ text_title: //*[@id="content"]/article/header/h1
5
+ text_description: /html/head/meta[12]/@content
6
+ text_date:
7
+ path: //*[@id="content"]/article/header/div/span
8
+ proc: :strip
9
+ text_length:
10
+ path: //*[@id="content"]
11
+ proc: :size
data/exe/yasuri ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "yasuri"
4
+
5
+ Yasuri::CLI.start
data/lib/yasuri.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "yasuri/version"
2
2
  require "yasuri/yasuri"
3
+ require "yasuri/yasuri_cli"
3
4
 
4
5
  module Yasuri
5
6
  # Your code goes here...
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "2.0.12"
2
+ VERSION = "3.3.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -4,28 +4,45 @@
4
4
 
5
5
  require 'mechanize'
6
6
  require 'json'
7
+ require 'yaml'
7
8
 
8
9
  require_relative 'yasuri_node'
9
10
  require_relative 'yasuri_text_node'
10
11
  require_relative 'yasuri_struct_node'
11
12
  require_relative 'yasuri_paginate_node'
12
13
  require_relative 'yasuri_links_node'
14
+ require_relative 'yasuri_map_node'
13
15
  require_relative 'yasuri_node_generator'
14
16
 
15
17
  module Yasuri
16
18
 
19
+ DefaultRetryCount = 5
20
+ DefaultInterval_ms = 0
21
+
17
22
  def self.json2tree(json_string)
18
- json = JSON.parse(json_string, {symbolize_names: true})
19
- Yasuri.hash2node(json)
23
+ raise RuntimeError if json_string.nil? or json_string.empty?
24
+
25
+ node_hash = JSON.parse(json_string, {symbolize_names: true})
26
+ Yasuri.hash2node(node_hash)
20
27
  end
21
28
 
22
29
  def self.tree2json(node)
30
+ raise RuntimeError if node.nil?
31
+
23
32
  Yasuri.node2hash(node).to_json
24
33
  end
25
34
 
26
- def self.method_missing(name, *args, &block)
27
- generated = Yasuri::NodeGenerator.gen(name, *args, &block)
28
- generated || super(name, args)
35
# Builds a node tree from a YAML description.
#
# The document is parsed with YAML.safe_load so that only plain data and
# Symbols (needed for values such as `proc: :strip`) can be instantiated;
# tags that would create arbitrary Ruby objects are rejected. Mapping keys
# are symbolized by the parser, then the structure is handed to
# Yasuri.hash2node.
#
# yaml_string:: YAML text describing the tree.
#
# Returns whatever Yasuri.hash2node builds from the parsed mapping.
# Raises RuntimeError when yaml_string is nil or empty, or when the document
# root is not a mapping. Errors raised by the YAML parser (syntax errors,
# disallowed classes) are not rescued here.
def self.yaml2tree(yaml_string)
  if yaml_string.nil? || yaml_string.empty?
    raise RuntimeError, "yaml_string must not be nil or empty"
  end

  node_hash = YAML.safe_load(
    yaml_string,
    permitted_classes: [Symbol],
    aliases: true,
    symbolize_names: true
  )
  unless node_hash.is_a?(Hash)
    raise RuntimeError, "YAML root must be a mapping, got #{node_hash.class}"
  end

  Yasuri.hash2node(node_hash)
end
41
+
42
+ private
43
# DSL entry point: forwards unknown module-level calls such as
# `Yasuri.text_title '/html/head/title'` to Yasuri::NodeGenerator.gen and
# returns the node it generates.
#
# When the generator returns nil/false the call falls through to the default
# method_missing with the original arguments and block intact, so the
# resulting NoMethodError reports exactly what the caller passed.
def self.method_missing(method_name, pattern=nil, **opt, &block)
  generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
  generated || super
end

# Keeps respond_to? in line with method_missing for `<type>_<name>` calls
# whose type is a key of Text2Node.
# NOTE(review): presumably this mirrors the names NodeGenerator.gen accepts;
# the generator is not visible from here, so confirm against it.
def self.respond_to_missing?(method_name, include_private = false)
  prefix, node_name = method_name.to_s.split('_', 2)
  known = !node_name.to_s.empty? && Text2Node.key?(prefix.to_sym)
  known || super
end
30
47
 
31
48
  private
@@ -33,61 +50,78 @@ module Yasuri
33
50
  text: Yasuri::TextNode,
34
51
  struct: Yasuri::StructNode,
35
52
  links: Yasuri::LinksNode,
36
- pages: Yasuri::PaginateNode
53
+ pages: Yasuri::PaginateNode,
54
+ map: Yasuri::MapNode
37
55
  }
38
- Node2Text = Text2Node.invert
39
56
 
40
- ReservedKeys = %i|node name path children|
41
# Recursively converts a tree description into node objects.
#
# node_hash::       either a String (used as the node's path) or a hash-like
#                   object. In a hash, a key of the form `<type>_<name>`,
#                   where <type> is a key of Text2Node, defines a child node;
#                   the key :path sets this node's path; every other key is
#                   collected as an option and passed to the constructor.
# node_name::       name of the node being built; nil for the root.
# node_type_class:: class to instantiate; nil builds a Yasuri::MapNode.
#
# When called for the root (node_name nil) and the description holds exactly
# one child node, that child is returned directly instead of a wrapping
# MapNode.
#
# Raises RuntimeError when the description is neither a String nor hash-like
# (including nil), or when the root description is empty.
def self.hash2node(node_hash, node_name = nil, node_type_class = nil)
  described = node_name.nil? ? "root" : "`#{node_name}`"
  unless node_hash.is_a?(String) || node_hash.respond_to?(:each_pair)
    raise RuntimeError,
          "Node definition for #{described} must be a path string or a hash, got #{node_hash.class}"
  end
  if node_name.nil? && node_hash.empty?
    raise RuntimeError, "Node definition for root must not be empty"
  end

  child_nodes = []
  opt = {}
  path = nil

  if node_hash.is_a?(String)
    path = node_hash
  else
    # Built once per call instead of once per key; captures the node type and
    # the node name from keys such as :text_title.
    node_key_pattern = /\A(#{Regexp.union(Text2Node.keys.map(&:to_s))})_(.+)\z/

    node_hash.each_pair do |key, value|
      matched = node_key_pattern.match(key.to_s)
      if matched
        child_type = Text2Node[matched[1].to_sym]
        child_nodes << hash2node(value, matched[2], child_type)
      elsif key == :path
        path = value
      else
        opt[key] = value
      end
    end
  end

  # If only a single node sits under the root, return just that node.
  return child_nodes.first if node_name.nil? && child_nodes.size == 1

  if node_type_class.nil?
    Yasuri::MapNode.new(node_name, child_nodes, **opt)
  else
    node_type_class.new(path, node_name, child_nodes, **opt)
  end
end
59
102
 
60
103
  def self.node2hash(node)
61
- json = JSON.parse("{}")
62
- return json if node.nil?
63
-
64
- klass = node.class
65
- klass_str = Node2Text[klass]
104
+ return node.to_h if node.instance_of?(Yasuri::MapNode)
66
105
 
67
- json["node"] = klass_str
68
- json["name"] = node.name
69
- json["path"] = node.xpath
70
-
71
- children = node.children.map{|c| Yasuri.node2hash(c)}
72
- json["children"] = children if not children.empty?
73
-
74
- node.opts.each do |key,value|
75
- json[key] = value if not value.nil?
76
- end
77
-
78
- json
106
+ {
107
+ "#{node.node_type_str}_#{node.name}" => node.to_h
108
+ }
79
109
  end
80
110
 
81
# Returns the key under which a node's result is stored.
#
# name:: node name.
# opt::  scrape options; when opt[:symbolize_names] is truthy the name is
#        converted with #to_sym, otherwise it is returned unchanged. A nil or
#        omitted opt is treated as "no options".
def self.node_name(name, opt = {})
  symbolize_names = (opt || {})[:symbolize_names]
  symbolize_names ? name.to_sym : name
end
84
115
 
85
- def self.with_retry(retry_count = 5)
116
+ def self.with_retry(
117
+ retry_count = DefaultRetryCount,
118
+ interval_ms = DefaultInterval_ms)
119
+
86
120
  begin
121
+ Kernel.sleep(interval_ms * 0.001)
87
122
  return yield() if block_given?
88
123
  rescue => e
89
124
  if retry_count > 0
90
- pp "retry #{retry_count}"
91
125
  retry_count -= 1
92
126
  retry
93
127
  end
@@ -95,3 +129,14 @@ module Yasuri
95
129
  end
96
130
  end
97
131
  end
132
+
133
# Core extension used to symbolize the keys of parsed tree descriptions.
# NOTE(review): this reopens ::Hash globally; if another library loaded in the
# same process defines Hash#deep_symbolize_keys, this definition would replace
# it — confirm that is acceptable.
class Hash
  # Returns a new hash whose keys are converted with #to_sym at every nesting
  # level, including hashes found inside arrays. Keys that do not respond to
  # #to_sym are kept as they are. The receiver is not modified.
  def deep_symbolize_keys
    convert = lambda do |value|
      case value
      when Hash then value.deep_symbolize_keys
      when Array then value.map(&convert)
      else value
      end
    end

    each_with_object({}) do |(key, value), result|
      symbolized = key.respond_to?(:to_sym) ? key.to_sym : key
      result[symbolized] = convert.call(value)
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'yasuri'
4
+
5
module Yasuri
  # Command line front end for Yasuri, built on Thor.
  #
  # The single command, `scrape` (also the default command), loads a node tree
  # from a file or from an inline JSON string, scrapes the given URI with it
  # and prints the result to stdout.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
    # Scrapes +uri+ with the tree given by --file or --json.
    #
    # Returns 0 on success and -1 after printing an error to stderr when the
    # options are invalid or the tree cannot be loaded. A String result is
    # printed as is; anything else is printed as JSON.
    def scrape(uri)
      error = option_error
      if error
        $stderr.puts error
        return -1
      end

      tree = load_tree
      return -1 if tree.nil?

      interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
      result = tree.scrape(uri, interval_ms: interval_ms)

      if result.instance_of?(String)
        puts result
      else
        j result
      end

      0
    end

    private

    # Validates --file / --json and returns the error message to print, or nil
    # when the options are usable.
    def option_error
      file = options[:file]
      json = options[:json]

      if [file, json].compact.count != 1
        "ERROR: Only one of `--file` or `--json` option should be specified."
      elsif file && (file.empty? || file == "file")
        # NOTE(review): a value equal to the option's own name is treated as
        # "no argument given" — presumably what Thor yields for a bare flag;
        # confirm.
        "ERROR: --file option require not empty argument."
      elsif json && (json.empty? || json == "json")
        "ERROR: --json option require not empty argument."
      end
    end

    # Loads the tree from whichever of --file / --json was given. Returns nil
    # after reporting the problem on stderr when loading fails.
    def load_tree
      options[:file] ? tree_from_file(options[:file]) : tree_from_json(options[:json])
    end

    # Reads +path+ and parses it as JSON first, falling back to YAML when the
    # JSON attempt fails for any reason.
    def tree_from_file(path)
      src = File.read(path)

      begin
        Yasuri.json2tree(src)
      rescue
        Yasuri.yaml2tree(src)
      end
    rescue SystemCallError => e
      # Missing or unreadable file: report it instead of crashing.
      $stderr.puts "ERROR: Failed to read `#{path}`. #{e.message}"
      nil
    rescue => e
      $stderr.puts "ERROR: Failed to convert to yasuri tree `#{path}`. #{e.message}"
      nil
    end

    # Parses an inline JSON tree given with --json.
    def tree_from_json(json)
      Yasuri.json2tree(json)
    rescue => e
      $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
      nil
    end
  end
end
@@ -6,21 +6,27 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
+
9
10
  def inject(agent, page, opt = {}, element = page)
10
- retry_count = opt[:retry_count] || 5
11
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
12
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
11
13
 
12
14
  links = element.search(@xpath) || [] # links expected
13
15
  links.map do |link|
14
16
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
- child_page = Yasuri.with_retry(retry_count) { link_button.click }
17
+ child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
16
18
 
17
19
  child_results_kv = @children.map do |child_node|
18
- child_name = Yasuri.NodeName(child_node.name, opt)
20
+ child_name = Yasuri.node_name(child_node.name, opt)
19
21
  [child_name, child_node.inject(agent, child_page, opt)]
20
22
  end
21
23
 
22
24
  Hash[child_results_kv]
23
25
  end # each named child node
24
26
  end
25
- end
26
- end
27
+
28
+ def node_type_str
29
+ "links".freeze
30
+ end
31
+ end # class
32
+ end # module
@@ -0,0 +1,40 @@
1
+
2
module Yasuri
  # Node that groups several child nodes and scrapes all of them against the
  # same page. It has no path of its own.
  class MapNode
    include Node
    attr_reader :name, :children

    # name::     node name (nil when used as the root built by hash2node).
    # children:: child nodes to run.
    # opt::      extra options; stored in @opt.
    def initialize(name, children, **opt)
      @name = name
      @children = children
      @opt = opt
    end

    # Runs every child against +page+ and returns a hash of child name =>
    # child result. Names go through Yasuri.node_name so that the
    # :symbolize_names option is honoured here the same way LinksNode does.
    def inject(agent, page, opt = {}, element = page)
      @children.map { |child|
        [Yasuri.node_name(child.name, opt), child.inject(agent, page, opt)]
      }.to_h
    end

    # Serializes the node: non-nil options first, then one
    # "<node_type_str>_<name>" entry per child holding the child's #to_h.
    def to_h
      node_hash = opts.reject { |_key, value| value.nil? }

      children.each do |child|
        node_hash["#{child.node_type_str}_#{child.name}"] = child.to_h
      end

      node_hash
    end

    # Options reported for serialization.
    # NOTE(review): always empty — the options stored in @opt by #initialize
    # are never exposed here; confirm this is intended.
    def opts
      {}
    end

    # Type prefix used when this node is serialized ("map_<name>").
    def node_type_str
      "map".freeze
    end
  end
end