yasuri 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +54 -24
- data/USAGE.ja.md +216 -72
- data/USAGE.md +225 -78
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +71 -36
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +3 -3
- data/lib/yasuri/yasuri_map_node.rb +12 -27
- data/lib/yasuri/yasuri_node.rb +15 -37
- data/lib/yasuri/yasuri_paginate_node.rb +5 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +5 -5
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_spec.rb +125 -140
- data/yasuri.gemspec +3 -1
- metadata +31 -4
| @@ -0,0 +1,64 @@ | |
| 1 | 
            +
            require 'thor'
         | 
| 2 | 
            +
            require 'json'
         | 
| 3 | 
            +
            require 'yasuri'
         | 
| 4 | 
            +
            require 'mechanize'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Yasuri
              # Command line front end for Yasuri, implemented as a Thor application.
              #
              # It exposes a single command, `scrape`, which is also the default command.
              class CLI < Thor
                package_name "yasuri"

                default_command :scrape
                desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
                option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
                option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
                # Fetch +uri+ with Mechanize, apply a Yasuri tree to the page and print
                # the result to stdout.
                #
                # The tree comes from exactly one of the options:
                # * +--file+ : path to a file; its content is parsed with
                #   Yasuri.json2tree first and, if that raises, with Yasuri.yaml2tree.
                # * +--json+ : a string parsed with Yasuri.json2tree.
                #
                # A String result is printed with +puts+; any other result is printed
                # through Kernel#j (from the json library).
                #
                # @param uri [String] location handed to Mechanize#get
                # @return [Integer] 0 on success, -1 when option validation, reading the
                #   tree file, or parsing the tree fails (an "ERROR: ..." line is written
                #   to $stderr in each of those cases)
                def scrape(uri)
                  # argument validations
                  if [options[:file], options[:json]].compact.count != 1
                    $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
                    return -1
                  end

                  # The literal values "file" / "json" are rejected together with the
                  # empty string. NOTE(review): presumably this is what Thor yields when
                  # the flag is passed without a value -- confirm against Thor.
                  #
                  # Only the --file option is inspected here; an empty --json value is
                  # reported by the dedicated --json check below, so the user gets the
                  # message for the option they actually passed.
                  if options[:file]&.empty? || options[:file] == "file"
                    $stderr.puts "ERROR: --file option require not empty argument."
                    return -1
                  end
                  if options[:json]&.empty? || options[:json] == "json"
                    $stderr.puts "ERROR: --json option require not empty argument."
                    return -1
                  end

                  tree = if options[:file]
                           # Report an unreadable/missing file in the same style as every
                           # other failure of this command instead of leaking the raw
                           # exception.
                           begin
                             src = File.read(options[:file])
                           rescue SystemCallError, IOError => e
                             $stderr.puts "ERROR: Failed to read `#{options[:file]}`. #{e.message}"
                             return -1
                           end

                           # Try JSON first; on any StandardError fall back to YAML and
                           # report the YAML error if that fails as well.
                           begin
                             Yasuri.json2tree(src)
                           rescue
                             begin
                               Yasuri.yaml2tree(src)
                             rescue => e
                               $stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
                               return -1
                             end
                           end
                         else
                           begin
                             Yasuri.json2tree(options[:json])
                           rescue => e
                             $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
                             return -1
                           end
                         end

                  agent = Mechanize.new
                  root_page = agent.get(uri)
                  result = tree.inject(agent, root_page)

                  # A bare String is printed as-is; everything else goes out as JSON.
                  if result.instance_of?(String)
                    puts result
                  else
                    j result
                  end

                  return 0
                end
              end
            end
         | 
| @@ -7,7 +7,7 @@ module Yasuri | |
| 7 7 | 
             
              class LinksNode
         | 
| 8 8 | 
             
                include Node
         | 
| 9 9 | 
             
                def inject(agent, page, opt = {}, element = page)
         | 
| 10 | 
            -
                  retry_count = opt[:retry_count] ||  | 
| 10 | 
            +
                  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
         | 
| 11 11 |  | 
| 12 12 | 
             
                  links = element.search(@xpath) || [] # links expected
         | 
| 13 13 | 
             
                  links.map do |link|
         | 
| @@ -15,7 +15,7 @@ module Yasuri | |
| 15 15 | 
             
                    child_page = Yasuri.with_retry(retry_count) { link_button.click }
         | 
| 16 16 |  | 
| 17 17 | 
             
                    child_results_kv = @children.map do |child_node|
         | 
| 18 | 
            -
                      child_name = Yasuri. | 
| 18 | 
            +
                      child_name = Yasuri.node_name(child_node.name, opt)
         | 
| 19 19 | 
             
                      [child_name, child_node.inject(agent, child_page, opt)]
         | 
| 20 20 | 
             
                    end
         | 
| 21 21 |  | 
| @@ -24,7 +24,7 @@ module Yasuri | |
| 24 24 | 
             
                end
         | 
| 25 25 |  | 
| 26 26 | 
             
                def node_type_str
         | 
| 27 | 
            -
                  "links"
         | 
| 27 | 
            +
                  "links".freeze
         | 
| 28 28 | 
             
                end
         | 
| 29 29 | 
             
              end # class
         | 
| 30 30 | 
             
            end # module
         | 
| @@ -3,7 +3,7 @@ module Yasuri | |
| 3 3 | 
             
              class MapNode
         | 
| 4 4 | 
             
                attr_reader :name, :children
         | 
| 5 5 |  | 
| 6 | 
            -
                def initialize(name, children, opt | 
| 6 | 
            +
                def initialize(name, children, **opt)
         | 
| 7 7 | 
             
                  @name = name
         | 
| 8 8 | 
             
                  @children = children
         | 
| 9 9 | 
             
                  @opt = opt
         | 
| @@ -16,39 +16,24 @@ module Yasuri | |
| 16 16 | 
             
                  Hash[child_results_kv]
         | 
| 17 17 | 
             
                end
         | 
| 18 18 |  | 
| 19 | 
            -
                def opts
         | 
| 20 | 
            -
                  {}
         | 
| 21 | 
            -
                end
         | 
| 22 | 
            -
             | 
| 23 19 | 
             
                def to_h
         | 
| 24 | 
            -
                   | 
| 25 | 
            -
                   | 
| 26 | 
            -
                  h["name"] = self.name
         | 
| 27 | 
            -
                  h["children"] = self.children.map{|c| c.to_h} if not children.empty?
         | 
| 20 | 
            +
                  node_hash = {}
         | 
| 21 | 
            +
                  self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
         | 
| 28 22 |  | 
| 29 | 
            -
                   | 
| 30 | 
            -
                     | 
| 23 | 
            +
                  children.each do |child|
         | 
| 24 | 
            +
                    child_node_name = "#{child.node_type_str}_#{child.name}"
         | 
| 25 | 
            +
                    node_hash[child_node_name] = child.to_h
         | 
| 31 26 | 
             
                  end
         | 
| 32 27 |  | 
| 33 | 
            -
                   | 
| 28 | 
            +
                  node_hash
         | 
| 34 29 | 
             
                end
         | 
| 35 30 |  | 
| 36 | 
            -
                def  | 
| 37 | 
            -
                   | 
| 38 | 
            -
             | 
| 39 | 
            -
                  node, name, children = reservedKeys.map do |key|
         | 
| 40 | 
            -
                    node_h[key]
         | 
| 41 | 
            -
                  end
         | 
| 42 | 
            -
             | 
| 43 | 
            -
                  fail "Not found 'name' value in map" if name.nil?
         | 
| 44 | 
            -
                  fail "Not found 'children' value in map" if children.nil?
         | 
| 45 | 
            -
                  children ||= []
         | 
| 46 | 
            -
             | 
| 47 | 
            -
                  childnodes = children.map{|c| Yasuri.hash2node(c) }
         | 
| 48 | 
            -
                  reservedKeys.each{|key| node_h.delete(key)}
         | 
| 49 | 
            -
                  opt = node_h
         | 
| 31 | 
            +
                def opts
         | 
| 32 | 
            +
                  {}
         | 
| 33 | 
            +
                end
         | 
| 50 34 |  | 
| 51 | 
            -
             | 
| 35 | 
            +
                def node_type_str
         | 
| 36 | 
            +
                  "map".freeze
         | 
| 52 37 | 
             
                end
         | 
| 53 38 | 
             
              end
         | 
| 54 39 | 
             
            end
         | 
    
        data/lib/yasuri/yasuri_node.rb
    CHANGED
    
    | @@ -7,7 +7,7 @@ module Yasuri | |
| 7 7 | 
             
              module Node
         | 
| 8 8 | 
             
                attr_reader :url, :xpath, :name, :children
         | 
| 9 9 |  | 
| 10 | 
            -
                def initialize(xpath, name, children = [], opt | 
| 10 | 
            +
                def initialize(xpath, name, children = [], **opt)
         | 
| 11 11 | 
             
                  @xpath, @name, @children = xpath, name, children
         | 
| 12 12 | 
             
                end
         | 
| 13 13 |  | 
| @@ -15,50 +15,28 @@ module Yasuri | |
| 15 15 | 
             
                  fail "#{Kernel.__method__} is not implemented in included class."
         | 
| 16 16 | 
             
                end
         | 
| 17 17 |  | 
| 18 | 
            -
                def opts
         | 
| 19 | 
            -
                  {}
         | 
| 20 | 
            -
                end
         | 
| 21 | 
            -
             | 
| 22 18 | 
             
                def to_h
         | 
| 23 | 
            -
                   | 
| 24 | 
            -
                  h["node"] = self.node_type_str
         | 
| 25 | 
            -
                  h["name"] = self.name
         | 
| 26 | 
            -
                  h["path"] = self.xpath
         | 
| 27 | 
            -
                  h["children"] = self.children.map{|c| c.to_h} if not children.empty?
         | 
| 28 | 
            -
             | 
| 29 | 
            -
                  self.opts.each do |key,value|
         | 
| 30 | 
            -
                    h[key] = value if not value.nil?
         | 
| 31 | 
            -
                  end
         | 
| 32 | 
            -
             | 
| 33 | 
            -
                  h
         | 
| 34 | 
            -
                end
         | 
| 19 | 
            +
                  return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
         | 
| 35 20 |  | 
| 36 | 
            -
             | 
| 37 | 
            -
                   | 
| 38 | 
            -
                    reservedKeys = %i|node name path children|
         | 
| 21 | 
            +
                  node_hash = {}
         | 
| 22 | 
            +
                  self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
         | 
| 39 23 |  | 
| 40 | 
            -
             | 
| 41 | 
            -
                      node_h[key]
         | 
| 42 | 
            -
                    end
         | 
| 24 | 
            +
                  node_hash[:path] = @xpath if @xpath
         | 
| 43 25 |  | 
| 44 | 
            -
             | 
| 45 | 
            -
                     | 
| 46 | 
            -
                     | 
| 47 | 
            -
             | 
| 48 | 
            -
                    childnodes = children.map{|c| Yasuri.hash2node(c) }
         | 
| 49 | 
            -
                    reservedKeys.each{|key| node_h.delete(key)}
         | 
| 50 | 
            -
                    opt = node_h
         | 
| 51 | 
            -
             | 
| 52 | 
            -
                    self.new(path, name, childnodes, **opt)
         | 
| 26 | 
            +
                  children.each do |child|
         | 
| 27 | 
            +
                    child_node_name = "#{child.node_type_str}_#{child.name}"
         | 
| 28 | 
            +
                    node_hash[child_node_name] = child.to_h
         | 
| 53 29 | 
             
                  end
         | 
| 54 30 |  | 
| 55 | 
            -
                   | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 31 | 
            +
                  node_hash
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                def opts
         | 
| 35 | 
            +
                  {}
         | 
| 58 36 | 
             
                end
         | 
| 59 37 |  | 
| 60 | 
            -
                def  | 
| 61 | 
            -
                   | 
| 38 | 
            +
                def node_type_str
         | 
| 39 | 
            +
                  fail "#{Kernel.__method__} is not implemented in included class."
         | 
| 62 40 | 
             
                end
         | 
| 63 41 | 
             
              end
         | 
| 64 42 | 
             
            end
         | 
| @@ -14,7 +14,7 @@ module Yasuri | |
| 14 14 | 
             
                end
         | 
| 15 15 |  | 
| 16 16 | 
             
                def inject(agent, page, opt = {}, element = page)
         | 
| 17 | 
            -
                  retry_count = opt[:retry_count] ||  | 
| 17 | 
            +
                  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
         | 
| 18 18 |  | 
| 19 19 | 
             
                  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
         | 
| 20 20 |  | 
| @@ -22,12 +22,12 @@ module Yasuri | |
| 22 22 | 
             
                  limit = @limit.nil? ? Float::MAX : @limit
         | 
| 23 23 | 
             
                  while page
         | 
| 24 24 | 
             
                    child_results_kv = @children.map do |child_node|
         | 
| 25 | 
            -
                      child_name = Yasuri. | 
| 25 | 
            +
                      child_name = Yasuri.node_name(child_node.name, opt)
         | 
| 26 26 | 
             
                      [child_name, child_node.inject(agent, page, opt)]
         | 
| 27 27 | 
             
                    end
         | 
| 28 28 | 
             
                    child_results << Hash[child_results_kv]
         | 
| 29 29 |  | 
| 30 | 
            -
                    link = page.search(@xpath).first
         | 
| 30 | 
            +
                    link = page.search(@xpath).first # Todo raise:  link is not found
         | 
| 31 31 | 
             
                    break if link == nil
         | 
| 32 32 |  | 
| 33 33 | 
             
                    link_button = Mechanize::Page::Link.new(link, agent, page)
         | 
| @@ -41,12 +41,13 @@ module Yasuri | |
| 41 41 |  | 
| 42 42 | 
             
                  child_results
         | 
| 43 43 | 
             
                end
         | 
| 44 | 
            +
             | 
| 44 45 | 
             
                def opts
         | 
| 45 46 | 
             
                  {limit:@limit, flatten:@flatten}
         | 
| 46 47 | 
             
                end
         | 
| 47 48 |  | 
| 48 49 | 
             
                def node_type_str
         | 
| 49 | 
            -
                  "pages"
         | 
| 50 | 
            +
                  "pages".freeze
         | 
| 50 51 | 
             
                end
         | 
| 51 52 | 
             
              end
         | 
| 52 53 | 
             
            end
         | 
| @@ -10,12 +10,16 @@ module Yasuri | |
| 10 10 | 
             
                  sub_tags = element.search(@xpath)
         | 
| 11 11 | 
             
                  tree = sub_tags.map do |sub_tag|
         | 
| 12 12 | 
             
                    child_results_kv = @children.map do |child_node|
         | 
| 13 | 
            -
                      child_name = Yasuri. | 
| 13 | 
            +
                      child_name = Yasuri.node_name(child_node.name, opt)
         | 
| 14 14 | 
             
                      [child_name, child_node.inject(agent, page, opt, sub_tag)]
         | 
| 15 15 | 
             
                    end
         | 
| 16 16 | 
             
                    Hash[child_results_kv]
         | 
| 17 17 | 
             
                  end
         | 
| 18 18 | 
             
                  tree.size == 1 ? tree.first : tree
         | 
| 19 19 | 
             
                end # inject
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def node_type_str
         | 
| 22 | 
            +
                  "struct".freeze
         | 
| 23 | 
            +
                end
         | 
| 20 24 | 
             
              end
         | 
| 21 25 | 
             
            end
         | 
| @@ -18,7 +18,6 @@ module Yasuri | |
| 18 18 | 
             
                  @truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
         | 
| 19 19 |  | 
| 20 20 | 
             
                  @proc = proc.nil? ? nil : proc.to_sym
         | 
| 21 | 
            -
             | 
| 22 21 | 
             
                end
         | 
| 23 22 |  | 
| 24 23 | 
             
                def inject(agent, page, opt = {}, element = page)
         | 
| @@ -31,15 +30,16 @@ module Yasuri | |
| 31 30 | 
             
                  end
         | 
| 32 31 |  | 
| 33 32 | 
             
                  text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
         | 
| 34 | 
            -
                  text
         | 
| 35 | 
            -
                end
         | 
| 36 33 |  | 
| 37 | 
            -
             | 
| 38 | 
            -
                  "text"
         | 
| 34 | 
            +
                  text
         | 
| 39 35 | 
             
                end
         | 
| 40 36 |  | 
| 41 37 | 
             
                def opts
         | 
| 42 38 | 
             
                  {truncate:@truncate, proc:@proc}
         | 
| 43 39 | 
             
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                def node_type_str
         | 
| 42 | 
            +
                  "text".freeze
         | 
| 43 | 
            +
                end
         | 
| 44 44 | 
             
              end
         | 
| 45 45 | 
             
            end
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    | @@ -16,7 +16,7 @@ require 'simplecov' | |
| 16 16 | 
             
            require 'coveralls'
         | 
| 17 17 | 
             
            Coveralls.wear!
         | 
| 18 18 |  | 
| 19 | 
            -
            SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
         | 
| 19 | 
            +
            SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
         | 
| 20 20 | 
             
              SimpleCov::Formatter::HTMLFormatter,
         | 
| 21 21 | 
             
              Coveralls::SimpleCov::Formatter
         | 
| 22 22 | 
             
            ]
         | 
| @@ -0,0 +1,83 @@ | |
| 1 | 
            +
            require_relative 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Specs for the `scrape` command of Yasuri::CLI.
            #
            # Every example invokes the command in-process through Thor's #invoke and
            # checks what is written to stdout / stderr.
            # NOTE(review): `uri` is not defined in this file; presumably it is provided
            # by the 'httpserver' shared context -- confirm in spec_helper.
            describe 'Yasuri' do
              include_context 'httpserver'

              before do
                @agent = Mechanize.new
                @index_page = @agent.get(uri)

                # Directory holding the tree fixtures (tree.json, tree.yml, tree_wrong.*).
                @res_dir = File.expand_path('../cli_resources', __FILE__)
              end

              describe 'cli scrape' do
                # --- option validation -------------------------------------------------
                it "require --file or --json option" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {})
                  }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
                end

                it "only one of --file or --json option" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
                  }.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
                end

                it "require --file option is not empty string" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
                  }.to output("ERROR: --file option require not empty argument.\n").to_stderr
                end

                it "require --json option is not empty string" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
                  }.to output("ERROR: --json option require not empty argument.\n").to_stderr
                end

                # --- successful scraping -----------------------------------------------
                it "display text node as simple string" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
                  }.to output("Yasuri Test\n").to_stdout
                end

                it "display texts in single json" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
                  }.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
                end

                it "display text node as simple string via json file" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
                  }.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
                end
                it "display text node as simple string via yaml file" do
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
                  }.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
                end

                # --- malformed trees ---------------------------------------------------
                it "display ERROR when json string is wrong" do
                  wrong_json = '{,,}'
                  # Only the prefix produced by the CLI itself is asserted. The rest of
                  # the line is the JSON parser's own message, whose wording (including
                  # the leading number) depends on the installed json gem version.
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
                  }.to output(/\AERROR: Failed to convert json to yasuri tree\. /).to_stderr
                end
                it "display ERROR when json file contains is wrong" do
                  file_path = "#{@res_dir}/tree_wrong.json"
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
                  }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
                end
                it "display ERROR when yaml file contains is wrong" do
                  file_path = "#{@res_dir}/tree_wrong.yml"
                  expect {
                    Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
                  }.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
                end
              end
            end
         | 
    
        data/spec/yasuri_spec.rb
    CHANGED
    
    | @@ -13,6 +13,7 @@ describe 'Yasuri' do | |
| 13 13 | 
             
                @index_page = @agent.get(@uri)
         | 
| 14 14 | 
             
              end
         | 
| 15 15 |  | 
| 16 | 
            +
             | 
| 16 17 | 
             
              ############
         | 
| 17 18 | 
             
              # yam2tree #
         | 
| 18 19 | 
             
              ############
         | 
| @@ -23,10 +24,8 @@ describe 'Yasuri' do | |
| 23 24 |  | 
| 24 25 | 
             
                it "return text node" do
         | 
| 25 26 | 
             
                  src = <<-EOB
         | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
              path: "/html/body/p[1]"
         | 
| 29 | 
            -
            EOB
         | 
| 27 | 
            +
                    text_content: "/html/body/p[1]"
         | 
| 28 | 
            +
                  EOB
         | 
| 30 29 | 
             
                  generated = Yasuri.yaml2tree(src)
         | 
| 31 30 | 
             
                  original  = Yasuri::TextNode.new('/html/body/p[1]', "content")
         | 
| 32 31 |  | 
| @@ -35,10 +34,9 @@ EOB | |
| 35 34 |  | 
| 36 35 | 
             
                it "return text node as symbol" do
         | 
| 37 36 | 
             
                  src = <<-EOB
         | 
| 38 | 
            -
            : | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
            EOB
         | 
| 37 | 
            +
                  :text_content:
         | 
| 38 | 
            +
                    :path: "/html/body/p[1]"
         | 
| 39 | 
            +
                  EOB
         | 
| 42 40 | 
             
                  generated = Yasuri.yaml2tree(src)
         | 
| 43 41 | 
             
                  original  = Yasuri::TextNode.new('/html/body/p[1]', "content")
         | 
| 44 42 |  | 
| @@ -48,14 +46,10 @@ EOB | |
| 48 46 | 
             
                it "return LinksNode/TextNode" do
         | 
| 49 47 |  | 
| 50 48 | 
             
                  src = <<-EOB
         | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
                - content:
         | 
| 56 | 
            -
                    node: text
         | 
| 57 | 
            -
                    path: "/html/body/p"
         | 
| 58 | 
            -
            EOB
         | 
| 49 | 
            +
                  links_root:
         | 
| 50 | 
            +
                    path: "/html/body/a"
         | 
| 51 | 
            +
                    text_content: "/html/body/p"
         | 
| 52 | 
            +
                  EOB
         | 
| 59 53 | 
             
                  generated = Yasuri.yaml2tree(src)
         | 
| 60 54 | 
             
                  original  = Yasuri::LinksNode.new('/html/body/a', "root", [
         | 
| 61 55 | 
             
                                Yasuri::TextNode.new('/html/body/p', "content"),
         | 
| @@ -66,21 +60,13 @@ EOB | |
| 66 60 |  | 
| 67 61 | 
             
                it "return StructNode/StructNode/[TextNode,TextNode]" do
         | 
| 68 62 | 
             
                  src = <<-EOB
         | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
                    children:
         | 
| 77 | 
            -
                      - title:
         | 
| 78 | 
            -
                          node: text
         | 
| 79 | 
            -
                          path: "./td[1]"
         | 
| 80 | 
            -
                      - pub_date:
         | 
| 81 | 
            -
                          node: text
         | 
| 82 | 
            -
                          path: "./td[2]"
         | 
| 83 | 
            -
            EOB
         | 
| 63 | 
            +
                  struct_tables:
         | 
| 64 | 
            +
                    path: "/html/body/table"
         | 
| 65 | 
            +
                    struct_table:
         | 
| 66 | 
            +
                      path: "./tr"
         | 
| 67 | 
            +
                      text_title: "./td[1]"
         | 
| 68 | 
            +
                      text_pub_date: "./td[2]"
         | 
| 69 | 
            +
                  EOB
         | 
| 84 70 |  | 
| 85 71 | 
             
                  generated = Yasuri.yaml2tree(src)
         | 
| 86 72 | 
             
                  original  = Yasuri::StructNode.new('/html/body/table', "tables", [
         | 
| @@ -105,10 +91,10 @@ EOB | |
| 105 91 | 
             
                end
         | 
| 106 92 |  | 
| 107 93 | 
             
                it "return TextNode" do
         | 
| 108 | 
            -
                  src = %q| | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
             | 
| 94 | 
            +
                  src = %q|
         | 
| 95 | 
            +
                  {
         | 
| 96 | 
            +
                    "text_content": "/html/body/p[1]"
         | 
| 97 | 
            +
                  }|
         | 
| 112 98 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 113 99 | 
             
                  original  = Yasuri::TextNode.new('/html/body/p[1]', "content")
         | 
| 114 100 |  | 
| @@ -116,30 +102,24 @@ EOB | |
| 116 102 | 
             
                end
         | 
| 117 103 |  | 
| 118 104 | 
             
                it "return TextNode with truncate_regexp" do
         | 
| 119 | 
            -
                  src = %q| | 
| 120 | 
            -
             | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
             | 
| 105 | 
            +
                  src = %q|
         | 
| 106 | 
            +
                  {
         | 
| 107 | 
            +
                    "text_content": {
         | 
| 108 | 
            +
                      "path": "/html/body/p[1]",
         | 
| 109 | 
            +
                      "truncate"  : "^[^,]+"
         | 
| 110 | 
            +
                    }
         | 
| 111 | 
            +
                  }|
         | 
| 124 112 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 125 113 | 
             
                  original  = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
         | 
| 126 114 | 
             
                  compare_generated_vs_original(generated, original, @index_page)
         | 
| 127 115 | 
             
                end
         | 
| 128 116 |  | 
| 129 117 | 
             
                it "return MapNode with TextNodes" do
         | 
| 130 | 
            -
                  src = %q| | 
| 131 | 
            -
             | 
| 132 | 
            -
             | 
| 133 | 
            -
             | 
| 134 | 
            -
             | 
| 135 | 
            -
                                  "path"  : "/html/body/p[1]"
         | 
| 136 | 
            -
                                },
         | 
| 137 | 
            -
                                { "node"  : "text",
         | 
| 138 | 
            -
                                  "name"  : "content02",
         | 
| 139 | 
            -
                                  "path"  : "/html/body/p[2]"
         | 
| 140 | 
            -
                                }
         | 
| 141 | 
            -
                              ]
         | 
| 142 | 
            -
                            }|
         | 
| 118 | 
            +
                  src = %q|
         | 
| 119 | 
            +
                  {
         | 
| 120 | 
            +
                    "text_content01": "/html/body/p[1]",
         | 
| 121 | 
            +
                    "text_content02": "/html/body/p[2]"
         | 
| 122 | 
            +
                  }|
         | 
| 143 123 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 144 124 | 
             
                  original  = Yasuri::MapNode.new('parent', [
         | 
| 145 125 | 
             
                    Yasuri::TextNode.new('/html/body/p[1]', "content01"),
         | 
| @@ -149,14 +129,14 @@ EOB | |
| 149 129 | 
             
                end
         | 
| 150 130 |  | 
| 151 131 | 
             
                it "return LinksNode/TextNode" do
         | 
| 152 | 
            -
                  src = %q| | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
             | 
| 158 | 
            -
             | 
| 159 | 
            -
             | 
| 132 | 
            +
                  src = %q|
         | 
| 133 | 
            +
                  {
         | 
| 134 | 
            +
                    "links_root": {
         | 
| 135 | 
            +
                      "path": "/html/body/a",
         | 
| 136 | 
            +
                      "text_content": "/html/body/p"
         | 
| 137 | 
            +
                    }
         | 
| 138 | 
            +
                  }|
         | 
| 139 | 
            +
             | 
| 160 140 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 161 141 | 
             
                  original  = Yasuri::LinksNode.new('/html/body/a', "root", [
         | 
| 162 142 | 
             
                                Yasuri::TextNode.new('/html/body/p', "content"),
         | 
| @@ -166,14 +146,13 @@ EOB | |
| 166 146 | 
             
                end
         | 
| 167 147 |  | 
| 168 148 | 
             
                it "return PaginateNode/TextNode" do
         | 
| 169 | 
            -
                  src = %q| | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 172 | 
            -
             | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
             | 
| 176 | 
            -
                           }|
         | 
| 149 | 
            +
                  src = %q|
         | 
| 150 | 
            +
                  {
         | 
| 151 | 
            +
                    "pages_root": {
         | 
| 152 | 
            +
                      "path": "/html/body/nav/span/a[@class=\'next\']",
         | 
| 153 | 
            +
                      "text_content": "/html/body/p"
         | 
| 154 | 
            +
                    }
         | 
| 155 | 
            +
                  }|
         | 
| 177 156 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 178 157 | 
             
                  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
         | 
| 179 158 | 
             
                               Yasuri::TextNode.new('/html/body/p', "content"),
         | 
| @@ -185,15 +164,14 @@ EOB | |
| 185 164 | 
             
                end
         | 
| 186 165 |  | 
| 187 166 | 
             
                it "return PaginateNode/TextNode with limit" do
         | 
| 188 | 
            -
                  src = %q| | 
| 189 | 
            -
             | 
| 190 | 
            -
             | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 193 | 
            -
             | 
| 194 | 
            -
             | 
| 195 | 
            -
             | 
| 196 | 
            -
                           }|
         | 
| 167 | 
            +
                  src = %q|
         | 
| 168 | 
            +
                  {
         | 
| 169 | 
            +
                    "pages_root": {
         | 
| 170 | 
            +
                      "path": "/html/body/nav/span/a[@class=\'next\']",
         | 
| 171 | 
            +
                      "limit": 2,
         | 
| 172 | 
            +
                      "text_content": "/html/body/p"
         | 
| 173 | 
            +
                    }
         | 
| 174 | 
            +
                  }|
         | 
| 197 175 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 198 176 | 
             
                  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
         | 
| 199 177 | 
             
                               Yasuri::TextNode.new('/html/body/p', "content"),
         | 
| @@ -205,24 +183,17 @@ EOB | |
| 205 183 | 
             
                end
         | 
| 206 184 |  | 
| 207 185 | 
             
                it "return StructNode/StructNode/[TextNode,TextNode]" do
         | 
| 208 | 
            -
             | 
| 209 | 
            -
             | 
| 210 | 
            -
             | 
| 211 | 
            -
             | 
| 212 | 
            -
             | 
| 213 | 
            -
             | 
| 214 | 
            -
             | 
| 215 | 
            -
             | 
| 216 | 
            -
             | 
| 217 | 
            -
             | 
| 218 | 
            -
             | 
| 219 | 
            -
                                   },
         | 
| 220 | 
            -
                                   { "node" : "text",
         | 
| 221 | 
            -
                                     "name" : "pub_date",
         | 
| 222 | 
            -
                                     "path" : "./td[2]"
         | 
| 223 | 
            -
                                   }]
         | 
| 224 | 
            -
                               }]
         | 
| 225 | 
            -
                           }|
         | 
| 186 | 
            +
                  src = %q|
         | 
| 187 | 
            +
                  {
         | 
| 188 | 
            +
                    "struct_tables": {
         | 
| 189 | 
            +
                      "path": "/html/body/table",
         | 
| 190 | 
            +
                      "struct_table": {
         | 
| 191 | 
            +
                        "path": "./tr",
         | 
| 192 | 
            +
                        "text_title": "./td[1]",
         | 
| 193 | 
            +
                        "text_pub_date": "./td[2]"
         | 
| 194 | 
            +
                      }
         | 
| 195 | 
            +
                    }
         | 
| 196 | 
            +
                  }|
         | 
| 226 197 | 
             
                  generated = Yasuri.json2tree(src)
         | 
| 227 198 | 
             
                  original  = Yasuri::StructNode.new('/html/body/table', "tables", [
         | 
| 228 199 | 
             
                    Yasuri::StructNode.new('./tr', "table", [
         | 
| @@ -235,22 +206,22 @@ EOB | |
| 235 206 | 
             
                end
         | 
| 236 207 | 
             
              end
         | 
| 237 208 |  | 
| 209 | 
            +
             | 
| 238 210 | 
             
              #############
         | 
| 239 211 | 
             
              # tree2json #
         | 
| 240 212 | 
             
              #############
         | 
| 241 213 | 
             
              describe '.tree2json' do
         | 
| 242 214 | 
             
                it "return empty json" do
         | 
| 243 | 
            -
                   | 
| 244 | 
            -
                  expect(json).to match "{}"
         | 
| 215 | 
            +
                  expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
         | 
| 245 216 | 
             
                end
         | 
| 246 217 |  | 
| 247 218 | 
             
                it "return text node" do
         | 
| 248 219 | 
             
                  node = Yasuri::TextNode.new("/html/head/title", "title")
         | 
| 249 220 | 
             
                  json = Yasuri.tree2json(node)
         | 
| 250 | 
            -
                  expected_str = %q| | 
| 251 | 
            -
             | 
| 252 | 
            -
             | 
| 253 | 
            -
             | 
| 221 | 
            +
                  expected_str = %q|
         | 
| 222 | 
            +
                  {
         | 
| 223 | 
            +
                    "text_title": "/html/head/title"
         | 
| 224 | 
            +
                  }|
         | 
| 254 225 | 
             
                  expected = JSON.parse(expected_str)
         | 
| 255 226 | 
             
                  actual   = JSON.parse(json)
         | 
| 256 227 | 
             
                  expect(actual).to match expected
         | 
| @@ -259,11 +230,13 @@ EOB | |
| 259 230 | 
             
                it "return text node with truncate_regexp" do
         | 
| 260 231 | 
             
                  node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
         | 
| 261 232 | 
             
                  json = Yasuri.tree2json(node)
         | 
| 262 | 
            -
                  expected_str = %q| | 
| 263 | 
            -
             | 
| 264 | 
            -
             | 
| 265 | 
            -
             | 
| 266 | 
            -
             | 
| 233 | 
            +
                  expected_str = %q|
         | 
| 234 | 
            +
                  {
         | 
| 235 | 
            +
                    "text_title": {
         | 
| 236 | 
            +
                      "path": "/html/head/title",
         | 
| 237 | 
            +
                      "truncate": "^[^,]+"
         | 
| 238 | 
            +
                    }
         | 
| 239 | 
            +
                  }|
         | 
| 267 240 | 
             
                  expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
         | 
| 268 241 | 
             
                  actual   = Yasuri.tree2json(Yasuri.json2tree(json))
         | 
| 269 242 | 
             
                  expect(actual).to match expected
         | 
| @@ -276,19 +249,12 @@ EOB | |
| 276 249 | 
             
                  ])
         | 
| 277 250 | 
             
                  actual_json = Yasuri.tree2json(tree)
         | 
| 278 251 |  | 
| 279 | 
            -
                  expected_json = %q| | 
| 280 | 
            -
             | 
| 281 | 
            -
                    " | 
| 282 | 
            -
             | 
| 283 | 
            -
                        "name"  : "content01",
         | 
| 284 | 
            -
                        "path"  : "/html/body/p[1]"
         | 
| 285 | 
            -
                      },
         | 
| 286 | 
            -
                      { "node"  : "text",
         | 
| 287 | 
            -
                        "name"  : "content02",
         | 
| 288 | 
            -
                        "path"  : "/html/body/p[2]"
         | 
| 289 | 
            -
                      }
         | 
| 290 | 
            -
                    ]
         | 
| 252 | 
            +
                  expected_json = %q|
         | 
| 253 | 
            +
                  {
         | 
| 254 | 
            +
                    "text_content01": "/html/body/p[1]",
         | 
| 255 | 
            +
                    "text_content02": "/html/body/p[2]"
         | 
| 291 256 | 
             
                  }|
         | 
| 257 | 
            +
             | 
| 292 258 | 
             
                  expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
         | 
| 293 259 | 
             
                  actual   = Yasuri.tree2json(Yasuri.json2tree(actual_json))
         | 
| 294 260 | 
             
                  expect(actual).to match expected
         | 
| @@ -299,14 +265,14 @@ EOB | |
| 299 265 | 
             
                            Yasuri::TextNode.new('/html/body/p', "content"),
         | 
| 300 266 | 
             
                          ])
         | 
| 301 267 | 
             
                  json   = Yasuri.tree2json(tree)
         | 
| 302 | 
            -
             | 
| 303 | 
            -
             | 
| 304 | 
            -
             | 
| 305 | 
            -
             | 
| 306 | 
            -
             | 
| 307 | 
            -
             | 
| 308 | 
            -
             | 
| 309 | 
            -
             | 
| 268 | 
            +
             | 
| 269 | 
            +
                  expected_src = %q|
         | 
| 270 | 
            +
                  {
         | 
| 271 | 
            +
                    "links_root": {
         | 
| 272 | 
            +
                      "path": "/html/body/a",
         | 
| 273 | 
            +
                      "text_content":"/html/body/p"
         | 
| 274 | 
            +
                    }
         | 
| 275 | 
            +
                  }|
         | 
| 310 276 | 
             
                  expected  = JSON.parse(expected_src)
         | 
| 311 277 | 
             
                  actual    = JSON.parse(json)
         | 
| 312 278 | 
             
                  expect(actual).to match expected
         | 
| @@ -318,25 +284,44 @@ EOB | |
| 318 284 | 
             
                         ], limit:10)
         | 
| 319 285 |  | 
| 320 286 | 
             
                  json   = Yasuri.tree2json(tree)
         | 
| 321 | 
            -
                  expected_src = %q| | 
| 322 | 
            -
             | 
| 323 | 
            -
             | 
| 324 | 
            -
             | 
| 325 | 
            -
             | 
| 326 | 
            -
             | 
| 327 | 
            -
             | 
| 328 | 
            -
             | 
| 329 | 
            -
             | 
| 330 | 
            -
                                     }|
         | 
| 287 | 
            +
                  expected_src = %q|
         | 
| 288 | 
            +
                  {
         | 
| 289 | 
            +
                    "pages_root": {
         | 
| 290 | 
            +
                      "path": "/html/body/nav/span/a[@class='next']",
         | 
| 291 | 
            +
                      "limit": 10,
         | 
| 292 | 
            +
                      "flatten": false,
         | 
| 293 | 
            +
                      "text_content": "/html/body/p"
         | 
| 294 | 
            +
                    }
         | 
| 295 | 
            +
                  }|
         | 
| 331 296 | 
             
                  expected  = JSON.parse(expected_src)
         | 
| 332 297 | 
             
                  actual    = JSON.parse(json)
         | 
| 333 298 | 
             
                  expect(actual).to match expected
         | 
| 334 299 | 
             
                end
         | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 | 
            -
             | 
| 338 300 | 
             
              end
         | 
| 339 301 |  | 
| 302 | 
            +
              it "return StructNode/StructNode/[TextNode,TextNode]" do
         | 
| 303 | 
            +
                tree  = Yasuri::StructNode.new('/html/body/table', "tables", [
         | 
| 304 | 
            +
                  Yasuri::StructNode.new('./tr', "table", [
         | 
| 305 | 
            +
                    Yasuri::TextNode.new('./td[1]', "title"),
         | 
| 306 | 
            +
                    Yasuri::TextNode.new('./td[2]', "pub_date"),
         | 
| 307 | 
            +
                  ])
         | 
| 308 | 
            +
                ])
         | 
| 309 | 
            +
                json   = Yasuri.tree2json(tree)
         | 
| 310 | 
            +
                expected_src = %q|
         | 
| 311 | 
            +
                {
         | 
| 312 | 
            +
                  "struct_tables": {
         | 
| 313 | 
            +
                    "path": "/html/body/table",
         | 
| 314 | 
            +
                    "struct_table": {
         | 
| 315 | 
            +
                      "path": "./tr",
         | 
| 316 | 
            +
                      "text_title": "./td[1]",
         | 
| 317 | 
            +
                      "text_pub_date": "./td[2]"
         | 
| 318 | 
            +
                    }
         | 
| 319 | 
            +
                  }
         | 
| 320 | 
            +
                }|
         | 
| 321 | 
            +
                expected  = JSON.parse(expected_src)
         | 
| 322 | 
            +
                actual    = JSON.parse(json)
         | 
| 323 | 
            +
                expect(actual).to match expected
         | 
| 324 | 
            +
              end
         | 
| 340 325 |  | 
| 341 326 | 
             
              it 'has a version number' do
         | 
| 342 327 | 
             
                expect(Yasuri::VERSION).not_to be nil
         |