RubyGems - digger - Versions diffs - 0.0.1 - Mend

digger 0.0.1

Files changed (15) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 71e41cc25211835ca901f56a0514d6d4f94326d9
+  data.tar.gz: e7f954b64cb216e5cda6391940576fe0f188553e
+SHA512:
+  metadata.gz: 78a0717ae08e03a0325dc338411f3aae055a2f19c0849802d2aa8c0b17b6fabeacc98a9dea66142bf8f5058b08c6d4043244b65db944f28dc5f8317ccc641f4f
+  data.tar.gz: 186bfded593330616d7849dd519b8f47a5bd3e1ed2dce7bb33580dcbdb4e61bbb1eabb49bcf7a7c23d6cd93d56fed8e93cc41d366797914b55e83da7c0638437

data/.gitignore ADDED Viewed

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Dependencies are declared in digger.gemspec; the `gemspec` call below loads them from there.
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2015 binz
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+# Digger
+Digger digs structured information out of web pages using regular-expression and CSS patterns.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'digger'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install digger
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/digger/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,2 @@
+require "bundler/gem_tasks"
+

data/digger.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'digger/version'
+Gem::Specification.new do |spec|
+  spec.name          = "digger"
+  spec.version       = Digger::VERSION
+  spec.authors       = ["binz"]
+  spec.email         = ["xinkiang@gmail.com"]
+  spec.summary       = %q{Dig need stractual infomation from web page.}
+  spec.description   = %q{}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.7"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
+  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
+end

data/lib/digger.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'digger/version'
+require 'digger/page'
+require 'digger/http'
+require 'digger/pattern'
+require 'digger/model'
+module Digger
+  # Top-level namespace of the gem; its contents are loaded by the require lines of this file.
+end

data/lib/digger/http.rb ADDED Viewed

@@ -0,0 +1,284 @@
+require 'net/http'
+require 'http/cookie'
+require 'zlib'
+require 'digger/page'
+# https://github.com/taganaka/polipus/blob/master/lib/polipus/http.rb
+module Digger
+  #
+  # Small HTTP client built on Net::HTTP. It fetches URLs as Digger::Page
+  # objects, follows same-host redirects, keeps one open connection per
+  # host/port and optionally handles cookies, proxies and HTTP Basic auth.
+  #
+  # Option keys read from the +opts+ hash (all optional):
+  # :redirect_limit, :user_agent, :proxy_host, :proxy_port, :proxy_user,
+  # :proxy_pass, :proxy_host_port, :read_timeout, :open_timeout,
+  # :accept_cookies, :cookie_jar, :http_user, :http_password,
+  # :connection_max_hits, :ssl_verify_mode, :logger, :verbose
+  #
+  class HTTP
+    # Maximum number of redirects to follow on each get_response
+    REDIRECT_LIMIT = 5
+    # Maximum number of attempts for one request before the error is raised
+    MAX_ATTEMPTS = 3
+    # Network-level failures that are retried and finally reported on the Page
+    RESCUABLE_ERRORS = [
+      EOFError,
+      Errno::ECONNREFUSED,
+      Errno::ECONNRESET,
+      Errno::EHOSTUNREACH,
+      Errno::EINVAL,
+      Errno::EPIPE,
+      Errno::ETIMEDOUT,
+      Net::HTTPBadResponse,
+      Net::HTTPHeaderSyntaxError,
+      Net::ProtocolError,
+      SocketError,
+      Timeout::Error,
+      Zlib::DataError,
+      Zlib::GzipFile::Error
+    ].freeze
+    #
+    # +opts+ is the option hash described on the class; it is stored as is.
+    #
+    def initialize(opts = {})
+      @connections = {}
+      @connections_hits = {}
+      @opts = opts
+    end
+    #
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
+    #
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects. When one of RESCUABLE_ERRORS is raised, a
+    # one-element array holding a Page with its +error+ set is returned.
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
+      url = URI(url)
+      pages = []
+      get(url, referer) do |response, code, location, redirect_to, response_time|
+        handle_compression response
+        pages << Page.new(location, body: response.body,
+                                    code: code,
+                                    headers: response.to_hash,
+                                    referer: referer,
+                                    depth: depth,
+                                    redirect_to: redirect_to,
+                                    response_time: response_time,
+                                    fetched_at: Time.now.to_i)
+      end
+      pages
+    rescue *RESCUABLE_ERRORS => e
+      if verbose?
+        puts e.inspect
+        puts e.backtrace
+      end
+      [Page.new(url, error: e, referer: referer, depth: depth)]
+    end
+    #
+    # The maximum number of redirects to follow
+    #
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+    #
+    # The user-agent string which will be sent with each request,
+    # or nil if no such option is set. When the option responds to
+    # +sample+ (e.g. an Array), one entry is picked per call.
+    #
+    def user_agent
+      agent = @opts[:user_agent]
+      agent.respond_to?(:sample) ? agent.sample : agent
+    end
+    #
+    # The proxy address string
+    #
+    def proxy_host
+      callable_opt(:proxy_host)
+    end
+    #
+    # The proxy port
+    #
+    def proxy_port
+      callable_opt(:proxy_port)
+    end
+    #
+    # The proxy username
+    #
+    def proxy_user
+      callable_opt(:proxy_user)
+    end
+    #
+    # The proxy password
+    #
+    def proxy_pass
+      callable_opt(:proxy_pass)
+    end
+    #
+    # Shorthand to get proxy info with a single call
+    # It returns an array of ['addr', port, 'user', 'pass']
+    #
+    def proxy_host_port
+      callable_opt(:proxy_host_port)
+    end
+    #
+    # HTTP read timeout in seconds
+    #
+    def read_timeout
+      @opts[:read_timeout]
+    end
+    #
+    # HTTP open timeout in seconds
+    #
+    def open_timeout
+      @opts[:open_timeout]
+    end
+    #
+    # Does this HTTP client accept cookies from the server?
+    #
+    def accept_cookies?
+      @opts[:accept_cookies]
+    end
+    #
+    # The cookie jar in use; created on first access when none was given
+    #
+    def cookie_jar
+      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
+    end
+    private
+    #
+    # Value of option *key*; when the stored value responds to +call+ it is
+    # invoked with this client and its result is used instead.
+    #
+    def callable_opt(key)
+      value = @opts[key]
+      value.respond_to?(:call) ? value.call(self) : value
+    end
+    #
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, URI location, redirect
+    # target (or nil) and response time in milliseconds for each response.
+    #
+    def get(url, referer = nil)
+      limit = redirect_limit
+      loc = url
+      loop do
+        # if redirected to a relative url, merge it with the host of the original
+        # request url
+        loc = url.merge(loc) if loc.relative?
+        response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = redirect_target(response)
+        yield response, code, loc, redirect_to, response_time
+        limit -= 1
+        break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+      end
+    end
+    #
+    # Normalized URI from the Location header of a redirect response, or nil
+    # when the response is not a redirect or the header is missing/invalid
+    # (a bare URI(nil) would raise an error that nobody rescues).
+    #
+    def redirect_target(response)
+      return nil unless response.is_a?(Net::HTTPRedirection)
+      location = response['location']
+      return nil if location.nil? || location.empty?
+      URI(location).normalize
+    rescue URI::InvalidURIError
+      nil
+    end
+    #
+    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
+    # Returns [response, response_time_in_ms]. A request failing with one of
+    # RESCUABLE_ERRORS is attempted up to MAX_ATTEMPTS times on a fresh
+    # connection before the error is re-raised.
+    #
+    def get_response(url, referer = nil)
+      # Net::HTTP rejects an empty request path, e.g. for "http://example.com"
+      path = url.path.to_s.empty? ? '/' : url.path
+      full_path = url.query.nil? ? path : "#{path}?#{url.query}"
+      headers = {}
+      agent = user_agent
+      headers['User-Agent'] = agent if agent
+      headers['Referer'] = referer.to_s if referer
+      headers['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
+      headers['Accept-Encoding'] = 'gzip,deflate'
+      retries = 0
+      begin
+        start = Time.now
+        # format request
+        req = Net::HTTP::Get.new(full_path, headers)
+        # HTTP Basic authentication
+        req.basic_auth @opts[:http_user], @opts[:http_password] if @opts[:http_user]
+        # urls auth schema has higher priority
+        req.basic_auth url.user, url.password if url.user
+        response = connection(url).request(req)
+        finish = Time.now
+        response_time = ((finish - start) * 1000).round
+        cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
+        return response, response_time
+      rescue *RESCUABLE_ERRORS => e
+        puts e.inspect if verbose?
+        # Only drop the broken connection here: reconnecting inside this
+        # rescue clause could raise again and bypass the retry logic.
+        close_connection(url)
+        retries += 1
+        retry if retries < MAX_ATTEMPTS
+        raise
+      end
+    end
+    #
+    # Started Net::HTTP connection for the host/port of *url*. A cached
+    # connection is reused until it has served :connection_max_hits requests
+    # (when that option is set); after that a new one is opened.
+    #
+    def connection(url)
+      @connections[url.host] ||= {}
+      @connections_hits[url.host] ||= {}
+      cached = @connections[url.host][url.port]
+      return refresh_connection(url) unless cached
+      max_hits = @opts[:connection_max_hits]
+      if max_hits && @connections_hits[url.host][url.port] >= max_hits
+        @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
+        return refresh_connection(url)
+      end
+      @connections_hits[url.host][url.port] += 1
+      cached
+    end
+    #
+    # Open (and cache) a new connection for *url*, closing any previous one
+    # for the same host/port so its socket is not leaked.
+    #
+    def refresh_connection(url)
+      if @opts[:logger] && proxy_host && proxy_port
+        @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
+      end
+      close_connection(url)
+      @connections[url.host] ||= {}
+      @connections_hits[url.host] ||= {}
+      # :proxy_host_port has higher priority than the individual proxy options
+      if @opts[:proxy_host_port].nil?
+        p_host = proxy_host
+        p_port = proxy_port
+        p_user = proxy_user
+        p_pass = proxy_pass
+      else
+        p_host, p_port, p_user, p_pass = proxy_host_port
+      end
+      http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)
+      http.read_timeout = read_timeout if read_timeout
+      http.open_timeout = open_timeout if open_timeout
+      if url.scheme == 'https'
+        http.use_ssl = true
+        # NOTE(security): certificate verification is OFF by default, as it
+        # always was in this client. Pass
+        # ssl_verify_mode: OpenSSL::SSL::VERIFY_PEER to enable it.
+        http.verify_mode = @opts[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
+      end
+      @connections_hits[url.host][url.port] = 1
+      @connections[url.host][url.port] = http.start
+    end
+    #
+    # Forget the cached connection for *url* and close it if it is open.
+    # Closing is best effort: a failure while shutting down an already
+    # broken socket must not mask the error that led here.
+    #
+    def close_connection(url)
+      http = (@connections[url.host] || {}).delete(url.port)
+      (@connections_hits[url.host] || {}).delete(url.port)
+      http.finish if http && http.started?
+    rescue StandardError
+      nil
+    end
+    def verbose?
+      @opts[:verbose]
+    end
+    #
+    # Allowed to connect to the requested url?
+    # Only redirects without a host or to the host of *from_url* are followed.
+    #
+    def allowed?(to_url, from_url)
+      to_url.host.nil? || (to_url.host == from_url.host)
+    end
+    #
+    # Decompress the response body in place according to its
+    # Content-Encoding header. Responses without a body are left untouched.
+    #
+    def handle_compression(response)
+      body = response.body
+      return if body.nil? || body.empty?
+      case response['content-encoding']
+      when 'gzip', 'x-gzip'
+        body.replace Zlib::GzipReader.new(StringIO.new(body)).read
+      when 'deflate'
+        body.replace Zlib::Inflate.inflate(body)
+      end
+    end
+  end
+end

data/lib/digger/model.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require 'digger/pattern'
+module Digger
+  class Model
+    @@patterns = {}
+    class << self
+      def pattern_config
+        @@patterns[self.name] ||= {}
+      end
+      Pattern::TYPES.each do |method|
+        define_method method, ->(pairs, &block){
+          pairs.each_pair do |key, value|
+            pattern_config[key] = Pattern.new(type: method, value: value, block: block)
+          end
+        }
+      end
+      def index_page
+      end
+      def one_page
+      end
+    end
+    def match_page(page)
+      result = {}
+      self.class.pattern_config.each_pair do |key, pattern|
+        result[key] = pattern.match_page(page)
+      end
+      result
+    end
+    def dig(url)
+      client = Digger::HTTP.new
+      page = client.fetch_page(url)
+      match_page(page)
+    end
+  end
+end

data/lib/digger/page.rb ADDED Viewed

@@ -0,0 +1,279 @@
+require 'nokogiri'
+require 'json'
+require 'ostruct'
+require 'set'
+require 'kconv'
+# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
+module Digger
+  #
+  # One fetched (or failed) HTTP response: URL, status code, headers, body
+  # and crawl bookkeeping, plus helpers to parse the body with Nokogiri,
+  # extract links and (de)serialize the page.
+  #
+  class Page
+    attr_reader :url
+    # The raw HTTP response body of the page
+    attr_reader :body
+    # Headers of the HTTP response
+    attr_reader :headers
+    # URL of the page this one redirected to, if any
+    attr_reader :redirect_to
+    # Exception object, if one was raised during HTTP#fetch_page
+    attr_reader :error
+    # Integer response code of the page
+    attr_accessor :code
+    # Depth of this page from the root of the crawl.
+    attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
+    # OpenStruct it holds users defined data
+    attr_accessor :user_data
+    attr_accessor :aliases
+    attr_accessor :domain_aliases
+    # Whether the current page should be stored
+    # Default: true
+    attr_accessor :storable
+    attr_accessor :fetched_at
+    #
+    # Create a new page for *url*. Recognised +params+ keys: :code, :headers,
+    # :aka, :referer, :depth, :redirect_to, :response_time, :body, :error,
+    # :domain_aliases, :fetched_at.
+    #
+    def initialize(url, params = {})
+      @url = URI(url)
+      @code = params[:code]
+      @headers = params[:headers] || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(params[:aka]).compact
+      @referer = params[:referer]
+      @depth = params[:depth] || 0
+      # Resolved before @body is assigned, so no document (and therefore no
+      # <base href>) is available yet: the target is merged with the page URL.
+      @redirect_to = to_absolute(params[:redirect_to])
+      @response_time = params[:response_time]
+      @body = params[:body]
+      @error = params[:error]
+      @fetched = !params[:code].nil?
+      @user_data = OpenStruct.new
+      # plain || so the caller's params hash is not modified
+      @domain_aliases = params[:domain_aliases] || []
+      @storable = true
+      @fetched_at = params[:fetched_at]
+    end
+    #
+    # Title of the HTML document, or nil when the body is not parseable HTML
+    #
+    def title
+      doc.title if doc
+    end
+    #
+    # Array of distinct A tag HREFs from the page, restricted to the page's
+    # own domain (see in_domain?). Computed once and then cached.
+    #
+    def links
+      return @links.to_a unless @links.nil?
+      @links = Set.new
+      return [] unless doc
+      doc.search('//a[@href]').each do |a|
+        href = a['href']
+        next if href.nil? || href.empty?
+        # hrefs that cannot be turned into an absolute URI are skipped
+        abs = begin
+          to_absolute(href)
+        rescue StandardError
+          nil
+        end
+        @links << abs if abs && in_domain?(abs)
+      end
+      @links.to_a
+    end
+    #
+    # Nokogiri document for the HTML body; nil when there is no body, the
+    # content type is not HTML, or parsing raised an error.
+    #
+    def doc
+      @doc ||= begin
+        Nokogiri::HTML(body) if !body.nil? && html?
+      rescue StandardError
+        nil
+      end
+    end
+    #
+    # Discard links, a next call of page.links will return an empty array
+    #
+    def discard_links!
+      @links = []
+    end
+    #
+    # Delete the Nokogiri document and response body to conserve memory
+    #
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = @body = nil
+    end
+    #
+    # Was the page successfully fetched?
+    # +true+ if the page was fetched with no error, +false+ otherwise.
+    #
+    def fetched?
+      @fetched
+    end
+    #
+    # The content-type returned by the HTTP request for this page
+    #
+    def content_type
+      headers['content-type'].first
+    end
+    #
+    # Returns +true+ if the page is a HTML document, returns +false+
+    # otherwise.
+    #
+    def html?
+      # the "+" of xhtml+xml must be escaped or it acts as a quantifier
+      !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
+    end
+    #
+    # Returns +true+ if the page is a HTTP redirect, returns +false+
+    # otherwise.
+    #
+    def redirect?
+      (300...400).include?(@code)
+    end
+    #
+    # Returns +true+ if the page is a HTTP success, returns +false+
+    # otherwise.
+    #
+    def success?
+      (200..206).include?(@code)
+    end
+    #
+    # Returns +true+ if the page was not found (returned 404 code),
+    # returns +false+ otherwise.
+    #
+    def not_found?
+      404 == @code
+    end
+    #
+    # Base URI from the HTML doc head element
+    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+    # Returns nil when there is no document or no usable base href.
+    #
+    def base
+      if @base.nil? && doc
+        href = doc.search('//head/base/@href')
+        @base = begin
+          URI(href.to_s) unless href.nil?
+        rescue StandardError
+          nil
+        end
+      end
+      return nil if @base && @base.to_s.empty?
+      @base
+    end
+    #
+    # Converts relative URL *link* (a String or URI) into an absolute URL
+    # based on the <base href> of the page or, without one, its location.
+    # Returns nil for a nil link or one that cannot be parsed.
+    #
+    def to_absolute(link)
+      return nil if link.nil?
+      # URI objects (e.g. redirect targets) are accepted as well
+      link = link.to_s
+      # remove anchor, then normalise the percent-escaping
+      link =
+        begin
+          # URI.encode/URI.decode were removed in Ruby 3.0; the default
+          # parser offers the same escaping rules.
+          parser = URI::DEFAULT_PARSER
+          parser.escape(parser.unescape(link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
+        rescue URI::Error
+          return nil
+        end
+      relative = begin
+                   URI(link)
+                 rescue URI::Error
+                   return nil
+                 end
+      absolute = base ? base.merge(relative) : @url.merge(relative)
+      absolute.path = '/' if absolute.path.empty?
+      absolute
+    end
+    #
+    # Returns +true+ if *uri* is in the same domain as the page, returns
+    # +false+ otherwise
+    #
+    def in_domain?(uri)
+      @domain_aliases ||= []
+      uri.host == @url.host || @domain_aliases.include?(uri.host)
+    end
+    #
+    # Hash representation with string keys; headers are stored as a
+    # Marshal dump and can be restored with Page.from_hash.
+    #
+    def to_hash
+      {
+        'url'           => @url.to_s,
+        'headers'       => Marshal.dump(@headers),
+        'body'          => @body,
+        'links'         => links.map(&:to_s),
+        'code'          => @code,
+        'depth'         => @depth,
+        'referer'       => @referer.to_s,
+        'redirect_to'   => @redirect_to.to_s,
+        'response_time' => @response_time,
+        'fetched'       => @fetched,
+        'user_data'     => @user_data.nil? ? {} : @user_data.marshal_dump,
+        'fetched_at'    => @fetched_at,
+        'error'         => @error.to_s
+      }
+    end
+    #
+    # JSON representation of to_hash without nil/empty values. Accepts (and
+    # forwards) the generator arguments so a Page can be nested inside other
+    # structures that are serialized with to_json.
+    #
+    def to_json(*args)
+      th = to_hash.reject { |_k, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
+      th.delete('headers') if content_type.empty?
+      th.to_json(*args)
+    end
+    #
+    # Returns +true+ if page is marked as storeable
+    # +false+ otherwise
+    # Default is +true+
+    #
+    def storable?
+      @storable
+    end
+    #
+    # Returns +true+ when the page was fetched more than *ttl* seconds ago;
+    # pages without a fetch time never expire.
+    #
+    def expired?(ttl)
+      return false if fetched_at.nil?
+      (Time.now.to_i - ttl) > fetched_at
+    end
+    #
+    # Rebuild a page from a hash produced by to_hash.
+    #
+    # NOTE(security): the headers are restored with Marshal.load, which must
+    # only ever be given data from a trusted store.
+    #
+    def self.from_hash(hash)
+      page = new(URI(hash['url']))
+      {
+        '@headers'       => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
+        '@body'          => hash['body'],
+        '@links'         => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
+        '@code'          => hash['code'].to_i,
+        '@depth'         => hash['depth'].to_i,
+        '@referer'       => hash['referer'],
+        '@redirect_to'   => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched'       => hash['fetched'],
+        '@user_data'     => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
+        '@fetched_at'    => hash['fetched_at'],
+        '@error'         => hash['error']
+      }.each do |var, value|
+        page.instance_variable_set(var, value)
+      end
+      page
+    end
+    #
+    # Rebuild a page from the JSON produced by to_json
+    #
+    def self.from_json(json)
+      hash = JSON.parse json
+      from_hash hash
+    end
+  end
+end

data/lib/digger/pattern.rb ADDED Viewed

@@ -0,0 +1,91 @@
+require 'nokogiri'
+module Digger
+  class Pattern
+    attr_accessor :type, :value, :block
+    def initialize(hash = {})
+      hash.each_pair{|key, value| send("#{key}=", value) if %w{type value block}.include?(key.to_s)}
+    end
+    def safe_block
+      block && begin
+        if block.respond_to?(:call)
+          block
+        elsif block.strip == '' #
+          nil
+        else
+          proc{ $SAFE = 2; eval block }.call
+        end
+      rescue StandardError
+        nil
+      end
+    end
+    def self.wrap(hash)
+      Hash[hash.map{|key, value| [key, value.is_a?(Pattern) ? value : Pattern.new(value)]}]
+    end
+    MATCH_MAX = 3
+    TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
+    def regexp?
+      TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
+    end
+    def match_page(page, &callback)
+      blk = callback || safe_block
+      if regexp? # regular expression
+        index = TYPES.index(type)
+        blk ||= ->(text){text.strip}
+        # content is String
+        if type == 'match_many'
+          match = page.body.gsub(value).to_a
+        else
+          matches = page.body.match(value)
+          match = matches.nil? ? nil : matches[index]
+        end
+      else # css expression
+        blk ||= ->(node){node.content.strip}
+        # content is Nokogiri::HTML::Document
+        if type == 'css_one'
+          match = page.doc.css(value).first
+        elsif type == 'css_many' # css_many
+          match = page.doc.css(value)
+        end
+      end
+      if match.nil?
+        nil
+      elsif %w{css_many match_many}.include? type
+        match.map{|node| blk.call(node) }.uniq
+      else
+        blk.call(match)
+      end
+    rescue
+      nil
+    end
+    class Nokogiri::XML::Node
+      %w{one many}.each do |name|
+        define_method "inner_#{name}" do |css, &block|
+          callback = ->(node) do
+            if node
+              (block || ->(n){n.text.strip}).call(node)
+            else
+              nil
+            end
+          end
+          if name == 'one' # inner_one
+            callback.call(self.css(css).first)
+          else # inner_many
+            self.css(css).map{|node| callback.call(node)}
+          end
+        end
+      end
+      def source
+        to_xml
+      end
+    end # nokogiri
+  end
+end

data/lib/digger/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Digger
+  VERSION = "0.0.1" # Gem version string; digger.gemspec reads it as Digger::VERSION.
+end

data/spec/digger_spec.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'digger'
+http = Digger::HTTP.new
+page = http.fetch_page('http://nan.so/')
+pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
+class Item < Digger::Model
+  css_many sites: '.sites>a>span'
+end
+describe Digger do
+  it "http should fetch a page" do
+    expect(page.code).to eq(200)
+  end
+  it "pattern should match content" do
+    sites = pattern.match_page(page)
+    expect(sites.include?('百度网盘')).to eq(true)
+  end
+  it "model should dig content" do
+    item = Item.new.match_page(page)
+    expect(item[:sites].include?('读远')).to be(true)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,114 @@
+--- !ruby/object:Gem::Specification
+name: digger
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- binz
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-03-27 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: http-cookie
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+description: ''
+email:
+- xinkiang@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- digger.gemspec
+- lib/digger.rb
+- lib/digger/http.rb
+- lib/digger/model.rb
+- lib/digger/page.rb
+- lib/digger/pattern.rb
+- lib/digger/version.rb
+- spec/digger_spec.rb
+homepage: ''
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Dig needed structural information from web pages.
+test_files:
+- spec/digger_spec.rb