webpage 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/webpage.rb +121 -276
 - metadata +6 -4
 
    
        data/webpage.rb
    CHANGED
    
    | 
         @@ -2,315 +2,160 @@ 
     | 
|
| 
       2 
2 
     | 
    
         
             
            require 'pp'
         
     | 
| 
       3 
3 
     | 
    
         
             
            require 'mechanize'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'uri'
         
     | 
| 
       5 
     | 
    
         
            -
            class Webpage
         
     | 
| 
       6 
     | 
    
         
            -
                attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
         
     | 
| 
       7 
     | 
    
         
            -
                attr_accessor :ignored_exts
         
     | 
| 
       8 
     | 
    
         
            -
                def initialize(uri)
         
     | 
| 
       9 
     | 
    
         
            -
                    @links = Array.new
         
     | 
| 
       10 
     | 
    
         
            -
                    @relative_paths = Array.new
         
     | 
| 
       11 
     | 
    
         
            -
                    @outbound_links = Array.new
         
     | 
| 
       12 
     | 
    
         
            -
                    @internal_outbound_links = Array.new
         
     | 
| 
       13 
     | 
    
         
            -
                    @external_outbound_links = Array.new
         
     | 
| 
       14 
     | 
    
         
            -
                    @broken_outbound_links = Array.new
         
     | 
| 
       15 
     | 
    
         
            -
                    @external_inbound_links = Array.new
         
     | 
| 
       16 
     | 
    
         
            -
                    @back_links = Array.new
         
     | 
| 
       17 
     | 
    
         
            -
                    @internal_inbound_links = Array.new
         
     | 
| 
       18 
     | 
    
         
            -
                    @external_inbound_links = Array.new
         
     | 
| 
       19 
     | 
    
         
            -
                    @internal_links = Array.new
         
     | 
| 
       20 
     | 
    
         
            -
                    @invalid_links = Array.new
         
     | 
| 
       21 
     | 
    
         
            -
                    @accessed_uri = Array.new
         
     | 
| 
       22 
     | 
    
         
            -
                    @related_uris = Array.new
         
     | 
| 
       23 
     | 
    
         
            -
                    @successful = false
         
     | 
| 
       24 
     | 
    
         
            -
                    begin
         
     | 
| 
       25 
     | 
    
         
            -
                        @uri = URI.parse(uri)
         
     | 
| 
       26 
     | 
    
         
            -
                        raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
         
     | 
| 
       27 
     | 
    
         
            -
                        @domain = Webpage.host_to_domain @uri.host
         
     | 
| 
       28 
     | 
    
         
            -
                        agent = Mechanize.new
         
     | 
| 
       29 
     | 
    
         
            -
                        agent.open_timeout = 3
         
     | 
| 
       30 
     | 
    
         
            -
                        @page = agent.get @uri.to_s
         
     | 
| 
       31 
     | 
    
         
            -
                        raise 'not webpage' unless @page.class == Mechanize::Page
         
     | 
| 
       32 
     | 
    
         
            -
                        @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
         
     | 
| 
       33 
     | 
    
         
            -
                        @successful = true
         
     | 
| 
       34 
     | 
    
         
            -
                    rescue Exception => e
         
     | 
| 
       35 
     | 
    
         
            -
                        warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
         
     | 
| 
       36 
     | 
    
         
            -
                    end
         
     | 
| 
       37 
     | 
    
         
            -
                end
         
     | 
| 
       38 
     | 
    
         
            -
                
         
     | 
| 
       39 
     | 
    
         
            -
                def encoding
         
     | 
| 
       40 
     | 
    
         
            -
                    return @page.encoding
         
     | 
| 
       41 
     | 
    
         
            -
                end
         
     | 
| 
       42 
     | 
    
         
            -
                
         
     | 
| 
       43 
     | 
    
         
            -
                def keywords
         
     | 
| 
       44 
     | 
    
         
            -
                    meta = @page.search("//meta[@name='keywords']").first
         
     | 
| 
       45 
     | 
    
         
            -
                    return meta.attributes["content"].value.split(',') unless meta.nil?
         
     | 
| 
       46 
     | 
    
         
            -
                end
         
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
                def description
         
     | 
| 
       49 
     | 
    
         
            -
                    meta = @page.search("//meta[@name='description']").first
         
     | 
| 
       50 
     | 
    
         
            -
                    if meta.nil?
         
     | 
| 
       51 
     | 
    
         
            -
                        return false
         
     | 
| 
       52 
     | 
    
         
            -
                    end
         
     | 
| 
       53 
     | 
    
         
            -
                    return meta.atrributes['content'].value
         
     | 
| 
       54 
     | 
    
         
            -
                end
         
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
                def body
         
     | 
| 
       57 
     | 
    
         
            -
                    return @page.body
         
     | 
| 
       58 
     | 
    
         
            -
                    #(return @page.body unless @page.body.include?'<html>') if @successful
         
     | 
| 
       59 
     | 
    
         
            -
                    #return String.new
         
     | 
| 
       60 
     | 
    
         
            -
                end
         
     | 
| 
       61 
     | 
    
         
            -
                
         
     | 
| 
       62 
     | 
    
         
            -
                def text
         
     | 
| 
       63 
     | 
    
         
            -
                    return Nokogiri::HTML(body).xpath("//text()").text
         
     | 
| 
       64 
     | 
    
         
            -
                    #return body.gsub(/<\/?[^>]*>/, "")
         
     | 
| 
       65 
     | 
    
         
            -
                end
         
     | 
| 
       66 
     | 
    
         
            -
                
         
     | 
| 
       67 
     | 
    
         
            -
                def title
         
     | 
| 
       68 
     | 
    
         
            -
                    return @page.title unless @page.title.nil?
         
     | 
| 
       69 
     | 
    
         
            -
                    return false
         
     | 
| 
       70 
     | 
    
         
            -
                end
         
     | 
| 
       71 
     | 
    
         
            -
                
         
     | 
| 
       72 
     | 
    
         
            -
                
         
     | 
| 
       73 
     | 
    
         
            -
                #get all links from html content
         
     | 
| 
       74 
     | 
    
         
            -
                #1.$all = get all <a>
         
     | 
| 
       75 
     | 
    
         
            -
                #2.$href = get all href from $all
         
     | 
| 
       76 
     | 
    
         
            -
                #3.make all $href to be absolute path and put to @links
         
     | 
| 
       77 
     | 
    
         
            -
            =begin
         
     | 
| 
       78 
     | 
    
         
            -
                def links
         
     | 
| 
       79 
     | 
    
         
            -
                    return @links unless @links.empty?
         
     | 
| 
       80 
     | 
    
         
            -
                    begin
         
     | 
| 
       81 
     | 
    
         
            -
                        agent = Mechanize.new
         
     | 
| 
       82 
     | 
    
         
            -
                        agent.open_timeout = 5
         
     | 
| 
       83 
     | 
    
         
            -
                        agent.get @uri do |page|
         
     | 
| 
       84 
     | 
    
         
            -
                            page.links.each do |link| #1
         
     | 
| 
       85 
     | 
    
         
            -
                                next if link.href.nil?
         
     | 
| 
       86 
     | 
    
         
            -
                                uri = Webpage.uri_normalize(link.href)
         
     | 
| 
       87 
     | 
    
         
            -
                                begin
         
     | 
| 
       88 
     | 
    
         
            -
                                    @links << @uri.merge(uri).to_s
         
     | 
| 
       89 
     | 
    
         
            -
                                rescue URI::InvalidURIError,URI::InvalidComponentError
         
     | 
| 
       90 
     | 
    
         
            -
                                    warn "ignore\n #{uri} \n #{link.href}"
         
     | 
| 
       91 
     | 
    
         
            -
                                end
         
     | 
| 
       92 
     | 
    
         
            -
                            end
         
     | 
| 
       93 
     | 
    
         
            -
                        end
         
     | 
| 
       94 
     | 
    
         
            -
                    rescue Errno::ETIMEDOUT,Timeout::Error
         
     | 
| 
       95 
     | 
    
         
            -
                        warn "timeout:#{@uri}"
         
     | 
| 
       96 
     | 
    
         
            -
                    rescue NoMethodError => e
         
     | 
| 
       97 
     | 
    
         
            -
                        warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
         
     | 
| 
       98 
     | 
    
         
            -
                    rescue Zlib::GzipFile::Error,Mechanize::Error => e
         
     | 
| 
       99 
     | 
    
         
            -
                        warn "gzip error:#{@uri}.#{e}"
         
     | 
| 
       100 
     | 
    
         
            -
                    rescue Net::HTTP::Persistent::Error
         
     | 
| 
       101 
     | 
    
         
            -
                        warn "network reset:#{@uri}"
         
     | 
| 
       102 
     | 
    
         
            -
                    rescue SocketError =>e
         
     | 
| 
       103 
     | 
    
         
            -
                        warn "#{e}.#{@uri}"
         
     | 
| 
       104 
     | 
    
         
            -
                    end
         
     | 
| 
       105 
     | 
    
         
            -
                    return Array.new if @links.empty?
         
     | 
| 
       106 
     | 
    
         
            -
                    #@links = @links.uniq - @accessed_uri
         
     | 
| 
       107 
     | 
    
         
            -
                    #@accessed_uri += @links
         
     | 
| 
       108 
     | 
    
         
            -
                    @links.uniq!
         
     | 
| 
       109 
     | 
    
         
            -
                    scan_links
         
     | 
| 
       110 
     | 
    
         
            -
                    return @links
         
     | 
| 
       111 
     | 
    
         
            -
                end
         
     | 
| 
       112 
     | 
    
         
            -
            =end
         
     | 
| 
       113 
     | 
    
         
            -
                
         
     | 
| 
       114 
     | 
    
         
            -
                def report
         
     | 
| 
       115 
     | 
    
         
            -
                    scan_links
         
     | 
| 
       116 
     | 
    
         
            -
                    scan_outbound_links
         
     | 
| 
       117 
     | 
    
         
            -
                    scan_inbound_links
         
     | 
| 
       118 
     | 
    
         
            -
                    report = {
         
     | 
| 
       119 
     | 
    
         
            -
                        :internal_links => @internal_links,
         
     | 
| 
       120 
     | 
    
         
            -
                        :internal_outbound_links => @internal_outbound_links,
         
     | 
| 
       121 
     | 
    
         
            -
                        :outbound_links => @outbound_links,
         
     | 
| 
       122 
     | 
    
         
            -
                        :broken_outbound_links => @broken_outbound_links,
         
     | 
| 
       123 
     | 
    
         
            -
                        :external_inbound_links => @external_inbound_links,
         
     | 
| 
       124 
     | 
    
         
            -
                        :internal_inbound_links => @internal_inbound_links,
         
     | 
| 
       125 
     | 
    
         
            -
                        :external_outbound_links => @external_outbound_links,
         
     | 
| 
       126 
     | 
    
         
            -
                        :related_uris => @related_uris,
         
     | 
| 
       127 
     | 
    
         
            -
                        :invalid_links => @invalid_links
         
     | 
| 
       128 
     | 
    
         
            -
                    }
         
     | 
| 
       129 
     | 
    
         
            -
                end
         
     | 
| 
       130 
     | 
    
         
            -
                
         
     | 
| 
       131 
     | 
    
         
            -
                
         
     | 
| 
       132 
     | 
    
         
            -
            =begin
         
     | 
| 
       133 
     | 
    
         
            -
                def external_outbound_links
         
     | 
| 
       134 
     | 
    
         
            -
                    return @external_outbound_links  unless @external_outbound_links.empty?
         
     | 
| 
       135 
     | 
    
         
            -
                    links
         
     | 
| 
       136 
     | 
    
         
            -
                    return @external_outbound_links
         
     | 
| 
       137 
     | 
    
         
            -
                end
         
     | 
| 
       138 
     | 
    
         
            -
             
     | 
| 
       139 
     | 
    
         
            -
                def internal_outbound_links
         
     | 
| 
       140 
     | 
    
         
            -
                    return @internal_outbound_links unless @internal_outbound_links.empty?
         
     | 
| 
       141 
     | 
    
         
            -
                    links
         
     | 
| 
       142 
     | 
    
         
            -
                    return @internal_outbound_links
         
     | 
| 
       143 
     | 
    
         
            -
                end
         
     | 
| 
       144 
     | 
    
         
            -
             
     | 
| 
       145 
     | 
    
         
            -
                def back_links#inbound links among all the outbound links
         
     | 
| 
       146 
     | 
    
         
            -
                    return @back_links unless @back_links.empty?
         
     | 
| 
       147 
     | 
    
         
            -
                    scan_outbound_links
         
     | 
| 
       148 
     | 
    
         
            -
                    return @back_links
         
     | 
| 
       149 
     | 
    
         
            -
                end
         
     | 
| 
       150 
     | 
    
         
            -
                
         
     | 
| 
       151 
     | 
    
         
            -
                def broken_outbound_links
         
     | 
| 
       152 
     | 
    
         
            -
                    return @broken_outbound_links unless @broken_outbound_links.empty?
         
     | 
| 
       153 
     | 
    
         
            -
                    scan_outbound_links
         
     | 
| 
       154 
     | 
    
         
            -
                    return @broken_outbound_links
         
     | 
| 
       155 
     | 
    
         
            -
                end
         
     | 
| 
       156 
     | 
    
         
            -
                
         
     | 
| 
       157 
     | 
    
         
            -
                def external_inbound_links#outter inbound links
         
     | 
| 
       158 
     | 
    
         
            -
                    return @external_inbound_links unless @external_inbound_links.empty?
         
     | 
| 
       159 
     | 
    
         
            -
                    scan_inbound_links
         
     | 
| 
       160 
     | 
    
         
            -
                    return @external_inbound_links
         
     | 
| 
       161 
     | 
    
         
            -
                end
         
     | 
| 
       162 
     | 
    
         
            -
                
         
     | 
| 
       163 
     | 
    
         
            -
                def internal_inbound_links
         
     | 
| 
       164 
     | 
    
         
            -
                    return @internal_inbound_links unless @internal_inbound_links.empty?
         
     | 
| 
       165 
     | 
    
         
            -
                    scan_inbound_links
         
     | 
| 
       166 
     | 
    
         
            -
                    return @internal_inbound_links
         
     | 
| 
       167 
     | 
    
         
            -
                end
         
     | 
| 
       168 
     | 
    
         
            -
            =end
         
     | 
| 
       169 
     | 
    
         
            -
                def pagerank
         
     | 
| 
       170 
     | 
    
         
            -
                    return @pagerank unless @pagerank.nil?
         
     | 
| 
       171 
     | 
    
         
            -
                    require 'page_rankr'
         
     | 
| 
       172 
     | 
    
         
            -
                    @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
         
     | 
| 
       173 
     | 
    
         
            -
                    return @pagerank
         
     | 
| 
       174 
     | 
    
         
            -
                end
         
     | 
| 
       175 
     | 
    
         
            -
             
     | 
| 
       176 
     | 
    
         
            -
                def ppl#pagerank per link
         
     | 
| 
       177 
     | 
    
         
            -
                    pagerank
         
     | 
| 
       178 
     | 
    
         
            -
                    return false if @pagerank.nil?
         
     | 
| 
       179 
     | 
    
         
            -
                    scan_links
         
     | 
| 
       180 
     | 
    
         
            -
                    return (@pagerank / @links.size)
         
     | 
| 
       181 
     | 
    
         
            -
                end
         
     | 
| 
       182 
     | 
    
         
            -
                def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
         
     | 
| 
       183 
     | 
    
         
            -
                    scan_links
         
     | 
| 
       184 
     | 
    
         
            -
                    raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
         
     | 
| 
       185 
     | 
    
         
            -
                    seed_uris.concat(@external_outbound_links - checked_uris)
         
     | 
| 
       186 
     | 
    
         
            -
                    related_keywords.concat(keywords)
         
     | 
| 
       187 
     | 
    
         
            -
                    result = Array.new
         
     | 
| 
       188 
     | 
    
         
            -
                    while seed_uris.size > 0 and result.size < max
         
     | 
| 
       189 
     | 
    
         
            -
                        uri = seed_uris.first
         
     | 
| 
       190 
     | 
    
         
            -
                        checked_uris << uri unless checked_uris.include?uri
         
     | 
| 
       191 
     | 
    
         
            -
                        seed_uris.delete(uri)
         
     | 
| 
       192 
     | 
    
         
            -
                        w = Webpage.new uri
         
     | 
| 
       193 
     | 
    
         
            -
                        next unless w.successful
         
     | 
| 
       194 
     | 
    
         
            -
                        text = w.body + w.title
         
     | 
| 
       195 
     | 
    
         
            -
                        related_keywords.each do |word|
         
     | 
| 
       196 
     | 
    
         
            -
                            if text.include?word
         
     | 
| 
       197 
     | 
    
         
            -
                                #result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
         
     | 
| 
       198 
     | 
    
         
            -
                                domain = Webpage.host_to_domain(URI.parse(uri).host)
         
     | 
| 
       199 
     | 
    
         
            -
                                result << domain unless result.include? domain
         
     | 
| 
       200 
     | 
    
         
            -
                                seed_uris.concat(w.external_outbound_links - checked_uris)
         
     | 
| 
       201 
     | 
    
         
            -
                                break
         
     | 
| 
       202 
     | 
    
         
            -
                            end
         
     | 
| 
       203 
     | 
    
         
            -
                        end
         
     | 
| 
       204 
     | 
    
         
            -
                    end
         
     | 
| 
       205 
     | 
    
         
            -
                    return result
         
     | 
| 
       206 
     | 
    
         
            -
                end
         
     | 
| 
       207 
     | 
    
         
            -
                
         
     | 
| 
       208 
     | 
    
         
            -
                def link_to(target_uri)
         
     | 
| 
       209 
     | 
    
         
            -
                    scan_links
         
     | 
| 
       210 
     | 
    
         
            -
                    target_uri = Webpage.uri_normalize(target_uri)
         
     | 
| 
       211 
     | 
    
         
            -
                    target_host = URI.parse(target_uri).host
         
     | 
| 
       212 
     | 
    
         
            -
                    target_domain = Webpage.host_to_domain(target_host)
         
     | 
| 
       213 
     | 
    
         
            -
                    type = 0 #not link to 
         
     | 
| 
       214 
     | 
    
         
            -
                    @links.each do |link|
         
     | 
| 
       215 
     | 
    
         
            -
                        candidate_host = URI.parse(link).host
         
     | 
| 
       216 
     | 
    
         
            -
                        if link == target_uri
         
     | 
| 
       217 
     | 
    
         
            -
                            type = 3 #definitely link to
         
     | 
| 
       218 
     | 
    
         
            -
                            break
         
     | 
| 
       219 
     | 
    
         
            -
                        elsif  URI.parse(link).host == target_host
         
     | 
| 
       220 
     | 
    
         
            -
                           type = 2 if type < 2 #link to the host
         
     | 
| 
       221 
     | 
    
         
            -
                        elsif Webpage.host_to_domain(candidate_host) == target_domain
         
     | 
| 
       222 
     | 
    
         
            -
                           type = 1 if type < 1 #link to the root domain
         
     | 
| 
       223 
     | 
    
         
            -
                        end
         
     | 
| 
       224 
     | 
    
         
            -
                    end
         
     | 
| 
       225 
     | 
    
         
            -
                    return type
         
     | 
| 
       226 
     | 
    
         
            -
                end
         
     | 
| 
       227 
5 
     | 
    
         | 
| 
      
 6 
     | 
    
         
            +
            # Stateless helpers for cleaning up URIs and host names.
            # Every method is a class method; the class is never instantiated here.
            class WebHelper
                # Characters that uri_encode percent-encodes: everything except the
                # RFC 2396 "unreserved" set (alphanumerics and -_.!~*'()) and the
                # URI delimiters # : / ? % & =
                UNSAFE_URI_CHARS = %r{[^\-_.!~*'()a-zA-Z\d#:/?%&=]}

                # Normalizes a URI string and drops its fragment.
                #
                # uri     - String accepted by URI.parse.
                # Returns the normalized URI as a String without any "#fragment" part.
                # Raises URI::InvalidURIError when the string cannot be parsed.
                def self.uri_normalize(uri)
                    normalized = URI.parse(uri).normalize
                    # Clear the fragment on the parsed object instead of building a
                    # regex from it: fragments may contain regex metacharacters
                    # ("+", "(", "." ...) which made the old pattern miss or raise.
                    # `normalize` returns a copy, so this mutation is local.
                    normalized.fragment = nil
                    normalized.to_s
                end

                # Extracts the last two dot-separated labels of a host name.
                # Note this is purely textual: "www.example.co.uk" yields "co.uk".
                #
                # host    - host name String, or nil (URI#host is nil for relative URIs).
                # Returns the matched String, or false when host is nil or has no
                # "label.label" suffix (e.g. "localhost").
                def self.host_to_domain(host)
                    return false if host.nil?
                    domain = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
                    return false if domain.nil?
                    domain[1]
                end

                # Percent-encodes every character of str that is neither unreserved
                # nor one of the delimiters # : / ? % & =
                #
                # URI.encode/URI.escape no longer exist on Ruby 3.0+, so the escaping
                # is done here byte by byte, producing the same "%XX" (upper-case hex)
                # output and the same US-ASCII result encoding.
                #
                # str     - String to encode.
                # Returns a new US-ASCII String.
                def self.uri_encode(str)
                    encoded = str.gsub(UNSAFE_URI_CHARS) do |char|
                        escaped = ''
                        char.each_byte { |byte| escaped << format('%%%02X', byte) }
                        escaped
                    end
                    encoded.force_encoding(Encoding::US_ASCII)
                end
            end
         
     | 
| 
       245 
25 
     | 
    
         | 
| 
       246 
     | 
    
         
            -
             
     | 
| 
       247 
     | 
    
         
            -
             
     | 
| 
       248 
     | 
    
         
            -
             
     | 
| 
       249 
     | 
    
         
            -
             
     | 
| 
       250 
     | 
    
         
            -
             
     | 
| 
       251 
     | 
    
         
            -
             
     | 
| 
       252 
     | 
    
         
            -
             
     | 
| 
       253 
     | 
    
         
            -
             
     | 
| 
       254 
     | 
    
         
            -
             
     | 
| 
      
 26 
     | 
    
         
            +
            class Mechanize::Page
         
     | 
| 
      
 27 
     | 
    
         
            +
                #@invalid_links = Hash.new
         
     | 
| 
      
 28 
     | 
    
         
            +
                attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
         
     | 
| 
      
 29 
     | 
    
         
            +
                public
         
     | 
| 
      
 30 
     | 
    
         
            +
                def text
         
     | 
| 
      
 31 
     | 
    
         
            +
                    return Nokogiri::HTML(body).xpath("//text()").text
         
     | 
| 
      
 32 
     | 
    
         
            +
                    #return body.gsub(/<\/?[^>]*>/, "")
         
     | 
| 
      
 33 
     | 
    
         
            +
                end
         
     | 
| 
      
 34 
     | 
    
         
            +
                def keywords
         
     | 
| 
      
 35 
     | 
    
         
            +
                    meta = search("//meta[@name='keywords']").first
         
     | 
| 
      
 36 
     | 
    
         
            +
                    return meta.attributes["content"].value.split(',') unless meta.nil?
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                # Description declared in the page's <meta name="description"> tag.
                #
                # @return [String, false] the tag's content attribute, or false when the
                #   page has no such tag or the tag carries no content attribute
                def description
                    meta = search("//meta[@name='description']").first
                    # Node#[] yields nil for a missing attribute, whereas
                    # attributes['content'].value raised NoMethodError on such tags.
                    content = meta && meta['content']
                    return false if content.nil?
                    content
                end
         
     | 
| 
       259 
46 
     | 
    
         | 
| 
       260 
     | 
    
         
            -
                def  
     | 
| 
       261 
     | 
    
         
            -
                     
     | 
| 
       262 
     | 
    
         
            -
             
     | 
| 
       263 
     | 
    
         
            -
             
     | 
| 
       264 
     | 
    
         
            -
                            @invalid_links << outlink
         
     | 
| 
       265 
     | 
    
         
            -
                            next
         
     | 
| 
       266 
     | 
    
         
            -
                        end
         
     | 
| 
       267 
     | 
    
         
            -
                        next if w.links.nil?
         
     | 
| 
       268 
     | 
    
         
            -
                        w.links.each do |uri|
         
     | 
| 
       269 
     | 
    
         
            -
                            #uri = URI.parse(uri)
         
     | 
| 
       270 
     | 
    
         
            -
                            #next if uri.host.nil?
         
     | 
| 
       271 
     | 
    
         
            -
                            if Webpage.host_to_domain(uri) == @domain
         
     | 
| 
       272 
     | 
    
         
            -
                                @back_links << uri.to_s
         
     | 
| 
       273 
     | 
    
         
            -
                            else
         
     | 
| 
       274 
     | 
    
         
            -
                                @broken_outbound_links << uri.to_s
         
     | 
| 
       275 
     | 
    
         
            -
                            end
         
     | 
| 
       276 
     | 
    
         
            -
                        end
         
     | 
| 
       277 
     | 
    
         
            -
                    end
         
     | 
| 
       278 
     | 
    
         
            -
                    @back_links.uniq!
         
     | 
| 
       279 
     | 
    
         
            -
                    @broken_outbound_links.uniq!
         
     | 
| 
      
 47 
     | 
    
         
            +
                # Google rank of this page's URI as reported by the page_rankr gem.
                #
                # The value is looked up once and cached in @pagerank; later calls reuse
                # it instead of repeating the lookup. A nil/false result is not cached,
                # so it is retried on the next call.
                #
                # @return [Object] whatever PageRankr reports under the :google key
                def pagerank
                    return @pagerank if @pagerank
                    # Loaded lazily so the gem is only needed when a rank is requested.
                    require 'page_rankr'
                    @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
                end
         
     | 
| 
       281 
52 
     | 
    
         | 
| 
       282 
53 
     | 
    
         
             
                # Sorts every link of the page into the link collections.
                #
                # Buckets filled (all reset on each call):
                # * @invalid_links           - unparsable URIs, non-http(s) schemes, and
                #                              hrefs ending in an ignored file extension
                # * @nofollowed_links        - links whose rel contains "nofollow"
                # * @internal_outbound_links - relative links, root-relative links, and
                #                              links whose #domain equals the page's
                # * @external_outbound_links - every other followable link
                # * @valid_links             - all internal and external links
                # * @outbound_links          - internal followed by external links
                #
                # Links without any URI only produce a warning and land in no bucket.
                #
                # @return [true] also stored in @scanned
                def scan_links
                    @external_outbound_links = []
                    @internal_outbound_links = []
                    @valid_links = []
                    @invalid_links = []
                    @nofollowed_links = []
                    # Decided from the href alone, before anything is downloaded.
                    exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf)
                    links.each do |link|
                        # Preliminary parse: an href that cannot be parsed is invalid,
                        # not a reason to abort the whole scan.
                        begin
                            uri = link.uri
                        rescue URI::InvalidURIError
                            @invalid_links << link
                            next
                        end
                        # Ignore non-http requests (mailto:, javascript:, ftp:, ...).
                        if uri.respond_to?(:scheme) && !uri.scheme.nil? && uri.scheme != 'http' && uri.scheme != 'https'
                            @invalid_links << link
                            next
                        end
                        # Ignore files that are not web pages.
                        if !link.href.nil? && exts_to_ignored.include?(link.href[-4, 4])
                            @invalid_links << link
                            next
                        end
                        # nofollow links
                        if link.rel.include?('nofollow')
                            @nofollowed_links << link
                            next
                        end
                        # Only applies to link objects that expose #fragment.
                        if link.respond_to?(:fragment) && link.fragment.empty?
                            @invalid_links << link
                            next
                        end
                        if uri.nil?
                            warn "warning: host nil #{uri}"
                            next
                        end
                        # Relative paths point back into the same site. They are valid
                        # links, so they are no longer recorded in @invalid_links too.
                        if uri.relative?
                            @internal_outbound_links << link unless uri == @uri
                        elsif uri.to_s.start_with?('/') || @uri.merge(uri).domain == @uri.domain
                            @internal_outbound_links << link
                        else
                            @external_outbound_links << link
                        end
                        @valid_links << link
                    end
                    @outbound_links = @internal_outbound_links + @external_outbound_links
                    @scanned = true
                end
         
     | 
| 
      
 112 
     | 
    
         
            +
            end
         
     | 
| 
      
 113 
     | 
    
         
            +
            class URI::Generic
         
     | 
| 
      
 114 
     | 
    
         
            +
                # Whether this URI counts as absolute: true when it has a scheme or
                # when its path begins with "/", false otherwise.
                #
                # NOTE(review): this replaces URI::Generic's own predicate for the whole
                # process, so root-relative references such as "/about" are reported as
                # absolute - confirm nothing else relies on the stock behaviour.
                #
                # @return [Boolean]
                def absolute?()
                    return true if @scheme
                    path.start_with?('/')
                end
         
     | 
| 
      
 121 
     | 
    
         
            +
                # Last two dot-separated labels of the host, e.g. "example.com" for
                # "www.example.com".
                #
                # Only the final two labels are kept, so a host under a multi-label
                # suffix such as "shop.example.co.uk" yields "co.uk".
                #
                # @return [String, nil] the two-label domain, or nil when the URI has no
                #   host (relative or opaque URIs) or the host has fewer than two labels
                def domain
                    # Relative and opaque URIs carry no host; calling #match on nil
                    # used to raise NoMethodError here.
                    return nil if host.nil?
                    labels = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
                    labels && labels[1]
                end
         
     | 
| 
      
 126 
     | 
    
         
            +
            =begin
         
     | 
| 
      
 127 
     | 
    
         
            +
                def normalize!
         
     | 
| 
      
 128 
     | 
    
         
            +
                    if path && path == ''
         
     | 
| 
      
 129 
     | 
    
         
            +
                        set_path('/')
         
     | 
| 
      
 130 
     | 
    
         
            +
                    end
         
     | 
| 
      
 131 
     | 
    
         
            +
                    if scheme && scheme != scheme.downcase
         
     | 
| 
      
 132 
     | 
    
         
            +
                        set_scheme(self.scheme.downcase)
         
     | 
| 
       313 
133 
     | 
    
         
             
                    end
         
     | 
| 
       314 
     | 
    
         
            -
                     
     | 
| 
      
 134 
     | 
    
         
            +
                    if host && host != host.downcase
         
     | 
| 
      
 135 
     | 
    
         
            +
                        set_host(self.host.downcase)
         
     | 
| 
      
 136 
     | 
    
         
            +
                    end
         
     | 
| 
      
 137 
     | 
    
         
            +
                    set_fragment(nil) unless fragment.nil?
         
     | 
| 
       315 
138 
     | 
    
         
             
                end
         
     | 
| 
      
 139 
     | 
    
         
            +
            =end
         
     | 
| 
       316 
140 
     | 
    
         
             
            end
         
     | 
| 
      
 141 
     | 
    
         
            +
            =begin
         
     | 
| 
      
 142 
     | 
    
         
            +
            class URI::Parser
         
     | 
| 
      
 143 
     | 
    
         
            +
                def parse(uri)
         
     | 
| 
      
 144 
     | 
    
         
            +
                    scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
         
     | 
| 
      
 145 
     | 
    
         
            +
             
     | 
| 
      
 146 
     | 
    
         
            +
                    if scheme && URI.scheme_list.include?(scheme.upcase)
         
     | 
| 
      
 147 
     | 
    
         
            +
                        URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
         
     | 
| 
      
 148 
     | 
    
         
            +
                    else
         
     | 
| 
      
 149 
     | 
    
         
            +
                        URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
         
     | 
| 
      
 150 
     | 
    
         
            +
                    end
         
     | 
| 
      
 151 
     | 
    
         
            +
                end
         
     | 
| 
      
 152 
     | 
    
         
            +
            end
         
     | 
| 
      
 153 
     | 
    
         
            +
            a = Mechanize.new
         
     | 
| 
      
 154 
     | 
    
         
            +
            w = a.get('http://dict.youdao.com/w/abc/')
         
     | 
| 
      
 155 
     | 
    
         
            +
            w.scan_links
         
     | 
| 
      
 156 
     | 
    
         
            +
            pp w.internal_outbound_links
         
     | 
| 
      
 157 
     | 
    
         
            +
            exit
         
     | 
| 
      
 158 
     | 
    
         
            +
            w.links.each do |link|
         
     | 
| 
      
 159 
     | 
    
         
            +
                puts link.rel
         
     | 
| 
      
 160 
     | 
    
         
            +
            end
         
     | 
| 
      
 161 
     | 
    
         
            +
            =end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: webpage
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.5
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -11,14 +11,15 @@ bindir: bin 
     | 
|
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
12 
     | 
    
         
             
            date: 2012-04-11 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       14 
     | 
    
         
            -
            description: to show seo oriented reports of the webpage,newbie's 
     | 
| 
      
 14 
     | 
    
         
            +
            description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
         
     | 
| 
      
 15 
     | 
    
         
            +
              work, careful
         
     | 
| 
       15 
16 
     | 
    
         
             
            email: seoaqua@qq.com
         
     | 
| 
       16 
17 
     | 
    
         
             
            executables: []
         
     | 
| 
       17 
18 
     | 
    
         
             
            extensions: []
         
     | 
| 
       18 
19 
     | 
    
         
             
            extra_rdoc_files: []
         
     | 
| 
       19 
20 
     | 
    
         
             
            files:
         
     | 
| 
       20 
21 
     | 
    
         
             
            - webpage.rb
         
     | 
| 
       21 
     | 
    
         
            -
            homepage: http:// 
     | 
| 
      
 22 
     | 
    
         
            +
            homepage: http://github.com/seoaqua/ruby-webpage
         
     | 
| 
       22 
23 
     | 
    
         
             
            licenses: []
         
     | 
| 
       23 
24 
     | 
    
         
             
            post_install_message: 
         
     | 
| 
       24 
25 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
         @@ -41,5 +42,6 @@ rubyforge_project: 
     | 
|
| 
       41 
42 
     | 
    
         
             
            rubygems_version: 1.8.21
         
     | 
| 
       42 
43 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       43 
44 
     | 
    
         
             
            specification_version: 3
         
     | 
| 
       44 
     | 
    
         
            -
            summary: to show seo oriented reports of the webpage,newbie's 
     | 
| 
      
 45 
     | 
    
         
            +
            summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
         
     | 
| 
      
 46 
     | 
    
         
            +
              work, careful
         
     | 
| 
       45 
47 
     | 
    
         
             
            test_files: []
         
     |