RubyGems - webpage - Versions diffs - 0.0.1 → 0.0.2 - Mend

webpage 0.0.1 → 0.0.2

Files changed (3) hide show

data/webpage.rb ADDED Viewed

@@ -0,0 +1,128 @@
+#coding:UTF-8
+require 'pp'
+require 'mechanize'
+require 'uri'
+class Webpage
+    attr_reader:links
+    def initialize(uri)
+        @uri = URI.parse(encode(uri))
+        @outbound_links = Array.new
+        @outter_inbound_links = Array.new
+        @inbound_links = Array.new
+        @internal_links = Array.new
+        @links = Array.new
+        @uri_dirname = File.dirname(@uri.path)
+        @uri_domain = host_to_domain @uri.host
+        @accessed_uri = Array.new
+    end
+    def encode(str)
+        return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
+    end
+    def host_to_domain(host)
+        return (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)[1]
+    end
+    #get all links from html content
+    #1.$all = get all <a>
+    #2.$href = get all href from $all
+    #3.make all $href to be absolute path and put to @links
+    def links
+        return @links unless @links.empty?
+        begin
+            agent = Mechanize.new
+            agent.open_timeout = 5
+            agent.get @uri do |page|
+                page.links.each do |link| #1
+                    next if link.href.nil?
+                    uri = encode(link.href.strip)
+                    begin
+                        @links << @uri.merge(uri).to_s
+                    rescue URI::InvalidURIError,URI::InvalidComponentError
+                        warn "ignore\n #{uri} \n #{link.href}"
+                    end
+                end
+            end
+        rescue Errno::ETIMEDOUT,Timeout::Error
+            warn "timeout:#{@uri}"
+        rescue NoMethodError => e
+            warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
+        rescue Zlib::GzipFile::Error,Mechanize::Error => e
+            warn "gzip error:#{@uri}.#{e}"
+        rescue Net::HTTP::Persistent::Error
+            warn "network reset:#{@uri}"
+        rescue SocketError =>e
+            warn "#{e}.#{@uri}"
+        end
+        return Array.new if @links.empty?
+        @links = @links.uniq - @accessed_uri
+        @accessed_uri += @links
+        @links.each do |a|
+            uri = URI.parse(encode(a))
+            next if uri.host.nil?
+            if uri.host.end_with?@uri_domain
+                @internal_links << a
+            else
+                @outbound_links << a
+            end
+        end
+        return @links
+    end
+    def internal_links
+        return @internal_links if links
+        return false
+    end
+    def outbound_links
+        return @outbound_links if links
+        return false
+    end
+    def inbound_links
+        return @inbound_links unless @inbound_links.empty?
+        outbound_links.each do |outlink|
+            begin
+                w = Webpage.new(outlink)
+            rescue URI::InvalidURIError
+                warn "bad uri:#{outlink}"
+                next
+            end
+            w.links.each do |uri|
+                next unless uri.start_with?'http'
+                begin
+                    uri = URI.parse(encode(uri))
+                    next if uri.host.nil?
+                    @inbound_links << uri.to_s if uri.host.end_with?@uri_domain
+                rescue URI::InvalidURIError
+                    warn "bad uri:#{uri}"
+                end
+            end
+        end
+        return @inbound_links.uniq
+    end
+    def outter_inbound_links
+        return @outter_inbound_links unless @outter_inbound_links.empty?
+        inbound_links.each do |inlink|
+            inlink = URI.parse inlink
+            @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
+        end
+        return @outter_inbound_links
+    end
+    def friend_links#inbound && outbound
+    end
+    def pagerank
+        return @pagerank unless @pagerank
+        require 'PageRankr'
+        @pagerank = PageRankr.ranks(@uri.to_s, :google)
+        return @pagerank
+    end
+    def ppl#pagerank_per_link
+        return (@pagerank / links.count)
+    end
+end
+w = Webpage.new('http://auto.163.com')
+puts w.outter_inbound_links

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: webpage
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -9,16 +9,16 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-11 00:00:00.000000000 Z
+date: 2012-04-11 00:00:00.000000000 Z
 dependencies: []
-description: to show a report of the webpage
+description: to show seo oriented reports of the webpage,newbie's work, careful
 email: seoaqua@qq.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/webpage.rb
-homepage: http://seoaqua.com/
+- webpage.rb
+homepage: http://seoaqua.com
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.11
+rubygems_version: 1.8.21
 signing_key:
 specification_version: 3
-summary: to show a report of the webpage
+summary: to show seo oriented reports of the webpage,newbie's work, careful
 test_files: []

data/lib/webpage.rb DELETED Viewed

File without changes