RubyGems - webpage - Versions diffs - 0.0.1 → 0.0.2 - Mend

webpage 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

data/webpage.rb ADDED Viewed

@@ -0,0 +1,128 @@
+#coding:UTF-8
+require 'pp'
+require 'mechanize'
+require 'uri'
+# Fetches a single web page with Mechanize and sorts the links found on it
+# into internal / outbound / inbound groups relative to the page's own domain.
+#
+# All link caches are plain arrays; an empty cache is treated as "not computed
+# yet", so a page that yields no links is fetched again on every call.
+class Webpage
+    # Parses +uri+ and prepares the empty link caches. No network access here.
+    #
+    # @param uri [String] URL of the page to analyse
+    # @raise [URI::InvalidURIError] if the escaped string is not a valid URI
+    # @raise [NoMethodError] if the parsed URI has no host
+    def initialize(uri)
+        @uri = URI.parse(encode(uri))
+        @outbound_links = Array.new
+        @outter_inbound_links = Array.new
+        @inbound_links = Array.new
+        @internal_links = Array.new
+        @links = Array.new
+        @uri_dirname = File.dirname(@uri.path)
+        @uri_domain = host_to_domain @uri.host
+        @accessed_uri = Array.new
+    end
+
+    # Percent-escapes every character of +str+ that is neither an RFC 2396
+    # unreserved character nor one of <tt># : / ? % & =</tt>.
+    #
+    # Uses URI::DEFAULT_PARSER.escape because URI.encode was removed in
+    # Ruby 3.0; on older rubies URI.encode delegated to this same method.
+    #
+    # @param str [String] raw URL text
+    # @return [String] escaped copy of +str+
+    def encode(str)
+        unsafe = Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]")
+        return URI::DEFAULT_PARSER.escape(str, unsafe)
+    end
+
+    # Reduces a host name to its last two dot-separated labels
+    # (e.g. "auto.163.com" -> "163.com").
+    #
+    # Hosts with fewer than two labels (e.g. "localhost") are returned
+    # unchanged instead of raising on a failed match.
+    # NOTE(review): two-label public suffixes such as "co.uk" are not handled.
+    #
+    # @param host [String] host component of a URI
+    # @return [String] the last two labels of +host+, or +host+ itself
+    # @raise [NoMethodError] if +host+ is nil
+    def host_to_domain(host)
+        matched = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
+        return matched ? matched[1] : host
+    end
+
+    # Returns every link found on the page as an absolute URL string.
+    #
+    # 1. fetch the page and walk all of its links
+    # 2. escape each href and resolve it against the page URI
+    # 3. de-duplicate, then file each link under internal or outbound
+    #
+    # Network and parse failures are reported with +warn+ and leave the
+    # result empty rather than raising.
+    #
+    # @return [Array<String>] absolute URLs; empty when nothing could be read
+    def links
+        return @links unless @links.empty?
+        begin
+            agent = Mechanize.new
+            agent.open_timeout = 5
+            agent.get @uri do |page|
+                page.links.each do |link|
+                    next if link.href.nil?
+                    uri = encode(link.href.strip)
+                    begin
+                        @links << @uri.merge(uri).to_s
+                    rescue URI::InvalidURIError,URI::InvalidComponentError
+                        warn "ignore\n #{uri} \n #{link.href}"
+                    end
+                end
+            end
+        rescue Errno::ETIMEDOUT,Timeout::Error
+            warn "timeout:#{@uri}"
+        rescue NoMethodError => e
+            # NOTE(review): this also hides any unrelated NoMethodError raised
+            # while the page is being processed.
+            warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
+        rescue Zlib::GzipFile::Error,Mechanize::Error => e
+            warn "gzip error:#{@uri}.#{e}"
+        rescue Net::HTTP::Persistent::Error
+            warn "network reset:#{@uri}"
+        rescue SocketError =>e
+            warn "#{e}.#{@uri}"
+        end
+        return Array.new if @links.empty?
+        @links = @links.uniq - @accessed_uri
+        @accessed_uri += @links
+        @links.each do |a|
+            uri = URI.parse(encode(a))
+            next if uri.host.nil?
+            # NOTE(review): plain suffix match, so "notexample.com" counts as
+            # internal to "example.com"; outter_inbound_links relies on the
+            # same check in inbound_links, so it is left unchanged here.
+            if uri.host.end_with?(@uri_domain)
+                @internal_links << a
+            else
+                @outbound_links << a
+            end
+        end
+        return @links
+    end
+
+    # @return [Array<String>] links whose host ends with this page's domain
+    def internal_links
+        return @internal_links if links
+        return false
+    end
+
+    # @return [Array<String>] links whose host does not end with this page's domain
+    def outbound_links
+        return @outbound_links if links
+        return false
+    end
+
+    # Fetches every outbound link's page and collects the http(s) links on it
+    # whose host ends with this page's domain. Issues one request per
+    # outbound link.
+    #
+    # The cached list is de-duplicated in place so that repeated calls and
+    # outter_inbound_links see the same unique list as the first call.
+    #
+    # @return [Array<String>] unique URLs pointing back at this page's domain
+    def inbound_links
+        return @inbound_links unless @inbound_links.empty?
+        outbound_links.each do |outlink|
+            begin
+                w = Webpage.new(outlink)
+            rescue URI::InvalidURIError
+                warn "bad uri:#{outlink}"
+                next
+            end
+            w.links.each do |href|
+                next unless href.start_with?('http')
+                begin
+                    parsed = URI.parse(encode(href))
+                    next if parsed.host.nil?
+                    @inbound_links << parsed.to_s if parsed.host.end_with?(@uri_domain)
+                rescue URI::InvalidURIError
+                    warn "bad uri:#{href}"
+                end
+            end
+        end
+        # uniq! returns nil when nothing was removed, so do not chain on it.
+        @inbound_links.uniq!
+        return @inbound_links
+    end
+
+    # Returns the inbound links whose two-label domain (per host_to_domain)
+    # differs from this page's domain.
+    # NOTE(review): inbound_links only keeps hosts that end with the page's
+    # domain, so this appears to match only look-alike hosts such as
+    # "notexample.com" for "example.com" -- confirm the intended meaning.
+    #
+    # @return [Array<String>]
+    def outter_inbound_links
+        return @outter_inbound_links unless @outter_inbound_links.empty?
+        inbound_links.each do |inlink|
+            inlink = URI.parse inlink
+            @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
+        end
+        return @outter_inbound_links
+    end
+
+    # Not implemented yet; always returns nil.
+    # Intended result per the original note: links that are both inbound and outbound.
+    def friend_links
+    end
+
+    # Looks up the page's Google rank through the PageRankr gem and caches it.
+    # The gem is loaded lazily so the rest of the class works without it.
+    # NOTE(review): confirm that 'PageRankr' is the correct require name for
+    # the installed gem.
+    #
+    # @return [Object] whatever PageRankr.ranks returns for this URI
+    def pagerank
+        return @pagerank if @pagerank
+        require 'PageRankr'
+        @pagerank = PageRankr.ranks(@uri.to_s, :google)
+        return @pagerank
+    end
+
+    # Page rank divided by the number of links on the page.
+    # Goes through #pagerank so the rank is fetched on first use instead of
+    # dividing an unset value.
+    # NOTE(review): assumes #pagerank returns a number -- verify against the
+    # PageRankr version in use. Raises ZeroDivisionError for an integer rank
+    # when the page has no links.
+    def ppl
+        return (pagerank / links.count)
+    end
+end
+w = Webpage.new('http://auto.163.com')
+puts w.outter_inbound_links

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: webpage
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -9,16 +9,16 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-11 00:00:00.000000000 Z
+date: 2012-04-11 00:00:00.000000000 Z
 dependencies: []
-description: to show a report of the webpage
+description: to show seo oriented reports of the webpage,newbie's work, careful
 email: seoaqua@qq.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/webpage.rb
-homepage: http://seoaqua.com/
+- webpage.rb
+homepage: http://seoaqua.com
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.11
+rubygems_version: 1.8.21
 signing_key:
 specification_version: 3
-summary: to show a report of the webpage
+summary: to show seo oriented reports of the webpage,newbie's work, careful
 test_files: []

data/lib/webpage.rb DELETED Viewed

File without changes