RubyGems - webpage - Versions diffs - 0.0.5 → 0.0.6 - Mend

webpage 0.0.5 → 0.0.6

Files changed (8) hide show

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --format progress

data/lib/webpage/common.rb ADDED Viewed

@@ -0,0 +1,14 @@
+def fuzzy_uri(uri)
+    begin
+        URI(uri).normalize
+    rescue URI::InvalidURIError
+        URI(URI.encode(uri)).normalize
+    end
+end
+class Mechanize::Page::Link
+    def uri
+        @uri ||= if @href then
+            fuzzy_uri(@href)
+        end
+    end
+end

data/lib/webpage.rb ADDED Viewed

@@ -0,0 +1,49 @@
+#coding:UTF-8
+require 'mechanize'
+require 'webpage/common'
+class Webpage
+    def initialize(body,options={})
+        raise ArgumentError 'body cannot be empty' unless body
+        @body = body
+        @options = options
+        @body = @body.force_encoding(@options[:encoding]).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "") if @options.has_key?:encoding
+        @nokogiri = Nokogiri::HTML(@body)
+    end
+    def text
+        return @nokogiri.xpath("//text()").text
+        #return body.gsub(/<\/?[^>]*>/, "")
+    end
+    def keywords
+        @keywords ||= @nokogiri.xpath("//meta[@name='keywords']").map{|meta|meta['content']}.flatten.join.split(',')
+        #content = meta.attributes["content"].value unless meta.nil?
+        #return content.split(',') unless content.nil?
+    end
+    def description
+        @description ||= @nokogiri.xpath("//meta[@name='description']").map{|meta|meta['content']}.flatten.join
+    end
+    def links
+        @links ||= %w(a area).map do |tag|
+            @nokogiri.xpath("//#{tag}")
+        end.flatten
+    end
+    def link_to?(target_uri)
+        links.any?{|link|make_href_absolute(link['href']) == target_uri}
+    end
+    def link_to_host?(host)
+        links.any?{|link|fuzzy_uri(link['uri'].to_s).host == host}
+    end
+    private
+    def make_href_absolute(href)
+        href = fuzzy_uri(href.to_s)
+        return href.to_s if href.absolute?
+        raise 'need :basepath in options when initialize' unless @options.has_key?:basepath
+        basepath = fuzzy_uri(@options[:basepath])
+        raise 'basepath should be absolute' unless basepath.absolute?
+        URI.join(basepath,href)
+    end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  # NOTE(review): going by the setting's name, this lets a bare symbol act as
+  # a metadata key whose value is true - confirm against the RSpec docs.
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  # NOTE(review): presumably runs the whole suite when the filter below
+  # matches nothing - confirm.
+  config.run_all_when_everything_filtered = true
+  # NOTE(review): presumably restricts a run to examples tagged :focus.
+  config.filter_run :focus
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end

data/spec/webpage_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#coding:UTF-8
+require 'webpage'
+# Address of the page the examples run against.
+uri = 'http://www.hudong.com/'
+# Link target and host name handed to link_to? and link_to_host?.
+to_uri = 'http://123.hudong.com/'
+to_host = 'hudong.com'
+# Runs once, when this file is loaded. Presumably a live HTTP request, so the
+# examples depend on what the site serves at that moment.
+page = Mechanize.new.get uri
+# Rebind page to the Webpage wrapper; uri doubles as the :basepath option.
+page = Webpage.new(page.body,{:basepath=>uri})
+describe Webpage do
+    it "text should be String" do
+        page.text.class.should == String
+    end
+    it "links should be an array" do
+        page.links.class.should == Array
+    end
+    it "links' elements should be Webpage::Link" do
+        page.links.each do |link|
+            link.class.should == Nokogiri::XML::Element
+        end
+    end
+    it "description should be text" do
+        page.description.class.should == String
+    end
+    it "keywords should be array" do
+        page.keywords.class.should == Array
+    end
+    it "keywords's values should be strings" do
+        page.keywords.each do |keyword|
+            keyword.class.should == String
+        end
+    end
+    it "link_to? should return bool" do
+        [TrueClass,FalseClass].should include page.link_to?(to_uri).class
+    end
+    it "link_to_host? should return bool" do
+        [TrueClass,FalseClass].should include page.link_to_host?(to_host).class
+    end
+end
+describe "the instance webpage" do
+    it "text should be big enought" do
+        page.text.size.should > 500
+    end
+    it "should has correct description " do
+        page.description.should == '互动百科是基于中文维基技术(维客,wiki百科)的网络百科全书,是全球最大中文百科网及百科全书。互动百科中文网,助您轻松百科探秘'
+    end
+    it "should has 7 keywords" do
+        page.keywords.size.should == 9
+    end
+    it "should link_to #{to_uri}" do
+        page.link_to?(to_uri).should be_true
+    end
+end

data/webpage.gemspec ADDED Viewed

@@ -0,0 +1,12 @@
+Gem::Specification.new do |s|
+s.name 			= %q{webpage}
+s.version 		= '0.0.6'
+s.authors 		= ["seoaqua"]
+s.date 			= %q{2012-07-29}
+s.description 	= %q{a tool to extract some basic data from a webpage}
+s.email 		= %q{seoaqua@qq.com}
+s.files			= `git ls-files`.split("\n")
+s.homepage 		= %q{https://github.com/seoaqua/webpage}
+s.summary = s.description
+s.add_development_dependency 'mechanize'
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: webpage
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
   prerelease:
 platform: ruby
 authors:
@@ -9,17 +9,37 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-11 00:00:00.000000000 Z
-dependencies: []
-description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
-  work, careful
+date: 2012-07-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: a tool to extract some basic data from a webpage
 email: seoaqua@qq.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- webpage.rb
-homepage: http://github.com/seoaqua/ruby-webpage
+- .rspec
+- lib/webpage.rb
+- lib/webpage/common.rb
+- spec/spec_helper.rb
+- spec/webpage_spec.rb
+- webpage.gemspec
+homepage: https://github.com/seoaqua/webpage
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -39,9 +59,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.21
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
-summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
-  work, careful
+summary: a tool to extract some basic data from a webpage
 test_files: []

data/webpage.rb DELETED Viewed

@@ -1,161 +0,0 @@
-#coding:UTF-8
-require 'pp'
-require 'mechanize'
-require 'uri'
-# Stateless URI helper methods.
-class WebHelper
-    # Parses and normalizes uri and returns it as a String with a trailing
-    # "#fragment" removed when the URI has a fragment.
-    def self.uri_normalize(uri)
-        uri = URI.parse(uri).normalize
-        fragment = uri.fragment
-        uri = uri.to_s
-        uri.sub!(/##{fragment}$/,'') unless fragment.nil?
-        return uri
-        #uri = uri.to_s.strip.sub(/\#.*$/,'')
-        #uri.path = '/' if uri.path.nil?
-    end
-    # Returns the last two dot-separated labels of host (e.g. "b.c" for
-    # "a.b.c"), or false when host does not match the pattern.
-    def self.host_to_domain(host)
-        domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
-        return domain[1] unless domain.nil?
-        return false
-    end
-    # Escapes str with URI.encode, treating every character outside
-    # URI::PATTERN::UNRESERVED plus "#:/?%&=" as unsafe.
-    def self.uri_encode(str)
-        return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
-    end
-end
-# Reopens Mechanize::Page to add text, meta data and link-classification helpers.
-class Mechanize::Page
-    #@invalid_links = Hash.new
-    # Link buckets filled by scan_links (@nofollowed_links has no reader).
-    attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
-    public
-    # Returns the text of every text node in the page body.
-    def text
-        return Nokogiri::HTML(body).xpath("//text()").text
-        #return body.gsub(/<\/?[^>]*>/, "")
-    end
-    # Returns the comma-split content of the first keywords meta tag, or nil
-    # when the page has none.
-    def keywords
-        meta = search("//meta[@name='keywords']").first
-        return meta.attributes["content"].value.split(',') unless meta.nil?
-    end
-    # Returns the content of the first description meta tag, or false when the
-    # page has none.
-    def description
-        meta = search("//meta[@name='description']").first
-        if meta.nil?
-            return false
-        end
-        return meta.attributes['content'].value
-    end
-    # Looks up the :google rank of @uri through PageRankr and stores it in
-    # @pagerank. page_rankr is only required when this method is called.
-    def pagerank
-        require 'page_rankr'
-        @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
-        return @pagerank
-    end
-    # Sorts every link of the page into the @valid_links, @invalid_links,
-    # @nofollowed_links, @internal_outbound_links and @external_outbound_links
-    # arrays, then builds @outbound_links and sets @scanned.
-    def scan_links
-        @external_outbound_links = Array.new
-        @internal_outbound_links = Array.new
-        @valid_links = Array.new
-        @invalid_links = Array.new
-        @nofollowed_links = Array.new
-        exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
-        links.each do |link|
-            # preliminary parsing
-=begin
-    uri = URI.parse(link.uri).normalize
-    href = uri.to_s
-rescue URI::InvalidURIError => e
-    pp link
-    puts e
-    @invalid_links << link
-    next
-=end
-            # ignore non-http requests
-            if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
-                @invalid_links << link#todo duplicate keys across different links cannot be represented
-                next
-            end
-            # ignore non-web-page files, ignore js buttons, ignore mail links
-            if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
-                @invalid_links << link
-                next
-            end
-            #nofollow links
-            if link.rel.include?'nofollow'
-                @nofollowed_links << link
-                next
-            end
-            if link.respond_to?'fragment' and link.fragment.empty?
-                @invalid_links << link
-                next
-            end
-            # debug output: prints every link that got past the filters above
-            pp link
-            # handle relative paths
-            # NOTE(review): this branch has no `next`, so a relative link is
-            # pushed to @invalid_links here and to @valid_links further down.
-            if !link.uri.nil? and link.uri.relative?
-                @invalid_links << link
-                #puts @uri.merge(link)
-                #link.uri = @uri.merge(link.uri)
-                @internal_outbound_links << link unless link.uri == @uri
-            elsif link.uri.nil?
-                warn "warning: host nil #{link.uri}"
-                next
-            else
-                # internal when the href starts with "/" or resolves to the
-                # same domain as the page itself
-                if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
-                    @internal_outbound_links << link
-                else
-                    @external_outbound_links << link
-                end
-            end
-            @valid_links << link
-        end
-        @outbound_links = @internal_outbound_links + @external_outbound_links
-        @scanned = true
-    end
-end
-# Reopens URI::Generic: redefines absolute? and adds domain.
-class URI::Generic
-    # Returns true when the URI has a scheme or its path starts with "/".
-    def absolute?()
-        if @scheme or path.start_with?'/'
-            true
-        else
-            false
-        end
-    end
-    # Returns the last two dot-separated labels of host, or nil when host does
-    # not match the pattern.
-    def domain
-        domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
-        return domain[1] unless domain.nil?
-        return nil
-    end
-=begin
-    def normalize!
-        if path && path == ''
-            set_path('/')
-        end
-        if scheme && scheme != scheme.downcase
-            set_scheme(self.scheme.downcase)
-        end
-        if host && host != host.downcase
-            set_host(self.host.downcase)
-        end
-        set_fragment(nil) unless fragment.nil?
-    end
-=end
-end
-=begin
-class URI::Parser
-    def parse(uri)
-        scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
-        if scheme && URI.scheme_list.include?(scheme.upcase)
-            URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
-        else
-            URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
-        end
-    end
-end
-a = Mechanize.new
-w = a.get('http://dict.youdao.com/w/abc/')
-w.scan_links
-pp w.internal_outbound_links
-exit
-w.links.each do |link|
-    puts link.rel
-end
-=end