baidu 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/baidu.rb +123 -57
- metadata +2 -2
    
        data/lib/baidu.rb
    CHANGED
    
    | @@ -1,61 +1,127 @@ | |
| 1 1 | 
             
            #coding:UTF-8
         | 
| 2 2 | 
             
            require 'mechanize'
         | 
| 3 3 | 
             
            class Baidu
         | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 4 | 
            +
                # Read/write settings: +perpage+ is sent as the "rn" parameter by #query,
                # +debug+ starts out false in #initialize. +pagenumber+ is declared here
                # but not assigned anywhere in the visible code.
                attr_accessor :perpage,:pagenumber,:debug
         | 
| 5 | 
            +
                # Read-only state: +page+ is the page most recently fetched by #query or
                # #next, +wd+ the last search expression passed to #query, and +data+ the
                # Hash that is cleared at the start of every #query.
                attr_reader :page,:wd,:data
         | 
| 6 | 
            +
                # Search endpoint; #query appends the "wd" and "rn" parameters to it.
                BaseUri = 'http://www.baidu.com/s?'
         | 
| 7 | 
            +
                # Prepares a scraper with its HTTP agent and default settings.
                #
                # The Mechanize agent identifies itself with the 'Linux Mozilla' alias,
                # uses an idle timeout of 2 (presumably seconds - confirm against the
                # Mechanize documentation) and keeps a history of one page.
                # Defaults: 100 results per page, debug output off, no page loaded and
                # an empty @data hash.
                def initialize
                    @a = Mechanize.new do |agent|
                        agent.user_agent_alias = 'Linux Mozilla'
                    end
                    @a.idle_timeout = 2
                    @a.max_history = 1

                    @perpage = 100
                    @page = nil
                    @debug = false
                    @data = {}
                end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                public
         | 
| 19 | 
            +
                # Runs a search for +wd+ and makes its first result page the current page.
                #
                # Clears @data, requests BaseUri with the "wd" (search expression) and
                # "rn" (@perpage) parameters, passes the page through #clean and stores
                # the statistics read by #next (@number, @maxpage, @currpage).
                #
                # wd - search expression, e.g. "ruby" or "site:example.com".
                #
                # Returns 0 (the reset page index, as before) when a page was fetched,
                # or false when the request failed with a SocketError. In the failure
                # case @page is left nil, so a page from an earlier query is never
                # mistaken for the result of this one.
                # Raises whatever String#encode raises when +wd+ cannot be expressed
                # in GBK.
                def query(wd)
                    @data.clear
                    @wd = wd
                    @page = nil

                    # Escape the GBK bytes of the expression as a form component so that
                    # "&", "=" or "#" inside +wd+ cannot alter the query-string structure.
                    keyword = URI.encode_www_form_component(wd.to_s.encode('GBK'))
                    uri = "#{BaseUri}wd=#{keyword}&rn=#{@perpage}"

                    begin
                        @page = @a.get uri
                    rescue SocketError => e
                        puts e
                        return false
                    end

                    clean
                    # how_many answers false when the page has no counter; count that
                    # as zero results instead of dividing false by a Float.
                    @number = how_many || 0
                    @maxpage = (@number / @perpage.to_f).round
                    @currpage = 0
                end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                #site:xxx.yyy.com
         | 
| 50 | 
            +
                # Reports how many pages the search engine lists for +host+, using a
                # "site:host" query.
                #
                # A fresh query is always issued: @data is not keyed by host, so
                # answering from a stored 'how_many' entry could return the figure
                # of an earlier, unrelated search.
                #
                # host - host name such as "xxx.yyy.com".
                #
                # Returns whatever #how_many reports after the query.
                def how_many_pages(host)
                    query("site:#{host}")
                    how_many
                end
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                #domain:xxx.yyy.com/path/file.html
         | 
| 57 | 
            +
                # Reports how many results a 'domain:"uri"' query yields for +uri+.
                #
                # A fresh query is always issued: @data is not keyed by uri, so
                # answering from a stored 'how_many' entry could return the figure
                # of an earlier, unrelated search.
                #
                # uri - address such as "xxx.yyy.com/path/file.html".
                #
                # Returns whatever #how_many reports after the query.
                def how_many_links(uri)
                    query("domain:\"#{uri}\"")
                    how_many
                end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                #site:xxx.yyy.com inurl:zzz
         | 
| 64 | 
            +
                # Reports how many pages of +host+ have +string+ in their URL, using a
                # "site:host inurl:string" query.
                #
                # A fresh query is always issued: @data is not keyed by the arguments,
                # so answering from a stored 'how_many' entry could return the figure
                # of an earlier, unrelated search.
                #
                # host   - host name such as "xxx.yyy.com".
                # string - text that must occur in the result URL.
                #
                # Returns whatever #how_many reports after the query.
                def how_many_pages_with(host,string)
                    query("site:#{host} inurl:#{string}")
                    how_many
                end
         | 
| 69 | 
            +
                ########################################################################################################################
         | 
| 70 | 
            +
                #look up a word and get the rank of a uri with $host
         | 
| 71 | 
            +
                # Finds the position of +host+ among the links returned by #ranks.
                #
                # Each link is escaped, parsed, and its host compared with +host+;
                # the first match wins. Hits are remembered in @data['rank'] so a
                # repeated lookup for the same host is answered without re-scanning.
                # Links that cannot be parsed are skipped (and reported when @debug
                # is set).
                #
                # host - host name to look for, e.g. "www.example.com".
                #
                # Returns the 1-based position as an Integer, or false when no link
                # on the current page belongs to +host+.
                def rank(host)
                    cached = @data['rank']
                    return cached[host] if cached && cached.has_key?(host)

                    ranks.each_with_index do |uri, index|
                        begin
                            # Escape first so links containing non-ASCII characters parse.
                            link_host = URI.parse(URI::DEFAULT_PARSER.escape(uri.to_s)).host
                        rescue URI::InvalidURIError
                            puts "invalid uri:#{uri}" if @debug
                            next
                        end
                        next unless link_host == host

                        (@data['rank'] ||= {})[host] = index + 1
                        return index + 1
                    end
                    false
                end
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                # Collects the first link of every result table on the current page.
                #
                # Result tables are those matching //table[@class="result"]. For each
                # one the first <a> inside that table is taken; tables without any
                # link are skipped instead of raising. The list is stored in
                # @data['ranks'] and reused until @data is cleared.
                #
                # Returns an Array of href values in page order.
                # Raises StandardError when @page is not a Mechanize::Page.
                def ranks
                    return @data['ranks'] if @data.has_key?('ranks')
                    raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page

                    hrefs = []
                    @page.search("//table[@class=\"result\"]").each do |table|
                        # Search relative to the table itself rather than re-scanning the
                        # whole document by id for every result.
                        link = table.search('.//a').first
                        hrefs << link['href'] unless link.nil?
                    end
                    @data['ranks'] = hrefs
                end
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                # Reads the result counter from the current page.
                #
                # A value stored under @data['how_many'] takes precedence. Otherwise
                # the first <span class="nums"> is located and every non-digit
                # character is dropped from its text before converting to Integer.
                #
                # Returns the Integer count, or false when the page has no such span.
                # Raises StandardError when @page is not a Mechanize::Page.
                def how_many
                    return @data['how_many'] if @data.has_key?('how_many')
                    unless @page.instance_of?(Mechanize::Page)
                        raise StandardError, 'wrong with @page'
                    end

                    counter = @page.search("//span[@class='nums']").first
                    counter.nil? ? false : counter.content.gsub(/\D/, '').to_i
                end
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                # Advances to the following result page, if there is one.
                #
                # Follows the link whose text matches "下一页" on the current page,
                # normalises the new page through #clean and advances @currpage so
                # that the @maxpage limit computed by #query actually takes effect.
                #
                # Returns true after moving to a new page; false when no page is
                # loaded, the link is absent, or @currpage has reached @maxpage.
                def next
                    return false if @page.nil?

                    nextbtn = @page.link_with(:text=>/下一页/)
                    return false if nextbtn.nil? || @currpage >= @maxpage

                    @page = @a.click(nextbtn)
                    @currpage += 1
                    # clean is private: call it without an explicit receiver.
                    clean
                    true
                end
         | 
| 121 | 
            +
                private
         | 
| 122 | 
            +
                # Normalises the body of the current page in place.
                #
                # The body is tagged as GBK, transcoded to UTF-8 (invalid or
                # unconvertible sequences are dropped), and runs of characters in the
                # range U+0080..U+2C77 are then removed. The original comment states
                # that mechanize gets confused unless those few characters are removed.
                #
                # NOTE(review): the removal previously never ran - gsub! was given a
                # single plain-string argument with unsupported "\U" escapes, which
                # only produced an Enumerator. Confirm the range is the intended one.
                #
                # Returns the cleaned body String.
                def clean
                    body = @page.body
                    body.force_encoding('GBK')
                    body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
                    body.gsub!(/[\u0080-\u2C77]+/, '')
                    body
                end
         | 
| 61 127 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: baidu
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.2.3
         | 
| 4 | 
            +
              version: 0.2.4
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 39 39 | 
             
                  version: '0'
         | 
| 40 40 | 
             
            requirements: []
         | 
| 41 41 | 
             
            rubyforge_project: 
         | 
| 42 | 
            -
            rubygems_version: 1.8. | 
| 42 | 
            +
            rubygems_version: 1.8.17
         | 
| 43 43 | 
             
            signing_key: 
         | 
| 44 44 | 
             
            specification_version: 3
         | 
| 45 45 | 
             
            summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
         |