baidu 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/baidu.rb +50 -4
  2. metadata +4 -3
@@ -1,6 +1,52 @@
1
- class baidu
2
- def self.test
3
- puts "hi"
1
+ require 'mechanize'
2
+ class Baidu
3
+ def initialize(offset100=false)
4
+ @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
5
+ @perpage = 10
6
+ @perpage = 100 if offset100==true
7
+ @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
8
+ end
9
+ def query(query)
10
+ @uri = @baseuri+URI.encode(Iconv.conv('GBK','UTF-8',query))
11
+ @page = @a.get @uri
12
+ self.clean
13
+ @number = number
14
+ @maxpage = (@number / @perpage.to_f).round
15
+ @maxpage =10 if @maxpage>10
16
+ @currpage =0
17
+ end
18
+ def how_many_pages(url)
19
+ self.query("site:#{url}")
20
+ end
21
+ def how_many_links
22
+ self.query("domain:\"#{link}\"")
23
+ end
24
+ def how_many_pages_with(url,string)
25
+ self.query("site:#{url} inurl:#{string}")
26
+ end
27
+ def rank(host)
28
+ @page.search("//table[@class=\"result\"]").each do |table|
29
+ href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
30
+ begin
31
+ return table['id'] if host==URI.parse(URI.encode(href)).host
32
+ rescue URI::InvalidURIError
33
+ puts "invalid uri:#{href}"
34
+ end
35
+ end
36
+ return false
37
+ end
38
+ def number
39
+ return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
40
+ end
41
+ def next
42
+ nextbtn = @page.link_with(:text=>/下一页/)
43
+ return false if (nextbtn.nil? or @currpage >= @maxpage)
44
+ @page = @a.click(nextbtn)
45
+ self.clean
46
+ return true
47
+ end
48
+ def clean
49
+ @page.body = Iconv.conv('UTF-16','GBK//IGNORE',@page.body)
50
+ @page.body.gsub! ("[\U0080-\U2C77]+",'') //mechanize will be confused without removing the few characters
4
51
  end
5
52
  end
6
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,8 @@ bindir: bin
11
11
  cert_chain: []
12
12
  date: 2011-11-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to get data from www.baidu.com
14
+ description: to get data from www.baidu.com. this is built by a newbie, so please
15
+ be careful
15
16
  email: seoaqua@qq.com
16
17
  executables: []
17
18
  extensions: []
@@ -41,5 +42,5 @@ rubyforge_project:
41
42
  rubygems_version: 1.8.11
42
43
  signing_key:
43
44
  specification_version: 3
44
- summary: to get data from www.baidu.com
45
+ summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
45
46
  test_files: []