baidu 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/baidu.rb +50 -4
  2. metadata +4 -3
@@ -1,6 +1,52 @@
1
- class baidu
2
- def self.test
3
- puts "hi"
1
+ require 'mechanize'
2
+ class Baidu
3
+ def initialize(offset100=false)
4
+ @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
5
+ @perpage = 10
6
+ @perpage = 100 if offset100==true
7
+ @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
8
+ end
9
+ def query(query)
10
+ @uri = @baseuri+URI.encode(Iconv.conv('GBK','UTF-8',query))
11
+ @page = @a.get @uri
12
+ self.clean
13
+ @number = number
14
+ @maxpage = (@number / @perpage.to_f).round
15
+ @maxpage =10 if @maxpage>10
16
+ @currpage =0
17
+ end
18
+ def how_many_pages(url)
19
+ self.query("site:#{url}")
20
+ end
21
+ def how_many_links
22
+ self.query("domain:\"#{link}\"")
23
+ end
24
+ def how_many_pages_with(url,string)
25
+ self.query("site:#{url} inurl:#{string}")
26
+ end
27
+ def rank(host)
28
+ @page.search("//table[@class=\"result\"]").each do |table|
29
+ href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
30
+ begin
31
+ return table['id'] if host==URI.parse(URI.encode(href)).host
32
+ rescue URI::InvalidURIError
33
+ puts "invalid uri:#{href}"
34
+ end
35
+ end
36
+ return false
37
+ end
38
+ def number
39
+ return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
40
+ end
41
+ def next
42
+ nextbtn = @page.link_with(:text=>/下一页/)
43
+ return false if (nextbtn.nil? or @currpage >= @maxpage)
44
+ @page = @a.click(nextbtn)
45
+ self.clean
46
+ return true
47
+ end
48
+ def clean
49
+ @page.body = Iconv.conv('UTF-16','GBK//IGNORE',@page.body)
50
+ @page.body.gsub! ("[\U0080-\U2C77]+",'') //mechanize will be confuzed without removing the few characters
4
51
  end
5
52
  end
6
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,8 @@ bindir: bin
11
11
  cert_chain: []
12
12
  date: 2011-11-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to get data from www.baidu.com
14
+ description: to get data from www.baidu.com. this is built by a newbie, so please
15
+ be careful
15
16
  email: seoaqua@qq.com
16
17
  executables: []
17
18
  extensions: []
@@ -41,5 +42,5 @@ rubyforge_project:
41
42
  rubygems_version: 1.8.11
42
43
  signing_key:
43
44
  specification_version: 3
44
- summary: to get data from www.baidu.com
45
+ summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
45
46
  test_files: []