baidu 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +50 -4
- metadata +4 -3
data/lib/baidu.rb
CHANGED
@@ -1,6 +1,52 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
require 'mechanize'
|
2
|
+
# Small scraper for Baidu web-search result pages, driven through Mechanize.
#
# Typical use: create an instance, call #query (or one of the how_many_*
# helpers) to load the first result page, then inspect it with #number or
# #rank and move to further result pages with #next.
class Baidu
  # Charset used for the query string sent to Baidu and assumed for the
  # returned page bytes.
  SOURCE_ENCODING = 'GBK'

  # Upper bound on the number of result pages #next is allowed to walk.
  MAX_PAGES = 10

  # Characters removed from every fetched page before it is parsed.
  # The range is the one named in the original source; it leaves ASCII and
  # the CJK ideograph block (U+4E00 and up) untouched.
  STRIP_PATTERN = /[\u0080-\u2C77]+/

  # @param offset100 [Boolean] pass exactly +true+ to request 100 results per
  #   page; any other value keeps the default of 10.
  def initialize(offset100 = false)
    @a = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Mozilla' }
    # Deliberately compares against +true+: only that exact value switched
    # the page size before, so other truthy arguments keep meaning "10".
    @perpage = offset100 == true ? 100 : 10
    @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
  end

  # Runs a search and loads its first result page.
  #
  # @param query [String] search terms; must be convertible to GBK.
  # @return [Integer] the result count reported by the page (see #number).
  # @raise [EncodingError] if +query+ cannot be represented in GBK.
  def query(query)
    # Percent-encode the GBK bytes. The form-component encoder also escapes
    # "&", "#", quotes and spaces, which would otherwise break the URL.
    gbk_bytes = query.encode(SOURCE_ENCODING).force_encoding(Encoding::BINARY)
    @uri = @baseuri + URI.encode_www_form_component(gbk_bytes)
    @page = @a.get(@uri)
    clean
    @number = number
    # Round up so a partially filled last page still counts as a page.
    @maxpage = [(@number / @perpage.to_f).ceil, MAX_PAGES].min
    @currpage = 0 # zero-based index of the result page currently loaded
    @number
  end

  # Searches for <tt>site:url</tt>.
  #
  # @param url [String]
  # @return [Integer] the reported result count.
  def how_many_pages(url)
    query("site:#{url}")
  end

  # Searches for <tt>domain:"link"</tt>.
  #
  # @param link [String]
  # @return [Integer] the reported result count.
  def how_many_links(link)
    query("domain:\"#{link}\"")
  end

  # Searches for <tt>site:url inurl:string</tt>.
  #
  # @param url [String]
  # @param string [String]
  # @return [Integer] the reported result count.
  def how_many_pages_with(url, string)
    query("site:#{url} inurl:#{string}")
  end

  # Finds the first result on the current page whose first link points at
  # +host+.
  #
  # @param host [String] host name to compare with each link's URI host.
  # @return [String, false] the +id+ attribute of the matching result table,
  #   or +false+ when no result on this page matches.
  def rank(host)
    @page.search("//table[@class=\"result\"]").each do |table|
      anchor = table.search('.//a').first
      href = anchor && anchor['href']
      next unless href # result table without a usable link

      begin
        # Escape first so hrefs containing spaces or non-ASCII text still parse.
        return table['id'] if host == URI.parse(URI::DEFAULT_PARSER.escape(href)).host
      rescue URI::InvalidURIError
        puts "invalid uri:#{href}"
      end
    end
    false
  end

  # Reads the result count from the current page.
  #
  # @return [Integer] all digits of the first <tt>span.nums</tt> element read
  #   as one number, or 0 when the page has no such element.
  def number
    counter = @page.search("//span[@class='nums']").first
    return 0 unless counter

    counter.content.gsub(/\D/, '').to_i
  end

  # Follows the "next page" link of the current result page.
  #
  # @return [Boolean] +true+ after loading the next page; +false+ when there
  #   is no such link or the page limit computed by #query has been reached.
  def next
    nextbtn = @page.link_with(:text => /下一页/)
    return false if nextbtn.nil? || @currpage + 1 >= @maxpage

    @page = @a.click(nextbtn)
    @currpage += 1
    clean
    true
  end

  # Re-encodes the body of the current page and strips the characters in
  # STRIP_PATTERN, which the original author found necessary for Mechanize
  # to parse the page.
  #
  # Bytes that are not valid GBK are dropped rather than raising.
  def clean
    text = @page.body.encode('UTF-8', SOURCE_ENCODING,
                             :invalid => :replace, :undef => :replace, :replace => '')
    text.gsub!(STRIP_PATTERN, '')
    # NOTE(review): UTF-16 is the target encoding the original code used;
    # confirm that the parser behind Mechanize still expects it.
    @page.body = text.encode('UTF-16')
  end
end
|
6
|
-
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,8 @@ bindir: bin
|
|
11
11
|
cert_chain: []
|
12
12
|
date: 2011-11-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description: to get data from www.baidu.com
|
14
|
+
description: to get data from www.baidu.com. this is built by a newbie, so please
|
15
|
+
be careful
|
15
16
|
email: seoaqua@qq.com
|
16
17
|
executables: []
|
17
18
|
extensions: []
|
@@ -41,5 +42,5 @@ rubyforge_project:
|
|
41
42
|
rubygems_version: 1.8.11
|
42
43
|
signing_key:
|
43
44
|
specification_version: 3
|
44
|
-
summary: to get data from www.baidu.com
|
45
|
+
summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
|
45
46
|
test_files: []
|