baidu 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +11 -6
- metadata +2 -2
data/lib/baidu.rb
CHANGED
@@ -3,30 +3,33 @@ require 'mechanize'
|
|
3
3
|
class Baidu
|
4
4
|
def initialize(offset100=false)
|
5
5
|
@a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
|
6
|
+
@a.max_history = 1
|
6
7
|
@perpage = 10
|
7
8
|
@perpage = 100 if offset100==true
|
8
9
|
@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
|
9
10
|
end
|
10
11
|
def query(query)
|
12
|
+
query = "#{query}"
|
11
13
|
@uri = @baseuri+URI.encode(query.encode('GBK'))
|
12
14
|
@page = @a.get @uri
|
15
|
+
#File.open('/tmp/testpage','w'){|f|f.puts @page.body} if query=='1458 Italia'
|
13
16
|
self.clean
|
14
|
-
@number = self.
|
17
|
+
@number = self.how_many
|
15
18
|
@maxpage = (@number / @perpage.to_f).round
|
16
19
|
@maxpage =10 if @maxpage>10
|
17
20
|
@currpage =0
|
18
21
|
end
|
19
22
|
def how_many_pages(uri)
|
20
23
|
self.query("site:#{uri}")
|
21
|
-
return self.
|
24
|
+
return self.how_many
|
22
25
|
end
|
23
26
|
def how_many_links(uri)
|
24
27
|
self.query("domain:\"#{uri}\"")
|
25
|
-
return self.
|
28
|
+
return self.how_many
|
26
29
|
end
|
27
30
|
def how_many_pages_with(url,string)
|
28
31
|
self.query("site:#{url} inurl:#{string}")
|
29
|
-
return self.
|
32
|
+
return self.how_many
|
30
33
|
end
|
31
34
|
def rank(host)
|
32
35
|
@page.search("//table[@class=\"result\"]").each do |table|
|
@@ -39,7 +42,8 @@ class Baidu
|
|
39
42
|
end
|
40
43
|
return false
|
41
44
|
end
|
42
|
-
def
|
45
|
+
def how_many
|
46
|
+
return false if @page.search("//span[@class='nums']").first.nil?
|
43
47
|
return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
|
44
48
|
end
|
45
49
|
def next
|
@@ -50,7 +54,8 @@ class Baidu
|
|
50
54
|
return true
|
51
55
|
end
|
52
56
|
def clean
|
53
|
-
@page.body.
|
57
|
+
@page.body.force_encoding('GBK')
|
58
|
+
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
54
59
|
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
55
60
|
end
|
56
61
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
39
39
|
version: '0'
|
40
40
|
requirements: []
|
41
41
|
rubyforge_project:
|
42
|
-
rubygems_version: 1.8.
|
42
|
+
rubygems_version: 1.8.15
|
43
43
|
signing_key:
|
44
44
|
specification_version: 3
|
45
45
|
summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
|