baidu 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +123 -57
- metadata +2 -2
data/lib/baidu.rb
CHANGED
@@ -1,61 +1,127 @@
|
|
1
1
|
#coding:UTF-8
|
2
2
|
require 'mechanize'
|
3
3
|
# Small scraper for Baidu web-search result pages, built on Mechanize.
#
# Typical flow: call #query with a keyword, then read #how_many, #ranks or
# #rank, and move forward with #next. Values derived from the current result
# page are memoized in #data; #query empties that memo and #next drops the
# page-specific entries.
class Baidu
  # perpage:    results requested per page (sent as the "rn" parameter).
  # pagenumber: exposed for callers; not read anywhere inside this class.
  # debug:      when true, unparsable result links are reported on stdout.
  attr_accessor :perpage, :pagenumber, :debug
  # page: last fetched page (nil before the first query or after a failed one).
  # wd:   keyword passed to the most recent #query.
  # data: memo Hash with the keys 'how_many', 'ranks' and 'rank'.
  attr_reader :page, :wd, :data

  BaseUri = 'http://www.baidu.com/s?'

  # Sets up the Mechanize agent and the default paging state.
  def initialize
    @a = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Mozilla' }
    @a.idle_timeout = 2
    @a.max_history = 1
    @perpage = 100
    @page = nil
    @debug = false
    @data = {}
    # Paging state starts out defined so #next is safe before any #query.
    @number = 0
    @maxpage = 0
    @currpage = 0
  end

  # Runs a search for +wd+ and makes its first result page the current page.
  #
  # @param wd [#to_s] keyword; it is converted to GBK before being escaped,
  #   so characters without a GBK mapping raise an Encoding error.
  # @return [Integer, false] 0 (the reset page index) on success, false when
  #   the request failed with a SocketError.
  def query(wd)
    @data.clear
    @wd = wd
    @currpage = 0
    begin
      @page = @a.get(build_uri(wd))
    rescue SocketError => e
      puts e
      # Do not keep serving the previous keyword's page as if it were new.
      @page = nil
      @number = 0
      @maxpage = 0
      return false
    end
    clean
    # how_many yields false when the page shows no result counter.
    @number = how_many || 0
    # ceil, not round: 120 results at 100 per page occupy two pages.
    @maxpage = (@number / @perpage.to_f).ceil
    @currpage = 0
  end

  # Number of results Baidu reports for "site:<host>".
  # site:xxx.yyy.com
  #
  # Always issues a fresh query; the memoized count belongs to whatever
  # keyword was searched last and must not be reused here.
  def how_many_pages(host)
    query("site:#{host}")
    how_many
  end

  # Number of results Baidu reports for "domain:<uri>" (uri is sent quoted).
  # domain:xxx.yyy.com/path/file.html
  def how_many_links(uri)
    query("domain:\"#{uri}\"")
    how_many
  end

  # Number of results Baidu reports for "site:<host> inurl:<string>".
  # site:xxx.yyy.com inurl:zzz
  def how_many_pages_with(host, string)
    query("site:#{host} inurl:#{string}")
    how_many
  end

  # Position of the first link on the current result page whose host is
  # exactly +host+.
  #
  # @param host [String] host name to look for, e.g. "www.example.com".
  # @return [Integer, false] 1-based position within #ranks, or false when no
  #   link on the current page has that host. Hits are memoized per host.
  def rank(host) # on base of ranks
    cached = @data['rank']
    return cached[host] if cached && cached.key?(host)

    ranks.each_with_index do |uri, index|
      next unless host_of(uri) == host

      (@data['rank'] ||= {})[host] = index + 1
      return index + 1
    end
    false
  end

  # Link targets of the current page's result tables, in document order.
  #
  # @return [Array<String>] href of the first link inside each
  #   table[@class="result"]; tables without a usable link are skipped.
  # @raise [StandardError] when no searchable page is loaded.
  def ranks
    return @data['ranks'] if @data.key?('ranks')
    raise StandardError, 'wrong with @page' unless @page.respond_to?(:search)

    hrefs = @page.search("//table[@class=\"result\"]").map do |table|
      link = table.at_xpath('.//a')
      link && link['href']
    end
    @data['ranks'] = hrefs.compact
  end

  # Result count shown in the page's span[@class='nums'] element.
  #
  # @return [Integer, false] the digits of the counter text as an Integer
  #   (memoized), or false when the page has no such element.
  # @raise [StandardError] when no searchable page is loaded.
  def how_many
    return @data['how_many'] if @data.key?('how_many')
    raise StandardError, 'wrong with @page' unless @page.respond_to?(:search)

    counter = @page.search("//span[@class='nums']").first
    return false if counter.nil?

    @data['how_many'] = counter.content.gsub(/\D/, '').to_i
  end

  # Follows the "下一页" (next page) link of the current page.
  #
  # @return [Boolean] true after moving to the next page; false when there is
  #   no current page, no next link, or the last page implied by the reported
  #   result count has been reached.
  def next
    return false if @page.nil?

    nextbtn = @page.link_with(:text => /下一页/)
    return false if nextbtn.nil? || @currpage + 1 >= @maxpage

    @page = @a.click(nextbtn)
    @currpage += 1
    # Link lists and positions describe the previous page; the total does not.
    @data.delete('ranks')
    @data.delete('rank')
    clean
    true
  end

  private

  # Builds the search URL for +wd+. The keyword is converted to GBK and then
  # form-escaped, so "&", "=", "#" and spaces inside it cannot break or extend
  # the query string.
  def build_uri(wd)
    BaseUri + URI.encode_www_form([['wd', wd.to_s.encode('GBK')], ['rn', @perpage]])
  end

  # Host part of a result link, or nil when the link cannot be parsed.
  # Bytes outside printable ASCII are percent-escaped first because URI.parse
  # rejects them.
  def host_of(href)
    escaped = href.to_s.dup.force_encoding(Encoding::BINARY).gsub(/[^\x21-\x7E]/n) do |byte|
      format('%%%02X', byte.ord)
    end
    URI.parse(escaped).host
  rescue URI::InvalidURIError
    puts "invalid uri:#{href}" if @debug
    nil
  end

  # Re-labels the current page body as GBK and transcodes it in place to
  # UTF-8, dropping bytes that cannot be converted.
  # NOTE(review): assumes Baidu serves GBK-encoded pages — confirm.
  # NOTE(review): an earlier revision also called gsub! with the single
  # argument "[\U0080-\U2C77]+"; with one argument and no block that call
  # changed nothing, so it is omitted rather than turned into a real filter.
  def clean
    return if @page.nil?

    body = @page.body
    body.force_encoding('GBK')
    body.encode!('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
39
39
|
version: '0'
|
40
40
|
requirements: []
|
41
41
|
rubyforge_project:
|
42
|
-
rubygems_version: 1.8.
|
42
|
+
rubygems_version: 1.8.17
|
43
43
|
signing_key:
|
44
44
|
specification_version: 3
|
45
45
|
summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
|