baidu 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/baidu.rb +123 -57
- metadata +2 -2
data/lib/baidu.rb
CHANGED
@@ -1,61 +1,127 @@
|
|
1
1
|
#coding:UTF-8
|
2
2
|
require 'mechanize'
|
3
3
|
# Thin scraping client for Baidu web search, built on Mechanize.
#
# Typical use: call #query with a keyword, then read #how_many, #ranks or
# #rank, and page through the results with #next. Per-page results are
# memoized in the @data hash (exposed read-only through #data).
class Baidu
  attr_accessor :perpage, :pagenumber, :debug
  attr_reader :page, :wd, :data

  BaseUri = 'http://www.baidu.com/s?'

  # Sets up the Mechanize agent and the default settings
  # (100 results per page, debug output off, empty result cache).
  def initialize
    @a = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Mozilla' }
    @a.idle_timeout = 2
    @a.max_history = 1
    @perpage = 100
    @page = nil
    @debug = false
    @data = {}
  end

  # Runs a search for +wd+ and makes its first result page the current page.
  #
  # The keyword is transcoded to GBK and form-escaped, so characters such as
  # '&' or '#' inside the keyword can no longer corrupt the query string.
  #
  # @param wd [String] the search expression
  # @return [Integer, false] 0 (the current page index) on success, false
  #   when the request failed with a SocketError
  # @raise [Encoding::UndefinedConversionError] if +wd+ has characters that
  #   cannot be represented in GBK
  def query(wd)
    @data.clear
    @wd = wd
    params = [['wd', wd.to_s.encode('GBK')], ['rn', @perpage]]
    uri = BaseUri + URI.encode_www_form(params)
    begin
      @page = @a.get(uri)
    rescue SocketError => e
      puts e
      # Drop any page left over from an earlier query so later calls fail
      # loudly instead of silently reporting stale results.
      @page = nil
      @number = 0
      @maxpage = 0
      @currpage = 0
      return false
    end
    clean
    # how_many returns false when the counter element is missing.
    @number = how_many || 0
    @maxpage = (@number / @perpage.to_f).ceil
    @currpage = 0
  end

  # Number of results for the query "site:<host>".
  #
  # @param host [String] e.g. "xxx.yyy.com"
  # @return [Integer, false] see #how_many
  def how_many_pages(host)
    query("site:#{host}")
    how_many
  end

  # Number of results for the query domain:"<uri>".
  #
  # @param uri [String] e.g. "xxx.yyy.com/path/file.html"
  # @return [Integer, false] see #how_many
  def how_many_links(uri)
    query("domain:\"#{uri}\"")
    how_many
  end

  # Number of results for the query "site:<host> inurl:<string>".
  #
  # @param host [String] e.g. "xxx.yyy.com"
  # @param string [String] fragment that must appear in the URL
  # @return [Integer, false] see #how_many
  def how_many_pages_with(host, string)
    query("site:#{host} inurl:#{string}")
    how_many
  end

  # 1-based position, on the current result page, of the first result whose
  # link points at +host+.
  #
  # @param host [String] host name to look for
  # @return [Integer, false] the position, or false when no result matches
  # @raise [StandardError] if there is no current page
  def rank(host)
    cached = @data['rank']
    return cached[host] if cached && cached.key?(host)

    ranks.each_with_index do |uri, index|
      found = host_of(uri)
      next if found.nil? || found != host

      position = index + 1
      (@data['rank'] ||= {})[host] = position
      return position
    end
    false
  end

  # Link targets of the results on the current page, in display order.
  # A result table without any anchor contributes nil, so the positions of
  # the remaining entries still match what #rank reports.
  #
  # @return [Array<String, nil>]
  # @raise [StandardError] if there is no current page
  def ranks
    return @data['ranks'] if @data.key?('ranks')

    ensure_page!
    @data['ranks'] = @page.search("//table[@class=\"result\"]").map do |table|
      anchor = table.search('.//a').first
      anchor && anchor['href']
    end
  end

  # Result count shown on the current page: every non-digit character is
  # stripped from the text of the first span with class "nums".
  #
  # @return [Integer, false] false when the page has no such span
  # @raise [StandardError] if there is no current page
  def how_many
    return @data['how_many'] if @data.key?('how_many')

    ensure_page!
    counter = @page.search("//span[@class='nums']").first
    return false if counter.nil?

    @data['how_many'] = counter.content.gsub(/\D/, '').to_i
  end

  # Moves on to the next result page.
  #
  # @return [Boolean] true when the next page was loaded; false when the page
  #   has no "next page" link or the last page has been reached
  # @raise [StandardError] if there is no current page
  def next
    ensure_page!
    nextbtn = @page.link_with(:text => /下一页/)
    return false if nextbtn.nil? || @currpage + 1 >= @maxpage

    @page = @a.click(nextbtn)
    @currpage += 1
    # Cached link lists describe the page we just left.
    @data.delete('ranks')
    @data.delete('rank')
    clean
    true
  end

  private

  # Raises unless a result page is currently loaded.
  def ensure_page!
    raise StandardError, 'wrong with @page' unless @page.is_a?(Mechanize::Page)
  end

  # Host part of +uri+, or nil when it is missing or cannot be parsed.
  def host_of(uri)
    return nil if uri.nil?

    URI.parse(URI::DEFAULT_PARSER.escape(uri)).host
  rescue URI::InvalidURIError
    puts "invalid uri:#{uri}" if @debug
    nil
  end

  # Re-encodes the body of the current page from GBK to UTF-8 in place and
  # strips the characters in U+0080..U+2C77.
  def clean
    @page.body.force_encoding('GBK')
    @page.body.encode!('UTF-8', :invalid => :replace, :undef => :replace, :replace => "")
    # mechanize gets confused unless these few characters are removed
    @page.body.gsub!(/[\u0080-\u2C77]+/, '')
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
39
39
|
version: '0'
|
40
40
|
requirements: []
|
41
41
|
rubyforge_project:
|
42
|
-
rubygems_version: 1.8.
|
42
|
+
rubygems_version: 1.8.17
|
43
43
|
signing_key:
|
44
44
|
specification_version: 3
|
45
45
|
summary: to get data from www.baidu.com. this is built by a newbie, so please be careful
|