baidu 0.2.6 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +47 -87
- metadata +5 -5
data/lib/baidu.rb
CHANGED
@@ -4,27 +4,23 @@ require 'mechanize'
|
|
4
4
|
require 'json'
|
5
5
|
require 'uri'
|
6
6
|
class Baidu
|
7
|
-
attr_accessor :perpage,:pagenumber,:debug
|
8
|
-
attr_reader :page,:wd,:data
|
9
7
|
BaseUri = 'http://www.baidu.com/s?'
|
8
|
+
PerPage = 100
|
9
|
+
|
10
10
|
def initialize
|
11
11
|
@a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
|
12
12
|
@a.idle_timeout = 2
|
13
13
|
@a.max_history = 1
|
14
|
-
@perpage = 100
|
15
14
|
@page = nil
|
16
|
-
@debug = false
|
17
|
-
@data = Hash.new
|
18
|
-
#@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
|
19
15
|
end
|
20
16
|
|
21
|
-
public
|
22
17
|
def suggestions(wd)
|
23
18
|
json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
|
24
19
|
m = /\[([^\]]*)\]/.match json
|
25
20
|
return JSON.parse m[0]
|
26
21
|
end
|
27
22
|
|
23
|
+
=begin
|
28
24
|
def extend(words,level=3,sleeptime=1)
|
29
25
|
level = level.to_i - 1
|
30
26
|
words = [words] unless words.respond_to? 'each'
|
@@ -40,32 +36,29 @@ class Baidu
|
|
40
36
|
return extensions if level < 1
|
41
37
|
return extensions + extend(extensions,level)
|
42
38
|
end
|
39
|
+
=end
|
43
40
|
|
44
41
|
def popular?(wd)
|
45
42
|
return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
|
46
43
|
end
|
47
44
|
|
48
45
|
def query(wd)
|
49
|
-
@data.clear
|
50
|
-
@wd = wd
|
51
|
-
@data.clear
|
52
46
|
q = Array.new
|
53
47
|
q << "wd=#{wd}"
|
54
|
-
q << "rn=#{@perpage}"
|
48
|
+
q << "rn=#{PerPage}"
|
55
49
|
queryStr = q.join("&")
|
56
|
-
uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
57
|
-
|
50
|
+
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
51
|
+
uri = URI.encode((BaseUri + queryStr))
|
52
|
+
begin
|
58
53
|
@page = @a.get uri
|
54
|
+
BaiduResult.new(@page)
|
59
55
|
rescue Net::HTTP::Persistent::Error
|
60
56
|
warn "#{uri}timeout"
|
57
|
+
return false
|
61
58
|
end
|
62
|
-
clean
|
63
|
-
@number = self.how_many
|
64
|
-
@maxpage = (@number / @perpage.to_f).round
|
65
|
-
@currpage =0
|
66
59
|
=begin
|
67
60
|
query = "#{query}"
|
68
|
-
@uri =
|
61
|
+
@uri = BaseUri+URI.encode(query.encode('GBK'))
|
69
62
|
@page = @a.get @uri
|
70
63
|
self.clean
|
71
64
|
@number = self.how_many
|
@@ -75,96 +68,63 @@ class Baidu
|
|
75
68
|
=end
|
76
69
|
end
|
77
70
|
|
71
|
+
=begin
|
72
|
+
def maxpage
|
73
|
+
@maxpage ||= (how_many / PerPage.to_f).round
|
74
|
+
end
|
75
|
+
=end
|
76
|
+
|
78
77
|
#site:xxx.yyy.com
|
79
78
|
def how_many_pages(host)
|
80
|
-
|
81
|
-
query("site:#{host}")
|
82
|
-
return how_many
|
79
|
+
query("site:#{host}").how_many
|
83
80
|
end
|
84
81
|
|
85
82
|
#domain:xxx.yyy.com/path/file.html
|
86
83
|
def how_many_links(uri)
|
87
|
-
|
88
|
-
query("domain:\"#{uri}\"")
|
89
|
-
return how_many
|
84
|
+
query("domain:\"#{uri}\"").how_many
|
90
85
|
end
|
91
86
|
|
92
87
|
#site:xxx.yyy.com inurl:zzz
|
93
88
|
def how_many_pages_with(host,string)
|
94
|
-
|
95
|
-
query("site:#{host} inurl:#{string}")
|
96
|
-
return how_many
|
89
|
+
query("site:#{host} inurl:#{string}").how_many
|
97
90
|
end
|
98
|
-
|
99
|
-
#look up a word and get the rank of a uri with $host
|
100
|
-
def rank(host)#on base of ranks
|
101
|
-
return @data[:rank][host] if @data.has_key?:rank and @data[:rank].has_key?host
|
102
|
-
ranks.each_with_index do |uri,index|
|
103
|
-
if URI.parse(URI.encode(uri).host)
|
104
|
-
@data << {:rank=>{host=>index+1}}
|
105
|
-
return index+1
|
106
|
-
end
|
107
|
-
end
|
91
|
+
|
108
92
|
=begin
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
puts "invalid uri:#{href}" if @debug
|
115
|
-
end
|
116
|
-
end
|
117
|
-
return false
|
118
|
-
=end
|
93
|
+
private
|
94
|
+
def clean
|
95
|
+
@page.body.force_encoding('GBK')
|
96
|
+
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
97
|
+
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
119
98
|
end
|
99
|
+
=end
|
100
|
+
end
|
120
101
|
|
121
|
-
|
122
|
-
|
123
|
-
raise
|
124
|
-
|
125
|
-
ranks = Array.new
|
126
|
-
@page.search("//table[@class=\"result\"]").each do |table|
|
127
|
-
ranks << @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
|
128
|
-
end
|
129
|
-
@data[:ranks] = ranks
|
130
|
-
return ranks
|
102
|
+
class BaiduResult
|
103
|
+
def initialize(page)
|
104
|
+
raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
105
|
+
@page = page
|
131
106
|
end
|
132
|
-
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
107
|
+
|
108
|
+
def ranks
|
109
|
+
@ranks ||= @page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//a").first['href'] }
|
110
|
+
end
|
111
|
+
|
112
|
+
#look up a word and get the rank of a uri with $host
|
113
|
+
def rank(host)#on base of ranks
|
114
|
+
ranks.each_with_index do |uri,index|
|
115
|
+
index+1 if URI.parse(URI.encode(uri)).host == host
|
141
116
|
end
|
142
|
-
@data[:realated_keywords] = keywords
|
143
|
-
return keywords
|
144
|
-
#m = /href="[^"]+">([^<]+)<\/a>/.match(related.content)
|
145
117
|
end
|
146
118
|
|
147
119
|
def how_many
|
148
|
-
|
149
|
-
raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
|
150
|
-
numSpan = @page.search("//span[@class='nums']").first
|
151
|
-
return false if numSpan.nil?
|
152
|
-
return numSpan.content.gsub(/\D/,'').to_i
|
153
|
-
#return false if @page.search("//span[@class='nums']").first.nil?
|
154
|
-
#return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
|
120
|
+
@how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
155
121
|
end
|
156
122
|
|
157
|
-
def
|
158
|
-
|
159
|
-
return false if (nextbtn.nil? or @currpage >= @maxpage)
|
160
|
-
@page = @a.click(nextbtn)
|
161
|
-
self.clean
|
162
|
-
return true
|
123
|
+
def related_keywords
|
124
|
+
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
163
125
|
end
|
164
|
-
|
165
|
-
def clean
|
166
|
-
@page.body.force_encoding('GBK')
|
167
|
-
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
168
|
-
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
126
|
+
|
127
|
+
def next
|
128
|
+
@page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
169
129
|
end
|
170
130
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,17 +9,17 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: to get keyword ranking,related queries and popularity from baidu.com.
|
15
|
-
this is built by a newbie, so please be careful
|
15
|
+
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
16
16
|
email: seoaqua@qq.com
|
17
17
|
executables: []
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
21
|
- lib/baidu.rb
|
22
|
-
homepage:
|
22
|
+
homepage: https://github.com/seoaqua/ruby-baidu
|
23
23
|
licenses: []
|
24
24
|
post_install_message:
|
25
25
|
rdoc_options: []
|
@@ -43,5 +43,5 @@ rubygems_version: 1.8.21
|
|
43
43
|
signing_key:
|
44
44
|
specification_version: 3
|
45
45
|
summary: to get keyword ranking,related queries and popularity from baidu.com. this
|
46
|
-
is built by a newbie, so please be careful
|
46
|
+
is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
47
47
|
test_files: []
|