baidu 0.2.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/baidu.rb +47 -87
- metadata +5 -5
data/lib/baidu.rb
CHANGED
@@ -4,27 +4,23 @@ require 'mechanize'
|
|
4
4
|
require 'json'
|
5
5
|
require 'uri'
|
6
6
|
class Baidu
|
7
|
-
attr_accessor :perpage,:pagenumber,:debug
|
8
|
-
attr_reader :page,:wd,:data
|
9
7
|
BaseUri = 'http://www.baidu.com/s?'
|
8
|
+
PerPage = 100
|
9
|
+
|
10
10
|
def initialize
|
11
11
|
@a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
|
12
12
|
@a.idle_timeout = 2
|
13
13
|
@a.max_history = 1
|
14
|
-
@perpage = 100
|
15
14
|
@page = nil
|
16
|
-
@debug = false
|
17
|
-
@data = Hash.new
|
18
|
-
#@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
|
19
15
|
end
|
20
16
|
|
21
|
-
public
|
22
17
|
def suggestions(wd)
|
23
18
|
json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
|
24
19
|
m = /\[([^\]]*)\]/.match json
|
25
20
|
return JSON.parse m[0]
|
26
21
|
end
|
27
22
|
|
23
|
+
=begin
|
28
24
|
def extend(words,level=3,sleeptime=1)
|
29
25
|
level = level.to_i - 1
|
30
26
|
words = [words] unless words.respond_to? 'each'
|
@@ -40,32 +36,29 @@ class Baidu
|
|
40
36
|
return extensions if level < 1
|
41
37
|
return extensions + extend(extensions,level)
|
42
38
|
end
|
39
|
+
=end
|
43
40
|
|
44
41
|
def popular?(wd)
|
45
42
|
return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
|
46
43
|
end
|
47
44
|
|
48
45
|
def query(wd)
|
49
|
-
@data.clear
|
50
|
-
@wd = wd
|
51
|
-
@data.clear
|
52
46
|
q = Array.new
|
53
47
|
q << "wd=#{wd}"
|
54
|
-
q << "rn=#{
|
48
|
+
q << "rn=#{PerPage}"
|
55
49
|
queryStr = q.join("&")
|
56
|
-
uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
57
|
-
|
50
|
+
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
51
|
+
uri = URI.encode((BaseUri + queryStr))
|
52
|
+
begin
|
58
53
|
@page = @a.get uri
|
54
|
+
BaiduResult.new(@page)
|
59
55
|
rescue Net::HTTP::Persistent::Error
|
60
56
|
warn "#{uri}timeout"
|
57
|
+
return false
|
61
58
|
end
|
62
|
-
clean
|
63
|
-
@number = self.how_many
|
64
|
-
@maxpage = (@number / @perpage.to_f).round
|
65
|
-
@currpage =0
|
66
59
|
=begin
|
67
60
|
query = "#{query}"
|
68
|
-
@uri =
|
61
|
+
@uri = BaseUri+URI.encode(query.encode('GBK'))
|
69
62
|
@page = @a.get @uri
|
70
63
|
self.clean
|
71
64
|
@number = self.how_many
|
@@ -75,96 +68,63 @@ class Baidu
|
|
75
68
|
=end
|
76
69
|
end
|
77
70
|
|
71
|
+
=begin
|
72
|
+
def maxpage
|
73
|
+
@maxpage ||= (how_many / PerPage.to_f).round
|
74
|
+
end
|
75
|
+
=end
|
76
|
+
|
78
77
|
#site:xxx.yyy.com
|
79
78
|
def how_many_pages(host)
|
80
|
-
|
81
|
-
query("site:#{host}")
|
82
|
-
return how_many
|
79
|
+
query("site:#{host}").how_many
|
83
80
|
end
|
84
81
|
|
85
82
|
#domain:xxx.yyy.com/path/file.html
|
86
83
|
def how_many_links(uri)
|
87
|
-
|
88
|
-
query("domain:\"#{uri}\"")
|
89
|
-
return how_many
|
84
|
+
query("domain:\"#{uri}\"").how_many
|
90
85
|
end
|
91
86
|
|
92
87
|
#site:xxx.yyy.com inurl:zzz
|
93
88
|
def how_many_pages_with(host,string)
|
94
|
-
|
95
|
-
query("site:#{host} inurl:#{string}")
|
96
|
-
return how_many
|
89
|
+
query("site:#{host} inurl:#{string}").how_many
|
97
90
|
end
|
98
|
-
|
99
|
-
#look up a word and get the rank of a uri with $host
|
100
|
-
def rank(host)#on base of ranks
|
101
|
-
return @data[:rank][host] if @data.has_key?:rank and @data[:rank].has_key?host
|
102
|
-
ranks.each_with_index do |uri,index|
|
103
|
-
if URI.parse(URI.encode(uri).host)
|
104
|
-
@data << {:rank=>{host=>index+1}}
|
105
|
-
return index+1
|
106
|
-
end
|
107
|
-
end
|
91
|
+
|
108
92
|
=begin
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
puts "invalid uri:#{href}" if @debug
|
115
|
-
end
|
116
|
-
end
|
117
|
-
return false
|
118
|
-
=end
|
93
|
+
private
|
94
|
+
def clean
|
95
|
+
@page.body.force_encoding('GBK')
|
96
|
+
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
97
|
+
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
119
98
|
end
|
99
|
+
=end
|
100
|
+
end
|
120
101
|
|
121
|
-
|
122
|
-
|
123
|
-
raise
|
124
|
-
|
125
|
-
ranks = Array.new
|
126
|
-
@page.search("//table[@class=\"result\"]").each do |table|
|
127
|
-
ranks << @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
|
128
|
-
end
|
129
|
-
@data[:ranks] = ranks
|
130
|
-
return ranks
|
102
|
+
class BaiduResult
|
103
|
+
def initialize(page)
|
104
|
+
raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
105
|
+
@page = page
|
131
106
|
end
|
132
|
-
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
107
|
+
|
108
|
+
def ranks
|
109
|
+
@ranks ||= @page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//a").first['href'] }
|
110
|
+
end
|
111
|
+
|
112
|
+
#look up a word and get the rank of a uri with $host
|
113
|
+
def rank(host)#on base of ranks
|
114
|
+
ranks.each_with_index do |uri,index|
|
115
|
+
index+1 if URI.parse(URI.encode(uri)).host == host
|
141
116
|
end
|
142
|
-
@data[:realated_keywords] = keywords
|
143
|
-
return keywords
|
144
|
-
#m = /href="[^"]+">([^<]+)<\/a>/.match(related.content)
|
145
117
|
end
|
146
118
|
|
147
119
|
def how_many
|
148
|
-
|
149
|
-
raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
|
150
|
-
numSpan = @page.search("//span[@class='nums']").first
|
151
|
-
return false if numSpan.nil?
|
152
|
-
return numSpan.content.gsub(/\D/,'').to_i
|
153
|
-
#return false if @page.search("//span[@class='nums']").first.nil?
|
154
|
-
#return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
|
120
|
+
@how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
155
121
|
end
|
156
122
|
|
157
|
-
def
|
158
|
-
|
159
|
-
return false if (nextbtn.nil? or @currpage >= @maxpage)
|
160
|
-
@page = @a.click(nextbtn)
|
161
|
-
self.clean
|
162
|
-
return true
|
123
|
+
def related_keywords
|
124
|
+
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
163
125
|
end
|
164
|
-
|
165
|
-
def
|
166
|
-
@page.
|
167
|
-
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
168
|
-
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
126
|
+
|
127
|
+
def next
|
128
|
+
@page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
169
129
|
end
|
170
130
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,17 +9,17 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: to get keyword ranking,related queries and popularity from baidu.com.
|
15
|
-
this is built by a newbie, so please be careful
|
15
|
+
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
16
16
|
email: seoaqua@qq.com
|
17
17
|
executables: []
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
21
|
- lib/baidu.rb
|
22
|
-
homepage:
|
22
|
+
homepage: https://github.com/seoaqua/ruby-baidu
|
23
23
|
licenses: []
|
24
24
|
post_install_message:
|
25
25
|
rdoc_options: []
|
@@ -43,5 +43,5 @@ rubygems_version: 1.8.21
|
|
43
43
|
signing_key:
|
44
44
|
specification_version: 3
|
45
45
|
summary: to get keyword ranking,related queries and popularity from baidu.com. this
|
46
|
-
is built by a newbie, so please be careful
|
46
|
+
is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
47
47
|
test_files: []
|