baidu 1.2.3 → 1.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/baidu.rb +24 -26
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ad277bccec5ed902f59dbaab5e18aaa5fa5af8f
|
4
|
+
data.tar.gz: 08a52511a1952bc3d0ab3398cca376d7f8960bed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b063ef6a9d4d85dea496b1b28fabe9ce184384452991de9872551cd39535e1bb55084156205bbdafadda7ee4f5be80f28517004d2ec5be683d7949647f4dbd6
|
7
|
+
data.tar.gz: b0950ebbb8c6aa2fd49388d92bc21306cbf323f5a2971380194525ee834f07a2abe862da3f10a87b3d375d1728699a087229eaceb221617f265f10b3c9fbd12e
|
data/lib/baidu.rb
CHANGED
@@ -5,6 +5,9 @@ require 'addressable/uri'
|
|
5
5
|
require 'httparty'
|
6
6
|
class SearchEngine
|
7
7
|
#是否收录
|
8
|
+
def initialize(perpage = 100)
|
9
|
+
@perpage = perpage
|
10
|
+
end
|
8
11
|
def indexed?(url)
|
9
12
|
URI(url)
|
10
13
|
result = query(url)
|
@@ -22,7 +25,14 @@ class SearchResult
|
|
22
25
|
@pagenumber = pagenumber
|
23
26
|
end
|
24
27
|
end
|
25
|
-
|
28
|
+
def whole
|
29
|
+
{
|
30
|
+
'ads_top'=>ads_top,
|
31
|
+
'ads_right'=>ads_right,
|
32
|
+
'ads_bottom'=>ads_bottom,
|
33
|
+
'ranks'=>ranks
|
34
|
+
}
|
35
|
+
end
|
26
36
|
#返回当前页中host满足条件的结果
|
27
37
|
def ranks_for(specific_host)
|
28
38
|
host_ranks = Hash.new
|
@@ -64,7 +74,6 @@ end
|
|
64
74
|
|
65
75
|
class QihooResult < SearchResult
|
66
76
|
Host = 'www.so.com'
|
67
|
-
|
68
77
|
#返回所有当前页的排名结果
|
69
78
|
def ranks
|
70
79
|
return @ranks unless @ranks.nil?
|
@@ -267,15 +276,6 @@ class MbaiduResult < SearchResult
|
|
267
276
|
end
|
268
277
|
class Baidu < SearchEngine
|
269
278
|
BaseUri = 'http://www.baidu.com/s?'
|
270
|
-
PerPage = 100
|
271
|
-
|
272
|
-
def initialize
|
273
|
-
# @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
|
274
|
-
# @a.idle_timeout = 2
|
275
|
-
# @a.max_history = 1
|
276
|
-
@page = nil
|
277
|
-
end
|
278
|
-
|
279
279
|
def suggestions(wd)
|
280
280
|
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
|
281
281
|
m = /\[([^\]]*)\]/.match json
|
@@ -313,7 +313,7 @@ class Baidu < SearchEngine
|
|
313
313
|
def query(wd)
|
314
314
|
q = Array.new
|
315
315
|
q << "wd=#{wd}"
|
316
|
-
q << "rn=#{PerPage}"
|
316
|
+
q << "rn=#{@perpage}"
|
317
317
|
queryStr = q.join("&")
|
318
318
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
319
319
|
uri = URI.encode((BaseUri + queryStr))
|
@@ -364,7 +364,6 @@ end
|
|
364
364
|
|
365
365
|
class BaiduResult < SearchResult
|
366
366
|
def initialize(page,baseuri,pagenumber=1)
|
367
|
-
File.open('/tmp/file','w'){|f|f.puts page}
|
368
367
|
@page = Nokogiri::HTML page
|
369
368
|
@baseuri = baseuri
|
370
369
|
@pagenumber = pagenumber
|
@@ -396,31 +395,30 @@ class BaiduResult < SearchResult
|
|
396
395
|
end
|
397
396
|
|
398
397
|
def ads_bottom
|
398
|
+
id = 0
|
399
399
|
ads = {}
|
400
|
-
|
401
|
-
|
402
|
-
next if table['id'].nil?
|
400
|
+
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
401
|
+
next unless table['id'].nil?
|
403
402
|
id += 1
|
404
|
-
|
405
|
-
title = table.search("a").first.text.strip
|
406
|
-
ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
|
403
|
+
ads[id]= parse_ad(table)
|
407
404
|
end
|
408
405
|
ads
|
409
406
|
end
|
410
407
|
def ads_top
|
411
408
|
id = 0
|
412
409
|
ads = {}
|
413
|
-
@page.search("//table[@
|
410
|
+
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
411
|
+
next if id.nil?
|
414
412
|
id += 1
|
415
|
-
|
416
|
-
next unless id.nil?
|
417
|
-
# id = id[-1,1]
|
418
|
-
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
419
|
-
title = table.search("a").first.text.strip
|
420
|
-
ads[id]= {'title'=>title,'href' => href,'host'=>href}
|
413
|
+
ads[id]= parse_ad(table)
|
421
414
|
end
|
422
415
|
ads
|
423
416
|
end
|
417
|
+
def parse_ad(table)
|
418
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
419
|
+
title = table.search("a").first.text.strip
|
420
|
+
{'title'=>title,'href' => href,'host'=>href}
|
421
|
+
end
|
424
422
|
def ads_right
|
425
423
|
ads = {}
|
426
424
|
@page.search("//div[@id='ec_im_container']").each do |table|
|