baidu 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/baidu.rb +24 -26
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ad277bccec5ed902f59dbaab5e18aaa5fa5af8f
|
4
|
+
data.tar.gz: 08a52511a1952bc3d0ab3398cca376d7f8960bed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b063ef6a9d4d85dea496b1b28fabe9ce184384452991de9872551cd39535e1bb55084156205bbdafadda7ee4f5be80f28517004d2ec5be683d7949647f4dbd6
|
7
|
+
data.tar.gz: b0950ebbb8c6aa2fd49388d92bc21306cbf323f5a2971380194525ee834f07a2abe862da3f10a87b3d375d1728699a087229eaceb221617f265f10b3c9fbd12e
|
data/lib/baidu.rb
CHANGED
@@ -5,6 +5,9 @@ require 'addressable/uri'
|
|
5
5
|
require 'httparty'
|
6
6
|
class SearchEngine
|
7
7
|
#是否收录
|
8
|
+
def initialize(perpage = 100)
|
9
|
+
@perpage = perpage
|
10
|
+
end
|
8
11
|
def indexed?(url)
|
9
12
|
URI(url)
|
10
13
|
result = query(url)
|
@@ -22,7 +25,14 @@ class SearchResult
|
|
22
25
|
@pagenumber = pagenumber
|
23
26
|
end
|
24
27
|
end
|
25
|
-
|
28
|
+
def whole
|
29
|
+
{
|
30
|
+
'ads_top'=>ads_top,
|
31
|
+
'ads_right'=>ads_right,
|
32
|
+
'ads_bottom'=>ads_bottom,
|
33
|
+
'ranks'=>ranks
|
34
|
+
}
|
35
|
+
end
|
26
36
|
#返回当前页中host满足条件的结果
|
27
37
|
def ranks_for(specific_host)
|
28
38
|
host_ranks = Hash.new
|
@@ -64,7 +74,6 @@ end
|
|
64
74
|
|
65
75
|
class QihooResult < SearchResult
|
66
76
|
Host = 'www.so.com'
|
67
|
-
|
68
77
|
#返回所有当前页的排名结果
|
69
78
|
def ranks
|
70
79
|
return @ranks unless @ranks.nil?
|
@@ -267,15 +276,6 @@ class MbaiduResult < SearchResult
|
|
267
276
|
end
|
268
277
|
class Baidu < SearchEngine
|
269
278
|
BaseUri = 'http://www.baidu.com/s?'
|
270
|
-
PerPage = 100
|
271
|
-
|
272
|
-
def initialize
|
273
|
-
# @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
|
274
|
-
# @a.idle_timeout = 2
|
275
|
-
# @a.max_history = 1
|
276
|
-
@page = nil
|
277
|
-
end
|
278
|
-
|
279
279
|
def suggestions(wd)
|
280
280
|
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
|
281
281
|
m = /\[([^\]]*)\]/.match json
|
@@ -313,7 +313,7 @@ class Baidu < SearchEngine
|
|
313
313
|
def query(wd)
|
314
314
|
q = Array.new
|
315
315
|
q << "wd=#{wd}"
|
316
|
-
q << "rn=#{PerPage}"
|
316
|
+
q << "rn=#{@perpage}"
|
317
317
|
queryStr = q.join("&")
|
318
318
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
319
319
|
uri = URI.encode((BaseUri + queryStr))
|
@@ -364,7 +364,6 @@ end
|
|
364
364
|
|
365
365
|
class BaiduResult < SearchResult
|
366
366
|
def initialize(page,baseuri,pagenumber=1)
|
367
|
-
File.open('/tmp/file','w'){|f|f.puts page}
|
368
367
|
@page = Nokogiri::HTML page
|
369
368
|
@baseuri = baseuri
|
370
369
|
@pagenumber = pagenumber
|
@@ -396,31 +395,30 @@ class BaiduResult < SearchResult
|
|
396
395
|
end
|
397
396
|
|
398
397
|
def ads_bottom
|
398
|
+
id = 0
|
399
399
|
ads = {}
|
400
|
-
|
401
|
-
|
402
|
-
next if table['id'].nil?
|
400
|
+
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
401
|
+
next unless table['id'].nil?
|
403
402
|
id += 1
|
404
|
-
|
405
|
-
title = table.search("a").first.text.strip
|
406
|
-
ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
|
403
|
+
ads[id]= parse_ad(table)
|
407
404
|
end
|
408
405
|
ads
|
409
406
|
end
|
410
407
|
def ads_top
|
411
408
|
id = 0
|
412
409
|
ads = {}
|
413
|
-
@page.search("//table[@
|
410
|
+
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
411
|
+
next if id.nil?
|
414
412
|
id += 1
|
415
|
-
|
416
|
-
next unless id.nil?
|
417
|
-
# id = id[-1,1]
|
418
|
-
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
419
|
-
title = table.search("a").first.text.strip
|
420
|
-
ads[id]= {'title'=>title,'href' => href,'host'=>href}
|
413
|
+
ads[id]= parse_ad(table)
|
421
414
|
end
|
422
415
|
ads
|
423
416
|
end
|
417
|
+
def parse_ad(table)
|
418
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
419
|
+
title = table.search("a").first.text.strip
|
420
|
+
{'title'=>title,'href' => href,'host'=>href}
|
421
|
+
end
|
424
422
|
def ads_right
|
425
423
|
ads = {}
|
426
424
|
@page.search("//div[@id='ec_im_container']").each do |table|
|