baidu 1.1.4 → 1.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/baidu.rb +76 -21
- metadata +52 -12
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6e5e2be5751728aa5aab6cc36fc9552a74aadba1
|
4
|
+
data.tar.gz: 28a8bb12fad2e0d3164908a61f13a9fc1d1d03e2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d2b99a90386ebad73fac60db3331d0ea3e8021c657ef7d6d4c0b703023df86f8b3bfc2df434862a07dcc7b1496f026779bcbd592b9159e11ae2790ccdb9d0481
|
7
|
+
data.tar.gz: ea3191864319521de14ab735a770e08204a64fc40e23d836be4e196669f24e2b0f1c994f2ab9ed2e5f924ebb650f8dba6f6ae318a472bdc46d64c27b3afb684d
|
data/lib/baidu.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
#
|
2
|
-
require 'mechanize'
|
1
|
+
# encoding: utf-8
|
3
2
|
require 'nokogiri'
|
4
3
|
require 'json'
|
5
4
|
require 'addressable/uri'
|
@@ -10,7 +9,7 @@ class SearchEngine
|
|
10
9
|
URI(url)
|
11
10
|
result = query(url)
|
12
11
|
return result.has_result?
|
13
|
-
end
|
12
|
+
end
|
14
13
|
end
|
15
14
|
class SearchResult
|
16
15
|
def initialize(body,baseuri,pagenumber=nil)
|
@@ -141,7 +140,9 @@ class MbaiduResult < SearchResult
|
|
141
140
|
href,text,host,is_mobile = '','','',false
|
142
141
|
a = result.search("a").first
|
143
142
|
is_mobile = true unless a.search("img").empty?
|
144
|
-
host = result.search('[@class="site"]').first
|
143
|
+
host = result.search('[@class="site"]').first
|
144
|
+
next if host.nil?
|
145
|
+
host = host.text
|
145
146
|
href = a['href']
|
146
147
|
text = a.text
|
147
148
|
id = href.scan(/&order=(\d+)&/)
|
@@ -237,7 +238,7 @@ class Baidu < SearchEngine
|
|
237
238
|
def extend(words,level=3,sleeptime=1)
|
238
239
|
level = level.to_i - 1
|
239
240
|
words = [words] unless words.respond_to? 'each'
|
240
|
-
|
241
|
+
|
241
242
|
extensions = Array.new
|
242
243
|
words.each do |word|
|
243
244
|
self.query(word)
|
@@ -254,7 +255,7 @@ class Baidu < SearchEngine
|
|
254
255
|
def popular?(wd)
|
255
256
|
return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
|
256
257
|
end
|
257
|
-
|
258
|
+
|
258
259
|
def query(wd)
|
259
260
|
q = Array.new
|
260
261
|
q << "wd=#{wd}"
|
@@ -263,8 +264,9 @@ class Baidu < SearchEngine
|
|
263
264
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
264
265
|
uri = URI.encode((BaseUri + queryStr))
|
265
266
|
begin
|
266
|
-
@page = @a.get uri
|
267
|
-
|
267
|
+
# @page = @a.get uri
|
268
|
+
@page = HTTParty.get uri
|
269
|
+
BaiduResult.new(@page,uri)
|
268
270
|
rescue Net::HTTP::Persistent::Error
|
269
271
|
warn "[timeout] #{uri}"
|
270
272
|
return false
|
@@ -307,23 +309,28 @@ class Baidu < SearchEngine
|
|
307
309
|
end
|
308
310
|
|
309
311
|
class BaiduResult < SearchResult
|
310
|
-
def initialize(page)
|
311
|
-
|
312
|
-
@page = page
|
312
|
+
def initialize(page,baseuri,pagenumber=1)
|
313
|
+
File.open('/tmp/file','w'){|f|f.puts page}
|
314
|
+
@page = Nokogiri::HTML page
|
315
|
+
@baseuri = baseuri
|
316
|
+
@pagenumber = pagenumber
|
317
|
+
# raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
318
|
+
# @page = page
|
313
319
|
end
|
314
|
-
|
320
|
+
|
315
321
|
def ranks
|
316
322
|
return @ranks unless @ranks.nil?
|
317
323
|
@ranks = Hash.new
|
318
|
-
@page.search("//table[@class=\"result\"]").each do |table|
|
324
|
+
@page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
|
319
325
|
id = table['id']
|
320
326
|
@ranks[id] = Hash.new
|
321
|
-
url =
|
322
|
-
|
327
|
+
url = table.search("[@class=\"g\"]").first
|
328
|
+
url = url.text unless url.nil?
|
329
|
+
a = table.search("a").first
|
323
330
|
@ranks[id]['text'] = a.text
|
324
|
-
@ranks[id]['href'] = a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
331
|
+
@ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
325
332
|
unless url.nil?
|
326
|
-
url = url.
|
333
|
+
url = url.strip
|
327
334
|
@ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
328
335
|
else
|
329
336
|
@ranks[id]['host'] = nil
|
@@ -332,7 +339,49 @@ class BaiduResult < SearchResult
|
|
332
339
|
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
333
340
|
@ranks
|
334
341
|
end
|
335
|
-
|
342
|
+
|
343
|
+
def ads_top
|
344
|
+
ads = {}
|
345
|
+
id=0
|
346
|
+
@page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
|
347
|
+
table_id = table['id']
|
348
|
+
next if table_id.nil?
|
349
|
+
id += 1
|
350
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
351
|
+
title = table.search("a").first.text.strip
|
352
|
+
ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
|
353
|
+
end
|
354
|
+
ads
|
355
|
+
end
|
356
|
+
def ads_bottom
|
357
|
+
ads = {}
|
358
|
+
@page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
|
359
|
+
id = table['id']
|
360
|
+
next unless id.nil?
|
361
|
+
id = id[-1,1]
|
362
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
363
|
+
title = table.search("a").first.text.strip
|
364
|
+
ads[id]= {'title'=>title,'href' => href,'host'=>href}
|
365
|
+
end
|
366
|
+
ads
|
367
|
+
end
|
368
|
+
def ads_right
|
369
|
+
ads = {}
|
370
|
+
@page.search("//div[@id='ec_im_container']").each do |table|
|
371
|
+
table.search("div[@id]").each do |div|
|
372
|
+
id = div['id'][-1,1].to_i+1
|
373
|
+
title = div.search("a").first
|
374
|
+
next if title.nil?
|
375
|
+
title = title.text
|
376
|
+
url = div.search("font[@color='#008000']").first
|
377
|
+
next if url.nil?
|
378
|
+
url = url.text
|
379
|
+
ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
|
380
|
+
end
|
381
|
+
end
|
382
|
+
ads
|
383
|
+
end
|
384
|
+
|
336
385
|
#return the top rank number from @ranks with the input host
|
337
386
|
# def rank(host)#on base of ranks
|
338
387
|
# ranks.each do |id,line|
|
@@ -353,13 +402,19 @@ class BaiduResult < SearchResult
|
|
353
402
|
def related_keywords
|
354
403
|
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
355
404
|
end
|
356
|
-
|
405
|
+
|
357
406
|
def next
|
358
|
-
|
407
|
+
url = @page.xpath('//a[text()="下一页>"]').first
|
408
|
+
return if url.nil?
|
409
|
+
url = url['href']
|
410
|
+
url = URI.join(@baseuri,url).to_s
|
411
|
+
body = HTTParty.get(url)
|
412
|
+
return BaiduResult.new(body,url,@pagenumber+1)
|
413
|
+
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
359
414
|
end
|
360
415
|
|
361
416
|
def has_result?
|
362
417
|
@page.search('//div[@class="nors"]').empty?
|
363
418
|
end
|
364
|
-
|
419
|
+
|
365
420
|
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.4
|
5
|
-
prerelease:
|
4
|
+
version: 1.1.5
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- seoaqua
|
@@ -10,8 +9,50 @@ autorequire:
|
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
11
|
date: 2012-06-13 00:00:00.000000000 Z
|
13
|
-
dependencies:
|
14
|
-
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: addressable
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: httparty
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
|
15
56
|
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
16
57
|
email: seoaqua@qq.com
|
17
58
|
executables: []
|
@@ -21,27 +62,26 @@ files:
|
|
21
62
|
- lib/baidu.rb
|
22
63
|
homepage: https://github.com/seoaqua/ruby-baidu
|
23
64
|
licenses: []
|
65
|
+
metadata: {}
|
24
66
|
post_install_message:
|
25
67
|
rdoc_options: []
|
26
68
|
require_paths:
|
27
69
|
- lib
|
28
70
|
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
-
none: false
|
30
71
|
requirements:
|
31
|
-
- -
|
72
|
+
- - '>='
|
32
73
|
- !ruby/object:Gem::Version
|
33
74
|
version: '0'
|
34
75
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
76
|
requirements:
|
37
|
-
- -
|
77
|
+
- - '>='
|
38
78
|
- !ruby/object:Gem::Version
|
39
79
|
version: '0'
|
40
80
|
requirements: []
|
41
81
|
rubyforge_project:
|
42
|
-
rubygems_version:
|
82
|
+
rubygems_version: 2.0.3
|
43
83
|
signing_key:
|
44
|
-
specification_version:
|
45
|
-
summary: to get keyword ranking,related queries and popularity from baidu.com.
|
46
|
-
is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
84
|
+
specification_version: 4
|
85
|
+
summary: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
|
86
|
+
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
47
87
|
test_files: []
|