baidu 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/baidu.rb +76 -21
- metadata +52 -12
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6e5e2be5751728aa5aab6cc36fc9552a74aadba1
|
4
|
+
data.tar.gz: 28a8bb12fad2e0d3164908a61f13a9fc1d1d03e2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d2b99a90386ebad73fac60db3331d0ea3e8021c657ef7d6d4c0b703023df86f8b3bfc2df434862a07dcc7b1496f026779bcbd592b9159e11ae2790ccdb9d0481
|
7
|
+
data.tar.gz: ea3191864319521de14ab735a770e08204a64fc40e23d836be4e196669f24e2b0f1c994f2ab9ed2e5f924ebb650f8dba6f6ae318a472bdc46d64c27b3afb684d
|
data/lib/baidu.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
#
|
2
|
-
require 'mechanize'
|
1
|
+
# encoding: utf-8
|
3
2
|
require 'nokogiri'
|
4
3
|
require 'json'
|
5
4
|
require 'addressable/uri'
|
@@ -10,7 +9,7 @@ class SearchEngine
|
|
10
9
|
URI(url)
|
11
10
|
result = query(url)
|
12
11
|
return result.has_result?
|
13
|
-
end
|
12
|
+
end
|
14
13
|
end
|
15
14
|
class SearchResult
|
16
15
|
def initialize(body,baseuri,pagenumber=nil)
|
@@ -141,7 +140,9 @@ class MbaiduResult < SearchResult
|
|
141
140
|
href,text,host,is_mobile = '','','',false
|
142
141
|
a = result.search("a").first
|
143
142
|
is_mobile = true unless a.search("img").empty?
|
144
|
-
host = result.search('[@class="site"]').first
|
143
|
+
host = result.search('[@class="site"]').first
|
144
|
+
next if host.nil?
|
145
|
+
host = host.text
|
145
146
|
href = a['href']
|
146
147
|
text = a.text
|
147
148
|
id = href.scan(/&order=(\d+)&/)
|
@@ -237,7 +238,7 @@ class Baidu < SearchEngine
|
|
237
238
|
def extend(words,level=3,sleeptime=1)
|
238
239
|
level = level.to_i - 1
|
239
240
|
words = [words] unless words.respond_to? 'each'
|
240
|
-
|
241
|
+
|
241
242
|
extensions = Array.new
|
242
243
|
words.each do |word|
|
243
244
|
self.query(word)
|
@@ -254,7 +255,7 @@ class Baidu < SearchEngine
|
|
254
255
|
def popular?(wd)
|
255
256
|
return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
|
256
257
|
end
|
257
|
-
|
258
|
+
|
258
259
|
def query(wd)
|
259
260
|
q = Array.new
|
260
261
|
q << "wd=#{wd}"
|
@@ -263,8 +264,9 @@ class Baidu < SearchEngine
|
|
263
264
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
264
265
|
uri = URI.encode((BaseUri + queryStr))
|
265
266
|
begin
|
266
|
-
@page = @a.get uri
|
267
|
-
|
267
|
+
# @page = @a.get uri
|
268
|
+
@page = HTTParty.get uri
|
269
|
+
BaiduResult.new(@page,uri)
|
268
270
|
rescue Net::HTTP::Persistent::Error
|
269
271
|
warn "[timeout] #{uri}"
|
270
272
|
return false
|
@@ -307,23 +309,28 @@ class Baidu < SearchEngine
|
|
307
309
|
end
|
308
310
|
|
309
311
|
class BaiduResult < SearchResult
|
310
|
-
def initialize(page)
|
311
|
-
|
312
|
-
@page = page
|
312
|
+
def initialize(page,baseuri,pagenumber=1)
|
313
|
+
File.open('/tmp/file','w'){|f|f.puts page}
|
314
|
+
@page = Nokogiri::HTML page
|
315
|
+
@baseuri = baseuri
|
316
|
+
@pagenumber = pagenumber
|
317
|
+
# raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
318
|
+
# @page = page
|
313
319
|
end
|
314
|
-
|
320
|
+
|
315
321
|
def ranks
|
316
322
|
return @ranks unless @ranks.nil?
|
317
323
|
@ranks = Hash.new
|
318
|
-
@page.search("//table[@class=\"result\"]").each do |table|
|
324
|
+
@page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
|
319
325
|
id = table['id']
|
320
326
|
@ranks[id] = Hash.new
|
321
|
-
url =
|
322
|
-
|
327
|
+
url = table.search("[@class=\"g\"]").first
|
328
|
+
url = url.text unless url.nil?
|
329
|
+
a = table.search("a").first
|
323
330
|
@ranks[id]['text'] = a.text
|
324
|
-
@ranks[id]['href'] = a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
331
|
+
@ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
325
332
|
unless url.nil?
|
326
|
-
url = url.
|
333
|
+
url = url.strip
|
327
334
|
@ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
328
335
|
else
|
329
336
|
@ranks[id]['host'] = nil
|
@@ -332,7 +339,49 @@ class BaiduResult < SearchResult
|
|
332
339
|
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
333
340
|
@ranks
|
334
341
|
end
|
335
|
-
|
342
|
+
|
343
|
+
def ads_top
|
344
|
+
ads = {}
|
345
|
+
id=0
|
346
|
+
@page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
|
347
|
+
table_id = table['id']
|
348
|
+
next if table_id.nil?
|
349
|
+
id += 1
|
350
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
351
|
+
title = table.search("a").first.text.strip
|
352
|
+
ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
|
353
|
+
end
|
354
|
+
ads
|
355
|
+
end
|
356
|
+
def ads_bottom
|
357
|
+
ads = {}
|
358
|
+
@page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
|
359
|
+
id = table['id']
|
360
|
+
next unless id.nil?
|
361
|
+
id = id[-1,1]
|
362
|
+
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
363
|
+
title = table.search("a").first.text.strip
|
364
|
+
ads[id]= {'title'=>title,'href' => href,'host'=>href}
|
365
|
+
end
|
366
|
+
ads
|
367
|
+
end
|
368
|
+
def ads_right
|
369
|
+
ads = {}
|
370
|
+
@page.search("//div[@id='ec_im_container']").each do |table|
|
371
|
+
table.search("div[@id]").each do |div|
|
372
|
+
id = div['id'][-1,1].to_i+1
|
373
|
+
title = div.search("a").first
|
374
|
+
next if title.nil?
|
375
|
+
title = title.text
|
376
|
+
url = div.search("font[@color='#008000']").first
|
377
|
+
next if url.nil?
|
378
|
+
url = url.text
|
379
|
+
ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
|
380
|
+
end
|
381
|
+
end
|
382
|
+
ads
|
383
|
+
end
|
384
|
+
|
336
385
|
#return the top rank number from @ranks with the input host
|
337
386
|
# def rank(host)#on base of ranks
|
338
387
|
# ranks.each do |id,line|
|
@@ -353,13 +402,19 @@ class BaiduResult < SearchResult
|
|
353
402
|
def related_keywords
|
354
403
|
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
355
404
|
end
|
356
|
-
|
405
|
+
|
357
406
|
def next
|
358
|
-
|
407
|
+
url = @page.xpath('//a[text()="下一页>"]').first
|
408
|
+
return if url.nil?
|
409
|
+
url = url['href']
|
410
|
+
url = URI.join(@baseuri,url).to_s
|
411
|
+
body = HTTParty.get(url)
|
412
|
+
return BaiduResult.new(body,url,@pagenumber+1)
|
413
|
+
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
359
414
|
end
|
360
415
|
|
361
416
|
def has_result?
|
362
417
|
@page.search('//div[@class="nors"]').empty?
|
363
418
|
end
|
364
|
-
|
419
|
+
|
365
420
|
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baidu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
5
|
-
prerelease:
|
4
|
+
version: 1.1.5
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- seoaqua
|
@@ -10,8 +9,50 @@ autorequire:
|
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
11
|
date: 2012-06-13 00:00:00.000000000 Z
|
13
|
-
dependencies:
|
14
|
-
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: addressable
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: httparty
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
|
15
56
|
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
16
57
|
email: seoaqua@qq.com
|
17
58
|
executables: []
|
@@ -21,27 +62,26 @@ files:
|
|
21
62
|
- lib/baidu.rb
|
22
63
|
homepage: https://github.com/seoaqua/ruby-baidu
|
23
64
|
licenses: []
|
65
|
+
metadata: {}
|
24
66
|
post_install_message:
|
25
67
|
rdoc_options: []
|
26
68
|
require_paths:
|
27
69
|
- lib
|
28
70
|
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
-
none: false
|
30
71
|
requirements:
|
31
|
-
- -
|
72
|
+
- - '>='
|
32
73
|
- !ruby/object:Gem::Version
|
33
74
|
version: '0'
|
34
75
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
76
|
requirements:
|
37
|
-
- -
|
77
|
+
- - '>='
|
38
78
|
- !ruby/object:Gem::Version
|
39
79
|
version: '0'
|
40
80
|
requirements: []
|
41
81
|
rubyforge_project:
|
42
|
-
rubygems_version:
|
82
|
+
rubygems_version: 2.0.3
|
43
83
|
signing_key:
|
44
|
-
specification_version:
|
45
|
-
summary: to get keyword ranking,related queries and popularity from baidu.com.
|
46
|
-
is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
84
|
+
specification_version: 4
|
85
|
+
summary: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
|
86
|
+
this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
|
47
87
|
test_files: []
|