baiduserp 2.1.1 → 2.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c411628080652da333ee3833dc986e0ede74639c
4
- data.tar.gz: 555ff4417a419c469459883781a2e01fc7e2c5e4
3
+ metadata.gz: 333ba9624a535b422d93a318e5b5db9f50681fab
4
+ data.tar.gz: fcc4bce281984552a81f1ecf138b91146ce3042d
5
5
  SHA512:
6
- metadata.gz: a7aa377b0ad8b77909eb916394554994bb3e5649b215ae68bf8ff5e794c1297313e9fef5464ece6f85db6af0638358d141f2e749d119dcd574b4f6dccfbe8d27
7
- data.tar.gz: 7b1a2a7e8f3c56bd60eef8e148c15aebd913ee27bae78fcf8719923973932f2eda10bd46656818ac10118ab05afd8ebed53eab405d08c9abeb1b5308e1daf7ed
6
+ metadata.gz: ad3edca21e77a528e13d2231737d8d78dd0fa4c221d6107a5e7ea933e0c51ee1e785c5fda5ffb79306e914b687a04dd6aa48ee8a34911474219d4bb6d0b81514
7
+ data.tar.gz: 5702393d3254700fd2ff721363ab68a7a394b7f1e3cae00a54adfbd4bff18a8cf4715f977729a90531c7b29977a9bfe1825980b4e8244d5d366092baa57e5d54
@@ -6,6 +6,10 @@ module Baiduserp
6
6
  Parser.new.search(keyword,page)
7
7
  end
8
8
 
9
+ def self.get_search_html(keyword,page=1)
10
+ Parser.new.get_search_html(keyword,page)
11
+ end
12
+
9
13
  def self.parse(html)
10
14
  Parser.new.parse html
11
15
  end
@@ -23,7 +23,7 @@ module Baiduserp
23
23
  sleep(10)
24
24
  retry
25
25
  end
26
- if response.code == 301
26
+ if response.code != 200
27
27
  sleep(rand(60)+60)
28
28
  response = self.get_serp(url,retries - 1)
29
29
  end
@@ -40,10 +40,16 @@ module Baiduserp
40
40
  end
41
41
 
42
42
  def search(keyword,page=1)
43
+ html = get_search_html(keyword,page)
44
+ parse html
45
+ end
46
+
47
+ def get_search_html(keyword,page=1)
43
48
  keyword = keyword.gsub(" ","+")
44
49
  page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
45
50
  serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
46
- parse_file(serp_url)
51
+ p serp_url
52
+ Client.get_serp(serp_url)
47
53
  end
48
54
 
49
55
  def parse_file(file_path)
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "2.1.1"
2
+ VERSION = "2.1.5"
3
3
  end
@@ -1,29 +1,33 @@
1
1
  class Baiduserp::Parser
2
2
  def _parse_ads_top(file)
3
3
  result = []
4
- file[:doc].search('div.ec_pp_f').each do |div|
4
+ rank = 0
5
+
6
+ file[:doc].search('div#content_left').first.children.each do |div|
5
7
  id = div['id'].to_i
6
- next unless id >= 3000
7
- r = {rank: id}
8
+ break if id > 0 && id < 3000
9
+ next unless div['class'].to_s.include?('ec_pp_f')
10
+ rank += 1
8
11
 
9
- r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
12
+ if div.name == 'div'
13
+ r = {rank: rank, id: id}
10
14
 
11
- r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
15
+ r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
12
16
 
13
- r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
17
+ r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
14
18
 
15
- result << r
16
- end
19
+ r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
20
+
21
+ result << r
17
22
 
18
- if result.empty?
19
- file[:doc].search('table.ec_pp_f').each_with_index do |table,i|
20
- r = {rank: i + 1}
23
+ else # div.name == 'table'
24
+ r = {rank: rank, id: id}
21
25
 
22
- r[:title] = Baiduserp::Helper.get_content_safe(table.search('td.EC_header/a'))
26
+ r[:title] = Baiduserp::Helper.get_content_safe(div.search('td.EC_header/a'))
23
27
 
24
- r[:content] = Baiduserp::Helper.get_content_safe(table.search('a.EC_desc'))
28
+ r[:content] = Baiduserp::Helper.get_content_safe(div.search('a.EC_desc'))
25
29
 
26
- r[:site] = Baiduserp::Helper.get_content_safe(table.search('a.EC_url'))
30
+ r[:site] = Baiduserp::Helper.get_content_safe(div.search('a.EC_url'))
27
31
 
28
32
  result << r
29
33
 
@@ -5,8 +5,19 @@ class Baiduserp::Parser
5
5
  next if table.nil?
6
6
  id = table['id'].to_i
7
7
  next unless id > 0 && id < 3000
8
+
8
9
  r = {:rank => id}
9
10
 
11
+ r[:result_op] = table['class'].to_s.include?('result-op')
12
+
13
+ r[:fk] = table['fk']
14
+
15
+ r[:srcid] = table['srcid']
16
+
17
+ r[:tpl] = table['tpl']
18
+
19
+ r[:mu] = table['mu']
20
+
10
21
  url = table.search('h3/a').first
11
22
  unless url.nil?
12
23
  url = url['href']
@@ -23,8 +34,6 @@ class Baiduserp::Parser
23
34
 
24
35
  r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))
25
36
 
26
- r[:mu] = table['mu']
27
-
28
37
  table.search('a').each do |link|
29
38
  r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
30
39
  end
@@ -2,7 +2,11 @@ class Baiduserp::Parser
2
2
  def _parse_zhixin(file)
3
3
  result = []
4
4
  file[:doc].search("div#content_left .result-zxl").each do |zxl|
5
- result << {:id => zxl['id'], :tpl => zxl['tpl'], :mu => zxl['mu'] }
5
+ result << {:id => zxl['id'],
6
+ :srcid => zxl['srcid'],
7
+ :fk => zxl['fk'],
8
+ :tpl => zxl['tpl'],
9
+ :mu => zxl['mu'] }
6
10
  end
7
11
  result
8
12
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - MingQian Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-05 00:00:00.000000000 Z
11
+ date: 2013-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri