baiduserp 2.1.1 → 2.1.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c411628080652da333ee3833dc986e0ede74639c
4
- data.tar.gz: 555ff4417a419c469459883781a2e01fc7e2c5e4
3
+ metadata.gz: 333ba9624a535b422d93a318e5b5db9f50681fab
4
+ data.tar.gz: fcc4bce281984552a81f1ecf138b91146ce3042d
5
5
  SHA512:
6
- metadata.gz: a7aa377b0ad8b77909eb916394554994bb3e5649b215ae68bf8ff5e794c1297313e9fef5464ece6f85db6af0638358d141f2e749d119dcd574b4f6dccfbe8d27
7
- data.tar.gz: 7b1a2a7e8f3c56bd60eef8e148c15aebd913ee27bae78fcf8719923973932f2eda10bd46656818ac10118ab05afd8ebed53eab405d08c9abeb1b5308e1daf7ed
6
+ metadata.gz: ad3edca21e77a528e13d2231737d8d78dd0fa4c221d6107a5e7ea933e0c51ee1e785c5fda5ffb79306e914b687a04dd6aa48ee8a34911474219d4bb6d0b81514
7
+ data.tar.gz: 5702393d3254700fd2ff721363ab68a7a394b7f1e3cae00a54adfbd4bff18a8cf4715f977729a90531c7b29977a9bfe1825980b4e8244d5d366092baa57e5d54
@@ -6,6 +6,10 @@ module Baiduserp
6
6
  Parser.new.search(keyword,page)
7
7
  end
8
8
 
9
+ def self.get_search_html(keyword,page=1)
10
+ Parser.new.get_search_html(keyword,page)
11
+ end
12
+
9
13
  def self.parse(html)
10
14
  Parser.new.parse html
11
15
  end
@@ -23,7 +23,7 @@ module Baiduserp
23
23
  sleep(10)
24
24
  retry
25
25
  end
26
- if response.code == 301
26
+ if response.code != 200
27
27
  sleep(rand(60)+60)
28
28
  response = self.get_serp(url,retries - 1)
29
29
  end
@@ -40,10 +40,16 @@ module Baiduserp
40
40
  end
41
41
 
42
42
  def search(keyword,page=1)
43
+ html = get_search_html(keyword,page)
44
+ parse html
45
+ end
46
+
47
+ def get_search_html(keyword,page=1)
43
48
  keyword = keyword.gsub(" ","+")
44
49
  page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
45
50
  serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
46
- parse_file(serp_url)
51
+ p serp_url
52
+ Client.get_serp(serp_url)
47
53
  end
48
54
 
49
55
  def parse_file(file_path)
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "2.1.1"
2
+ VERSION = "2.1.5"
3
3
  end
@@ -1,29 +1,33 @@
1
1
  class Baiduserp::Parser
2
2
  def _parse_ads_top(file)
3
3
  result = []
4
- file[:doc].search('div.ec_pp_f').each do |div|
4
+ rank = 0
5
+
6
+ file[:doc].search('div#content_left').first.children.each do |div|
5
7
  id = div['id'].to_i
6
- next unless id >= 3000
7
- r = {rank: id}
8
+ break if id > 0 && id < 3000
9
+ next unless div['class'].to_s.include?('ec_pp_f')
10
+ rank += 1
8
11
 
9
- r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
12
+ if div.name == 'div'
13
+ r = {rank: rank, id: id}
10
14
 
11
- r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
15
+ r[:title] = Baiduserp::Helper.get_content_safe(div.search('div.ec_title'))
12
16
 
13
- r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
17
+ r[:content] = Baiduserp::Helper.get_content_safe(div.search('div.ec_desc'))
14
18
 
15
- result << r
16
- end
19
+ r[:site] = Baiduserp::Helper.get_content_safe(div.search('span.ec_url'))
20
+
21
+ result << r
17
22
 
18
- if result.empty?
19
- file[:doc].search('table.ec_pp_f').each_with_index do |table,i|
20
- r = {rank: i + 1}
23
+ else # div.name == 'table'
24
+ r = {rank: rank, id: id}
21
25
 
22
- r[:title] = Baiduserp::Helper.get_content_safe(table.search('td.EC_header/a'))
26
+ r[:title] = Baiduserp::Helper.get_content_safe(div.search('td.EC_header/a'))
23
27
 
24
- r[:content] = Baiduserp::Helper.get_content_safe(table.search('a.EC_desc'))
28
+ r[:content] = Baiduserp::Helper.get_content_safe(div.search('a.EC_desc'))
25
29
 
26
- r[:site] = Baiduserp::Helper.get_content_safe(table.search('a.EC_url'))
30
+ r[:site] = Baiduserp::Helper.get_content_safe(div.search('a.EC_url'))
27
31
 
28
32
  result << r
29
33
 
@@ -5,8 +5,19 @@ class Baiduserp::Parser
5
5
  next if table.nil?
6
6
  id = table['id'].to_i
7
7
  next unless id > 0 && id < 3000
8
+
8
9
  r = {:rank => id}
9
10
 
11
+ r[:result_op] = table['class'].to_s.include?('result-op')
12
+
13
+ r[:fk] = table['fk']
14
+
15
+ r[:srcid] = table['srcid']
16
+
17
+ r[:tpl] = table['tpl']
18
+
19
+ r[:mu] = table['mu']
20
+
10
21
  url = table.search('h3/a').first
11
22
  unless url.nil?
12
23
  url = url['href']
@@ -23,8 +34,6 @@ class Baiduserp::Parser
23
34
 
24
35
  r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))
25
36
 
26
- r[:mu] = table['mu']
27
-
28
37
  table.search('a').each do |link|
29
38
  r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
30
39
  end
@@ -2,7 +2,11 @@ class Baiduserp::Parser
2
2
  def _parse_zhixin(file)
3
3
  result = []
4
4
  file[:doc].search("div#content_left .result-zxl").each do |zxl|
5
- result << {:id => zxl['id'], :tpl => zxl['tpl'], :mu => zxl['mu'] }
5
+ result << {:id => zxl['id'],
6
+ :srcid => zxl['srcid'],
7
+ :fk => zxl['fk'],
8
+ :tpl => zxl['tpl'],
9
+ :mu => zxl['mu'] }
6
10
  end
7
11
  result
8
12
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - MingQian Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-05 00:00:00.000000000 Z
11
+ date: 2013-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri