baiduserp 2.1.6 → 2.1.14

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f9ab6771775f8898485a11610cbb943f18c2104f
-  data.tar.gz: dcc6a0c371330b1cc9fffcd675b5f54f185a02bf
+  metadata.gz: d5fe173a0b067ff22f37a68ab8cd4d633b556630
+  data.tar.gz: e42a0a457467435480fd7cf7e82dfbfac88b9691
 SHA512:
-  metadata.gz: 8c3589e23d461f3629dd9419d3b595f17c6cab74262bd9764f105d719dc3be39c25f9bae3ad0be55a2dd820cac68eac5fb1000dbe187f2b2b3c98cf57dc5470b
-  data.tar.gz: bb1609eaffcf0f201f52956c6883927127383d8f19ab41edbd90fe6204bebbbd3a97ef6a2ce4908ab6fea07dcaf4c0091e69c4baa41f47addc205a7dba9b3aa5
+  metadata.gz: c4d75fa5fb5429aaa2e9a0293203dd624b4724aab1a1a7e5f3b39dba74b40a9c2bcbd82ec21c6318bc8f8955fa70a7d8a17c7dfef57049ba4e8e8f8366d6ab55
+  data.tar.gz: 90f9f95d9f822d278f0c755ce0f4048b6201f089f999ebf9b3976981d504e35b3f1b09ddf5d85a698e002f56fd30f769689f0e8d56c737f58d815a31a53156ca
@@ -11,23 +11,48 @@ module Baiduserp
   include HTTParty
   base_uri 'www.baidu.com'
   follow_redirects false
-  headers "User-Agent" => self.rand_ua
+  headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'
 
-  def self.get_serp(url, retries = 6)
+  def self.get_serp(url, retries = 3)
+    self.new.get_serp(url, retries)
+  end
+
+  def get_serp(url, retries = 3)
     if retries > 0
       begin
-        response = self.get(url)
-      rescue Timeout::Error => e
+        response = self.class.get(url)
+      rescue StandardError => e
         puts e.class
         puts e.message
         sleep(10)
         retry
       end
+
       if response.code != 200
-        sleep(rand(60)+60)
-        response = self.get_serp(url, retries - 1)
+        puts response
+        puts "Retry on URL: #{url}"
+        sleep(rand(60)+1200)
+        response = self.class.get_serp(url, retries - 1)
+      end
+
+      if response.nil?
+        puts "Still failing after 3 tries, sleeping 3600s now."
+        sleep(3600)
+        response = self.class.get_serp(url)
       end
-      return response.body
+
+      if response.headers['Content-Length'].to_i != response.body.bytesize
+        issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
+        open(issue_file, 'w').puts(response.body)
+        puts "Notice:"
+        puts "Baiduserp hit an error while crawling the SERP: response Content-Length (#{response.headers['Content-Length']}) does not match the body size."
+        puts "Please see file #{issue_file} for the body content."
+        puts "Sleeping 10s before retrying."
+        sleep(10)
+        response = self.class.get_serp(url)
+      end
+
+      return response
     else
       return nil
     end
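Since Client#get_serp now returns the whole HTTParty::Response instead of just the body, callers pick out what they need themselves. A minimal usage sketch, not part of the gem's source (the query URL is only an example):

# Usage sketch: get_serp now hands back the HTTParty::Response object.
require 'baiduserp'

response = Baiduserp::Client.get_serp('http://www.baidu.com/s?wd=ruby&ie=utf-8')
response.code                       # HTTP status checked by the retry loop above
response.headers['Content-Length']  # compared against response.body.bytesize above
html = response.body                # what 2.1.6 returned directly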
@@ -8,7 +8,14 @@ module Baiduserp
     noko.first.content.strip
   end
 
-
+  def parse_data_click(str)
+    JSON.parse(str
+      .gsub("'", '"')
+      .gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
+      # .gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
+      # .gsub(/:'([^(',\")]*)'(,|})/, ':"\1"\2')
+    )
+  end
   end
 end
 end
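The new parse_data_click helper turns Baidu's JavaScript-style data-click attribute text into a Hash by rewriting it into valid JSON before JSON.parse. A small illustration with an invented attribute value:

# Illustration only; the raw value below is invented.
raw = "{'fm':'beha',p1:2,'title':''}"

# The first gsub swaps single quotes for double quotes; the second quotes bare keys such as p1.
Baiduserp::Helper.parse_data_click(raw)
# => {"fm"=>"beha", "p1"=>2, "title"=>""}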
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 require 'nokogiri'
 require 'uri'
+require 'json'
 require 'baiduserp/client'
 require 'baiduserp/helper'
 require 'baiduserp/result'
@@ -47,15 +48,16 @@ module Baiduserp
   def get_search_html(keyword, page = 1)
     keyword = keyword.gsub(" ", "+")
     page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
-    serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
-    Client.get_serp(serp_url)
+    serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
+    # serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
+    Client.get_serp(serp_url).body
   end
 
   def parse_file(file_path)
     if File.exists? file_path
       html = open(file_path).read
     else
-      html = Client.get_serp(file_path)
+      html = Client.get_serp(file_path).body
     end
     parse html
   end
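Baidu's pn parameter counts results rather than pages, so get_search_html builds it as page minus one with a literal 0 appended. A hypothetical helper (not part of the gem) showing the same ternary in isolation:

# Hypothetical helper mirroring the ternary in get_search_html.
def pn_param(page)
  page.to_i > 1 ? "&pn=#{page.to_i - 1}0" : ""
end

pn_param(1) # => ""        (the first page needs no pn)
pn_param(2) # => "&pn=10"
pn_param(5) # => "&pn=40"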
@@ -1,3 +1,5 @@
+require 'domainatrix'
+
 module Baiduserp
   class Result < Hash
     def seo_urls
@@ -9,19 +11,23 @@ module Baiduserp
       self[:ranks].each do |rank|
         url = rank[:url].to_s
         next if url.empty?
-        result << URI(URI.escape(rank[:url])).host.downcase
+        result << Addressable::URI.parse(rank[:url]).host
       end
       result
     end
 
-    def sem_sites
+    def sem_urls
       result = []
       (self[:ads_top] + self[:ads_right]).each do |ad|
         site = ad[:site].to_s
         next if site.empty?
-        result << ad[:site].downcase
+        result << ad[:site]
       end
       result
     end
+
+    def sem_sites
+      sem_urls
+    end
   end
 end
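sem_urls now collects each ad's :site value as-is (no longer downcased), and sem_sites is kept as a backwards-compatible alias. A sketch with invented ad data:

# Sketch with invented ad data; Result is a Hash of parsed SERP sections.
result = Baiduserp::Result.new
result[:ads_top]   = [{ :site => 'example.com/landing' }]
result[:ads_right] = [{ :site => 'shop.example.cn' }]

result.sem_urls  # => ["example.com/landing", "shop.example.cn"]
result.sem_sites # => the same array, via the new alias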
@@ -1,3 +1,3 @@
 module Baiduserp
-  VERSION = "2.1.6"
+  VERSION = "2.1.14"
 end
@@ -0,0 +1,14 @@
+class Baiduserp::Parser
+  def _parse_con_ar(file)
+    result = []
+    divs = file[:doc].search("div#content_right div#con-ar").first
+    return [] if divs.nil?
+    divs.children.each do |div|
+      next unless div['class'].to_s.include?('result-op')
+      result << { :tpl => div['tpl'],
+                  :data_click => Baiduserp::Helper.parse_data_click(div['data-click'])
+                }
+    end
+    result
+  end
+end
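The new con-ar parser walks the right-column con-ar container, keeps only result-op blocks, and returns each block's tpl together with its decoded data-click payload. A sketch against simplified, invented markup; it assumes Baiduserp::Parser can be instantiated directly, which the diff itself does not show:

# Sketch only: invented markup, and Parser.new used standalone for illustration.
require 'nokogiri'
require 'baiduserp'

html = <<-HTML
<div id="content_right"><div id="con-ar">
  <div class="result-op" tpl="right_recommends" data-click="{'fm':'beha'}"></div>
  <div class="not-a-result"></div>
</div></div>
HTML

file = { :doc => Nokogiri::HTML(html) }
Baiduserp::Parser.new._parse_con_ar(file)
# => [{:tpl=>"right_recommends", :data_click=>{"fm"=>"beha"}}]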
@@ -6,7 +6,9 @@ class Baiduserp::Parser
         :srcid => zxl['srcid'],
         :fk => zxl['fk'],
         :tpl => zxl['tpl'],
-        :mu => zxl['mu'] }
+        :mu => zxl['mu'],
+        :data_click => Baiduserp::Helper.parse_data_click(zxl['data-click'])
+      }
     end
     result
   end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: baiduserp
 version: !ruby/object:Gem::Version
-  version: 2.1.6
+  version: 2.1.14
 platform: ruby
 authors:
 - MingQian Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-07 00:00:00.000000000 Z
+date: 2013-11-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -38,6 +38,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: domainatrix
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Parse Baidu SERP result page.
 email:
 - zmingqian@qq.com
@@ -54,6 +68,7 @@ files:
 - lib/baiduserp.rb
 - lib/parsers/ads_right.rb
 - lib/parsers/ads_top.rb
+- lib/parsers/con_ar.rb
 - lib/parsers/pinpaizhuanqu.rb
 - lib/parsers/ranks.rb
 - lib/parsers/related_keywords.rb