baiduserp 2.1.6 → 2.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f9ab6771775f8898485a11610cbb943f18c2104f
4
- data.tar.gz: dcc6a0c371330b1cc9fffcd675b5f54f185a02bf
3
+ metadata.gz: d5fe173a0b067ff22f37a68ab8cd4d633b556630
4
+ data.tar.gz: e42a0a457467435480fd7cf7e82dfbfac88b9691
5
5
  SHA512:
6
- metadata.gz: 8c3589e23d461f3629dd9419d3b595f17c6cab74262bd9764f105d719dc3be39c25f9bae3ad0be55a2dd820cac68eac5fb1000dbe187f2b2b3c98cf57dc5470b
7
- data.tar.gz: bb1609eaffcf0f201f52956c6883927127383d8f19ab41edbd90fe6204bebbbd3a97ef6a2ce4908ab6fea07dcaf4c0091e69c4baa41f47addc205a7dba9b3aa5
6
+ metadata.gz: c4d75fa5fb5429aaa2e9a0293203dd624b4724aab1a1a7e5f3b39dba74b40a9c2bcbd82ec21c6318bc8f8955fa70a7d8a17c7dfef57049ba4e8e8f8366d6ab55
7
+ data.tar.gz: 90f9f95d9f822d278f0c755ce0f4048b6201f089f999ebf9b3976981d504e35b3f1b09ddf5d85a698e002f56fd30f769689f0e8d56c737f58d815a31a53156ca
@@ -11,23 +11,48 @@ module Baiduserp
11
11
  include HTTParty
12
12
  base_uri 'www.baidu.com'
13
13
  follow_redirects false
14
- headers "User-Agent" => self.rand_ua
14
+ headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'
15
15
 
16
- def self.get_serp(url, retries = 6)
16
+ def self.get_serp(url,retries = 3)
17
+ self.new.get_serp(url,retries)
18
+ end
19
+
20
+ def get_serp(url, retries = 3)
17
21
  if retries > 0
18
22
  begin
19
- response = self.get(url)
20
- rescue Timeout::Error => e
23
+ response = self.class.get(url)
24
+ rescue StandardError => e
21
25
  puts e.class
22
26
  puts e.message
23
27
  sleep(10)
24
28
  retry
25
29
  end
30
+
26
31
  if response.code != 200
27
- sleep(rand(60)+60)
28
- response = self.get_serp(url,retries - 1)
32
+ puts response
33
+ puts "Retry on URL: #{url}"
34
+ sleep(rand(60)+1200)
35
+ response = self.class.get_serp(url,retries - 1)
36
+ end
37
+
38
+ if response.nil?
39
+ puts "Still error after 3 tries, sleep 3600s now."
40
+ sleep(3600)
41
+ response = self.class.get_serp(url)
29
42
  end
30
- return response.body
43
+
44
+ if response.headers['Content-Length'].to_i != response.body.bytesize
45
+ issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
46
+ open(issue_file,'w').puts(response.body)
47
+ puts "Notice:"
48
+ puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
49
+ puts "Please see file #{issue_file} for body content."
50
+ puts "Sleep 10s and retry"
51
+ sleep(10)
52
+ response = self.class.get_serp(url)
53
+ end
54
+
55
+ return response
31
56
  else
32
57
  return nil
33
58
  end
@@ -8,7 +8,14 @@ module Baiduserp
8
8
  noko.first.content.strip
9
9
  end
10
10
 
11
-
11
+ def parse_data_click(str)
12
+ JSON.parse(str
13
+ .gsub("'",'"')
14
+ .gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
15
+ #.gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
16
+ #.gsub(/:'([^(',\")]*)'(,|})/,':"\1"\2')
17
+ )
18
+ end
12
19
  end
13
20
  end
14
21
  end
@@ -1,6 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'nokogiri'
3
3
  require 'uri'
4
+ require 'json'
4
5
  require 'baiduserp/client'
5
6
  require 'baiduserp/helper'
6
7
  require 'baiduserp/result'
@@ -47,15 +48,16 @@ module Baiduserp
47
48
  def get_search_html(keyword,page=1)
48
49
  keyword = keyword.gsub(" ","+")
49
50
  page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
50
- serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
51
- Client.get_serp(serp_url)
51
+ serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
52
+ # serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
53
+ Client.get_serp(serp_url).body
52
54
  end
53
55
 
54
56
  def parse_file(file_path)
55
57
  if File.exists? file_path
56
58
  html = open(file_path).read
57
59
  else
58
- html = Client.get_serp(file_path)
60
+ html = Client.get_serp(file_path).body
59
61
  end
60
62
  parse html
61
63
  end
@@ -1,3 +1,5 @@
1
+ require 'domainatrix'
2
+
1
3
  module Baiduserp
2
4
  class Result < Hash
3
5
  def seo_urls
@@ -9,19 +11,23 @@ module Baiduserp
9
11
  self[:ranks].each do |rank|
10
12
  url = rank[:url].to_s
11
13
  next if url.empty?
12
- result << URI(URI.escape(rank[:url])).host.downcase
14
+ result << Addressable::URI.parse(rank[:url]).host
13
15
  end
14
16
  result
15
17
  end
16
18
 
17
- def sem_sites
19
+ def sem_urls
18
20
  result = []
19
21
  (self[:ads_top] + self[:ads_right]).each do |ad|
20
22
  site = ad[:site].to_s
21
23
  next if site.empty?
22
- result << ad[:site].downcase
24
+ result << ad[:site]
23
25
  end
24
26
  result
25
27
  end
28
+
29
+ def sem_sites
30
+ sem_urls
31
+ end
26
32
  end
27
33
  end
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "2.1.6"
2
+ VERSION = "2.1.14"
3
3
  end
@@ -0,0 +1,14 @@
1
+ class Baiduserp::Parser
2
+ def _parse_con_ar(file)
3
+ result = []
4
+ divs = file[:doc].search("div#content_right div#con-ar").first
5
+ return [] if divs.nil?
6
+ divs.children.each do |div|
7
+ next unless div['class'].to_s.include?('result-op')
8
+ result << {:tpl => div['tpl'],
9
+ :data_click => Baiduserp::Helper.parse_data_click(div['data-click'])
10
+ }
11
+ end
12
+ result
13
+ end
14
+ end
@@ -6,7 +6,9 @@ class Baiduserp::Parser
6
6
  :srcid => zxl['srcid'],
7
7
  :fk => zxl['fk'],
8
8
  :tpl => zxl['tpl'],
9
- :mu => zxl['mu'] }
9
+ :mu => zxl['mu'],
10
+ :data_click => Baiduserp::Helper.parse_data_click(zxl['data-click'])
11
+ }
10
12
  end
11
13
  result
12
14
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.6
4
+ version: 2.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - MingQian Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-07 00:00:00.000000000 Z
11
+ date: 2013-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: domainatrix
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: Parse Baidu SERP result page.
42
56
  email:
43
57
  - zmingqian@qq.com
@@ -54,6 +68,7 @@ files:
54
68
  - lib/baiduserp.rb
55
69
  - lib/parsers/ads_right.rb
56
70
  - lib/parsers/ads_top.rb
71
+ - lib/parsers/con_ar.rb
57
72
  - lib/parsers/pinpaizhuanqu.rb
58
73
  - lib/parsers/ranks.rb
59
74
  - lib/parsers/related_keywords.rb