baiduserp 2.1.6 → 2.1.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/baiduserp/client.rb +32 -7
- data/lib/baiduserp/helper.rb +8 -1
- data/lib/baiduserp/parser.rb +5 -3
- data/lib/baiduserp/result.rb +9 -3
- data/lib/baiduserp/version.rb +1 -1
- data/lib/parsers/con_ar.rb +14 -0
- data/lib/parsers/zhixin.rb +3 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5fe173a0b067ff22f37a68ab8cd4d633b556630
|
4
|
+
data.tar.gz: e42a0a457467435480fd7cf7e82dfbfac88b9691
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4d75fa5fb5429aaa2e9a0293203dd624b4724aab1a1a7e5f3b39dba74b40a9c2bcbd82ec21c6318bc8f8955fa70a7d8a17c7dfef57049ba4e8e8f8366d6ab55
|
7
|
+
data.tar.gz: 90f9f95d9f822d278f0c755ce0f4048b6201f089f999ebf9b3976981d504e35b3f1b09ddf5d85a698e002f56fd30f769689f0e8d56c737f58d815a31a53156ca
|
data/lib/baiduserp/client.rb
CHANGED
@@ -11,23 +11,48 @@ module Baiduserp
|
|
11
11
|
include HTTParty
|
12
12
|
base_uri 'www.baidu.com'
|
13
13
|
follow_redirects false
|
14
|
-
headers "User-Agent" => self.rand_ua
|
14
|
+
headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'
|
15
15
|
|
16
|
-
def self.get_serp(url,
|
16
|
+
def self.get_serp(url,retries = 3)
|
17
|
+
self.new.get_serp(url,retries)
|
18
|
+
end
|
19
|
+
|
20
|
+
def get_serp(url, retries = 3)
|
17
21
|
if retries > 0
|
18
22
|
begin
|
19
|
-
response = self.get(url)
|
20
|
-
rescue
|
23
|
+
response = self.class.get(url)
|
24
|
+
rescue StandardError => e
|
21
25
|
puts e.class
|
22
26
|
puts e.message
|
23
27
|
sleep(10)
|
24
28
|
retry
|
25
29
|
end
|
30
|
+
|
26
31
|
if response.code != 200
|
27
|
-
|
28
|
-
|
32
|
+
puts response
|
33
|
+
puts "Retry on URL: #{url}"
|
34
|
+
sleep(rand(60)+1200)
|
35
|
+
response = self.class.get_serp(url,retries - 1)
|
36
|
+
end
|
37
|
+
|
38
|
+
if response.nil?
|
39
|
+
puts "Still error after 3 tries, sleep 3600s now."
|
40
|
+
sleep(3600)
|
41
|
+
response = self.class.get_serp(url)
|
29
42
|
end
|
30
|
-
|
43
|
+
|
44
|
+
if response.headers['Content-Length'].to_i != response.body.bytesize
|
45
|
+
issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
|
46
|
+
open(issue_file,'w').puts(response.body)
|
47
|
+
puts "Notice:"
|
48
|
+
puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
|
49
|
+
puts "Please see file #{issue_file} for body content."
|
50
|
+
puts "Sleep 10s and retry"
|
51
|
+
sleep(10)
|
52
|
+
response = self.class.get_serp(url)
|
53
|
+
end
|
54
|
+
|
55
|
+
return response
|
31
56
|
else
|
32
57
|
return nil
|
33
58
|
end
|
data/lib/baiduserp/helper.rb
CHANGED
@@ -8,7 +8,14 @@ module Baiduserp
|
|
8
8
|
noko.first.content.strip
|
9
9
|
end
|
10
10
|
|
11
|
-
|
11
|
+
def parse_data_click(str)
|
12
|
+
JSON.parse(str
|
13
|
+
.gsub("'",'"')
|
14
|
+
.gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
|
15
|
+
#.gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
|
16
|
+
#.gsub(/:'([^(',\")]*)'(,|})/,':"\1"\2')
|
17
|
+
)
|
18
|
+
end
|
12
19
|
end
|
13
20
|
end
|
14
21
|
end
|
data/lib/baiduserp/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
+
require 'json'
|
4
5
|
require 'baiduserp/client'
|
5
6
|
require 'baiduserp/helper'
|
6
7
|
require 'baiduserp/result'
|
@@ -47,15 +48,16 @@ module Baiduserp
|
|
47
48
|
def get_search_html(keyword,page=1)
|
48
49
|
keyword = keyword.gsub(" ","+")
|
49
50
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
50
|
-
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
51
|
-
|
51
|
+
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
|
52
|
+
# serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
53
|
+
Client.get_serp(serp_url).body
|
52
54
|
end
|
53
55
|
|
54
56
|
def parse_file(file_path)
|
55
57
|
if File.exists? file_path
|
56
58
|
html = open(file_path).read
|
57
59
|
else
|
58
|
-
html = Client.get_serp(file_path)
|
60
|
+
html = Client.get_serp(file_path).body
|
59
61
|
end
|
60
62
|
parse html
|
61
63
|
end
|
data/lib/baiduserp/result.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'domainatrix'
|
2
|
+
|
1
3
|
module Baiduserp
|
2
4
|
class Result < Hash
|
3
5
|
def seo_urls
|
@@ -9,19 +11,23 @@ module Baiduserp
|
|
9
11
|
self[:ranks].each do |rank|
|
10
12
|
url = rank[:url].to_s
|
11
13
|
next if url.empty?
|
12
|
-
result << URI
|
14
|
+
result << Addressable::URI.parse(rank[:url]).host
|
13
15
|
end
|
14
16
|
result
|
15
17
|
end
|
16
18
|
|
17
|
-
def
|
19
|
+
def sem_urls
|
18
20
|
result = []
|
19
21
|
(self[:ads_top] + self[:ads_right]).each do |ad|
|
20
22
|
site = ad[:site].to_s
|
21
23
|
next if site.empty?
|
22
|
-
result << ad[:site]
|
24
|
+
result << ad[:site]
|
23
25
|
end
|
24
26
|
result
|
25
27
|
end
|
28
|
+
|
29
|
+
def sem_sites
|
30
|
+
sem_urls
|
31
|
+
end
|
26
32
|
end
|
27
33
|
end
|
data/lib/baiduserp/version.rb
CHANGED
data/lib/parsers/con_ar.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
class Baiduserp::Parser
|
2
|
+
def _parse_con_ar(file)
|
3
|
+
result = []
|
4
|
+
divs = file[:doc].search("div#content_right div#con-ar").first
|
5
|
+
return [] if divs.nil?
|
6
|
+
divs.children.each do |div|
|
7
|
+
next unless div['class'].to_s.include?('result-op')
|
8
|
+
result << {:tpl => div['tpl'],
|
9
|
+
:data_click => Baiduserp::Helper.parse_data_click(div['data-click'])
|
10
|
+
}
|
11
|
+
end
|
12
|
+
result
|
13
|
+
end
|
14
|
+
end
|
data/lib/parsers/zhixin.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.6
|
4
|
+
version: 2.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: domainatrix
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Parse Baidu SERP result page.
|
42
56
|
email:
|
43
57
|
- zmingqian@qq.com
|
@@ -54,6 +68,7 @@ files:
|
|
54
68
|
- lib/baiduserp.rb
|
55
69
|
- lib/parsers/ads_right.rb
|
56
70
|
- lib/parsers/ads_top.rb
|
71
|
+
- lib/parsers/con_ar.rb
|
57
72
|
- lib/parsers/pinpaizhuanqu.rb
|
58
73
|
- lib/parsers/ranks.rb
|
59
74
|
- lib/parsers/related_keywords.rb
|