baiduserp 2.1.6 → 2.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/baiduserp/client.rb +32 -7
- data/lib/baiduserp/helper.rb +8 -1
- data/lib/baiduserp/parser.rb +5 -3
- data/lib/baiduserp/result.rb +9 -3
- data/lib/baiduserp/version.rb +1 -1
- data/lib/parsers/con_ar.rb +14 -0
- data/lib/parsers/zhixin.rb +3 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5fe173a0b067ff22f37a68ab8cd4d633b556630
|
4
|
+
data.tar.gz: e42a0a457467435480fd7cf7e82dfbfac88b9691
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4d75fa5fb5429aaa2e9a0293203dd624b4724aab1a1a7e5f3b39dba74b40a9c2bcbd82ec21c6318bc8f8955fa70a7d8a17c7dfef57049ba4e8e8f8366d6ab55
|
7
|
+
data.tar.gz: 90f9f95d9f822d278f0c755ce0f4048b6201f089f999ebf9b3976981d504e35b3f1b09ddf5d85a698e002f56fd30f769689f0e8d56c737f58d815a31a53156ca
|
data/lib/baiduserp/client.rb
CHANGED
@@ -11,23 +11,48 @@ module Baiduserp
|
|
11
11
|
include HTTParty
|
12
12
|
base_uri 'www.baidu.com'
|
13
13
|
follow_redirects false
|
14
|
-
headers "User-Agent" => self.rand_ua
|
14
|
+
headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'
|
15
15
|
|
16
|
-
def self.get_serp(url,
|
16
|
+
def self.get_serp(url,retries = 3)
|
17
|
+
self.new.get_serp(url,retries)
|
18
|
+
end
|
19
|
+
|
20
|
+
def get_serp(url, retries = 3)
|
17
21
|
if retries > 0
|
18
22
|
begin
|
19
|
-
response = self.get(url)
|
20
|
-
rescue
|
23
|
+
response = self.class.get(url)
|
24
|
+
rescue StandardError => e
|
21
25
|
puts e.class
|
22
26
|
puts e.message
|
23
27
|
sleep(10)
|
24
28
|
retry
|
25
29
|
end
|
30
|
+
|
26
31
|
if response.code != 200
|
27
|
-
|
28
|
-
|
32
|
+
puts response
|
33
|
+
puts "Retry on URL: #{url}"
|
34
|
+
sleep(rand(60)+1200)
|
35
|
+
response = self.class.get_serp(url,retries - 1)
|
36
|
+
end
|
37
|
+
|
38
|
+
if response.nil?
|
39
|
+
puts "Still error after 3 tries, sleep 3600s now."
|
40
|
+
sleep(3600)
|
41
|
+
response = self.class.get_serp(url)
|
29
42
|
end
|
30
|
-
|
43
|
+
|
44
|
+
if response.headers['Content-Length'].to_i != response.body.bytesize
|
45
|
+
issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
|
46
|
+
open(issue_file,'w').puts(response.body)
|
47
|
+
puts "Notice:"
|
48
|
+
puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
|
49
|
+
puts "Please see file #{issue_file} for body content."
|
50
|
+
puts "Sleep 10s and retry"
|
51
|
+
sleep(10)
|
52
|
+
response = self.class.get_serp(url)
|
53
|
+
end
|
54
|
+
|
55
|
+
return response
|
31
56
|
else
|
32
57
|
return nil
|
33
58
|
end
|
data/lib/baiduserp/helper.rb
CHANGED
@@ -8,7 +8,14 @@ module Baiduserp
|
|
8
8
|
noko.first.content.strip
|
9
9
|
end
|
10
10
|
|
11
|
-
|
11
|
+
def parse_data_click(str)
|
12
|
+
JSON.parse(str
|
13
|
+
.gsub("'",'"')
|
14
|
+
.gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
|
15
|
+
#.gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
|
16
|
+
#.gsub(/:'([^(',\")]*)'(,|})/,':"\1"\2')
|
17
|
+
)
|
18
|
+
end
|
12
19
|
end
|
13
20
|
end
|
14
21
|
end
|
data/lib/baiduserp/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
+
require 'json'
|
4
5
|
require 'baiduserp/client'
|
5
6
|
require 'baiduserp/helper'
|
6
7
|
require 'baiduserp/result'
|
@@ -47,15 +48,16 @@ module Baiduserp
|
|
47
48
|
def get_search_html(keyword,page=1)
|
48
49
|
keyword = keyword.gsub(" ","+")
|
49
50
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
50
|
-
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
51
|
-
|
51
|
+
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
|
52
|
+
# serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
53
|
+
Client.get_serp(serp_url).body
|
52
54
|
end
|
53
55
|
|
54
56
|
def parse_file(file_path)
|
55
57
|
if File.exists? file_path
|
56
58
|
html = open(file_path).read
|
57
59
|
else
|
58
|
-
html = Client.get_serp(file_path)
|
60
|
+
html = Client.get_serp(file_path).body
|
59
61
|
end
|
60
62
|
parse html
|
61
63
|
end
|
data/lib/baiduserp/result.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'domainatrix'
|
2
|
+
|
1
3
|
module Baiduserp
|
2
4
|
class Result < Hash
|
3
5
|
def seo_urls
|
@@ -9,19 +11,23 @@ module Baiduserp
|
|
9
11
|
self[:ranks].each do |rank|
|
10
12
|
url = rank[:url].to_s
|
11
13
|
next if url.empty?
|
12
|
-
result << URI
|
14
|
+
result << Addressable::URI.parse(rank[:url]).host
|
13
15
|
end
|
14
16
|
result
|
15
17
|
end
|
16
18
|
|
17
|
-
def
|
19
|
+
def sem_urls
|
18
20
|
result = []
|
19
21
|
(self[:ads_top] + self[:ads_right]).each do |ad|
|
20
22
|
site = ad[:site].to_s
|
21
23
|
next if site.empty?
|
22
|
-
result << ad[:site]
|
24
|
+
result << ad[:site]
|
23
25
|
end
|
24
26
|
result
|
25
27
|
end
|
28
|
+
|
29
|
+
def sem_sites
|
30
|
+
sem_urls
|
31
|
+
end
|
26
32
|
end
|
27
33
|
end
|
data/lib/baiduserp/version.rb
CHANGED
data/lib/parsers/con_ar.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
class Baiduserp::Parser
|
2
|
+
def _parse_con_ar(file)
|
3
|
+
result = []
|
4
|
+
divs = file[:doc].search("div#content_right div#con-ar").first
|
5
|
+
return [] if divs.nil?
|
6
|
+
divs.children.each do |div|
|
7
|
+
next unless div['class'].to_s.include?('result-op')
|
8
|
+
result << {:tpl => div['tpl'],
|
9
|
+
:data_click => Baiduserp::Helper.parse_data_click(div['data-click'])
|
10
|
+
}
|
11
|
+
end
|
12
|
+
result
|
13
|
+
end
|
14
|
+
end
|
data/lib/parsers/zhixin.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.6
|
4
|
+
version: 2.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: domainatrix
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Parse Baidu SERP result page.
|
42
56
|
email:
|
43
57
|
- zmingqian@qq.com
|
@@ -54,6 +68,7 @@ files:
|
|
54
68
|
- lib/baiduserp.rb
|
55
69
|
- lib/parsers/ads_right.rb
|
56
70
|
- lib/parsers/ads_top.rb
|
71
|
+
- lib/parsers/con_ar.rb
|
57
72
|
- lib/parsers/pinpaizhuanqu.rb
|
58
73
|
- lib/parsers/ranks.rb
|
59
74
|
- lib/parsers/related_keywords.rb
|