baiduserp 2.5.2 → 2.5.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/baiduserp/analyser.rb +3 -6
- data/lib/baiduserp/client.rb +18 -16
- data/lib/baiduserp/parser.rb +1 -1
- data/lib/baiduserp/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c607f8765a54bd7ad35fb07d98b99e9fd81c525
|
4
|
+
data.tar.gz: e942c9538d47efe080a438401c7dc71e713b184b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 196029dde69973fd778e821b416a6f2a70de085923f91c6c0d9d951d1be0c23ceddf4780383ae84edb6c545f29c5eaff5ed9ad502ccfcfa89e7ab9b536bd844e
|
7
|
+
data.tar.gz: 041858553ce31abf38933f568b385c4ac0adf53add4dba35fd9c66301fe96d5f7aba1d2d1623f48c194b77dd0d41455179d876617bbe9301ee0c8744f340acb0
|
data/lib/baiduserp/analyser.rb
CHANGED
@@ -61,11 +61,10 @@ module Baiduserp
|
|
61
61
|
def search(date=Date.today)
|
62
62
|
htmls = model_htmls(date)
|
63
63
|
serps = model_serps(date)
|
64
|
-
p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count)
|
64
|
+
p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count, :format => '%t (%c/%C) %a %E |%w')
|
65
65
|
@keywords.each do |k|
|
66
66
|
htmls.find_or_create(:keyword_id => k[:id]) {|r| r.content = Baiduserp.get_search_html(k[:term]) }
|
67
67
|
serps.find_or_create(:keyword_id => k[:id]) {|r| r.content = YAML.dump(Baiduserp.parse(htmls.where(:keyword_id => k[:id]).first[:content])) }
|
68
|
-
p.log k.to_hash
|
69
68
|
p.increment
|
70
69
|
end
|
71
70
|
end
|
@@ -73,21 +72,20 @@ module Baiduserp
|
|
73
72
|
def regenerate_serps(date=Date.today)
|
74
73
|
htmls = model_htmls(date)
|
75
74
|
serps = model_serps(date)
|
76
|
-
p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count)
|
75
|
+
p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count, :format => '%t (%c/%C) %a %E |%w')
|
77
76
|
htmls.each do |html|
|
78
77
|
keyword_id = html[:keyword_id]
|
79
78
|
html = html[:content]
|
80
79
|
r = serps.find_or_create(:keyword_id => keyword_id)
|
81
80
|
r.update(:content => YAML.dump(Baiduserp.parse(html)))
|
82
81
|
|
83
|
-
p.log keyword_id
|
84
82
|
p.increment
|
85
83
|
end
|
86
84
|
end
|
87
85
|
|
88
86
|
def generate_weights(date=Date.today)
|
89
87
|
serps = model_serps(date)
|
90
|
-
p = ProgressBar.create(:title => "Generating Weights", :total => serps.count)
|
88
|
+
p = ProgressBar.create(:title => "Generating Weights", :total => serps.count, :format => '%t (%c/%C) %a %E |%w')
|
91
89
|
serps.each do |s|
|
92
90
|
keyword_id = s[:keyword_id]
|
93
91
|
serp = YAML.load(s[:content])
|
@@ -115,7 +113,6 @@ module Baiduserp
|
|
115
113
|
r.normalized_weight = normalized_weight
|
116
114
|
end
|
117
115
|
end
|
118
|
-
p.log keyword_id
|
119
116
|
p.increment
|
120
117
|
end
|
121
118
|
end
|
data/lib/baiduserp/client.rb
CHANGED
@@ -57,24 +57,26 @@ module Baiduserp
|
|
57
57
|
response = self.class.get_serp(url)
|
58
58
|
end
|
59
59
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
60
|
+
##Baidu Stopped response Content-Length in headers...
|
61
|
+
#if response.headers['Content-Length'].nil?
|
62
|
+
# puts "Can't read Content-Length from response, retry."
|
63
|
+
# response = self.class.get_serp(url,retries-1)
|
64
|
+
#end
|
65
|
+
#
|
66
|
+
#if response.headers['Content-Length'].to_i != response.body.bytesize
|
67
|
+
# issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
|
68
|
+
# open(issue_file,'w').puts(response.body)
|
69
|
+
# puts "Notice:"
|
70
|
+
# puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
|
71
|
+
# puts "Please see file #{issue_file} for body content."
|
72
|
+
# puts "Sleep 10s and retry"
|
73
|
+
# sleep(10)
|
74
|
+
# response = self.class.get_serp(url)
|
75
|
+
#end
|
74
76
|
|
75
|
-
|
77
|
+
response
|
76
78
|
else
|
77
|
-
|
79
|
+
nil
|
78
80
|
end
|
79
81
|
end
|
80
82
|
|
data/lib/baiduserp/parser.rb
CHANGED
@@ -48,7 +48,7 @@ module Baiduserp
|
|
48
48
|
def get_search_html(keyword,page=1)
|
49
49
|
keyword = keyword.gsub(" ","+")
|
50
50
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
51
|
-
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&
|
51
|
+
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8&inputT=#{1000+rand(1000)}")
|
52
52
|
# serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
53
53
|
Client.get_serp(serp_url).body
|
54
54
|
end
|
data/lib/baiduserp/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.2
|
4
|
+
version: 2.5.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|