baiduserp 2.5.2 → 2.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/baiduserp/analyser.rb +3 -6
- data/lib/baiduserp/client.rb +18 -16
- data/lib/baiduserp/parser.rb +1 -1
- data/lib/baiduserp/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c607f8765a54bd7ad35fb07d98b99e9fd81c525
|
4
|
+
data.tar.gz: e942c9538d47efe080a438401c7dc71e713b184b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 196029dde69973fd778e821b416a6f2a70de085923f91c6c0d9d951d1be0c23ceddf4780383ae84edb6c545f29c5eaff5ed9ad502ccfcfa89e7ab9b536bd844e
|
7
|
+
data.tar.gz: 041858553ce31abf38933f568b385c4ac0adf53add4dba35fd9c66301fe96d5f7aba1d2d1623f48c194b77dd0d41455179d876617bbe9301ee0c8744f340acb0
|
data/lib/baiduserp/analyser.rb
CHANGED
@@ -61,11 +61,10 @@ module Baiduserp
|
|
61
61
|
def search(date=Date.today)
|
62
62
|
htmls = model_htmls(date)
|
63
63
|
serps = model_serps(date)
|
64
|
-
p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count)
|
64
|
+
p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count, :format => '%t (%c/%C) %a %E |%w')
|
65
65
|
@keywords.each do |k|
|
66
66
|
htmls.find_or_create(:keyword_id => k[:id]) {|r| r.content = Baiduserp.get_search_html(k[:term]) }
|
67
67
|
serps.find_or_create(:keyword_id => k[:id]) {|r| r.content = YAML.dump(Baiduserp.parse(htmls.where(:keyword_id => k[:id]).first[:content])) }
|
68
|
-
p.log k.to_hash
|
69
68
|
p.increment
|
70
69
|
end
|
71
70
|
end
|
@@ -73,21 +72,20 @@ module Baiduserp
|
|
73
72
|
def regenerate_serps(date=Date.today)
|
74
73
|
htmls = model_htmls(date)
|
75
74
|
serps = model_serps(date)
|
76
|
-
p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count)
|
75
|
+
p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count, :format => '%t (%c/%C) %a %E |%w')
|
77
76
|
htmls.each do |html|
|
78
77
|
keyword_id = html[:keyword_id]
|
79
78
|
html = html[:content]
|
80
79
|
r = serps.find_or_create(:keyword_id => keyword_id)
|
81
80
|
r.update(:content => YAML.dump(Baiduserp.parse(html)))
|
82
81
|
|
83
|
-
p.log keyword_id
|
84
82
|
p.increment
|
85
83
|
end
|
86
84
|
end
|
87
85
|
|
88
86
|
def generate_weights(date=Date.today)
|
89
87
|
serps = model_serps(date)
|
90
|
-
p = ProgressBar.create(:title => "Generating Weights", :total => serps.count)
|
88
|
+
p = ProgressBar.create(:title => "Generating Weights", :total => serps.count, :format => '%t (%c/%C) %a %E |%w')
|
91
89
|
serps.each do |s|
|
92
90
|
keyword_id = s[:keyword_id]
|
93
91
|
serp = YAML.load(s[:content])
|
@@ -115,7 +113,6 @@ module Baiduserp
|
|
115
113
|
r.normalized_weight = normalized_weight
|
116
114
|
end
|
117
115
|
end
|
118
|
-
p.log keyword_id
|
119
116
|
p.increment
|
120
117
|
end
|
121
118
|
end
|
data/lib/baiduserp/client.rb
CHANGED
@@ -57,24 +57,26 @@ module Baiduserp
|
|
57
57
|
response = self.class.get_serp(url)
|
58
58
|
end
|
59
59
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
60
|
+
##Baidu Stopped response Content-Length in headers...
|
61
|
+
#if response.headers['Content-Length'].nil?
|
62
|
+
# puts "Can't read Content-Length from response, retry."
|
63
|
+
# response = self.class.get_serp(url,retries-1)
|
64
|
+
#end
|
65
|
+
#
|
66
|
+
#if response.headers['Content-Length'].to_i != response.body.bytesize
|
67
|
+
# issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
|
68
|
+
# open(issue_file,'w').puts(response.body)
|
69
|
+
# puts "Notice:"
|
70
|
+
# puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
|
71
|
+
# puts "Please see file #{issue_file} for body content."
|
72
|
+
# puts "Sleep 10s and retry"
|
73
|
+
# sleep(10)
|
74
|
+
# response = self.class.get_serp(url)
|
75
|
+
#end
|
74
76
|
|
75
|
-
|
77
|
+
response
|
76
78
|
else
|
77
|
-
|
79
|
+
nil
|
78
80
|
end
|
79
81
|
end
|
80
82
|
|
data/lib/baiduserp/parser.rb
CHANGED
@@ -48,7 +48,7 @@ module Baiduserp
|
|
48
48
|
def get_search_html(keyword,page=1)
|
49
49
|
keyword = keyword.gsub(" ","+")
|
50
50
|
page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
|
51
|
-
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&
|
51
|
+
serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8&inputT=#{1000+rand(1000)}")
|
52
52
|
# serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
|
53
53
|
Client.get_serp(serp_url).body
|
54
54
|
end
|
data/lib/baiduserp/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.2
|
4
|
+
version: 2.5.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- MingQian Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|