baiduserp 2.5.2 → 2.5.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89ad01defa5be250a6f32c0e32d2ad9cb64044ac
4
- data.tar.gz: 06d8a6914183c1630f10b036d0f1dacc70f67899
3
+ metadata.gz: 8c607f8765a54bd7ad35fb07d98b99e9fd81c525
4
+ data.tar.gz: e942c9538d47efe080a438401c7dc71e713b184b
5
5
  SHA512:
6
- metadata.gz: 0c52c60cd473b2b7dba88f0c1039311d1a811d6f27690c6c0ceb4350c5e608c995e56d5bc69e94982a43404aaf10dad756749bcad96af2163258f08818a3a3c2
7
- data.tar.gz: f1b21e87c779f24d2b9bd5780e68330071c8f59463d5dfebd369831036f8f909532442092dc656074140540bd5b69c2c6d9e078f2b7347ba698601685b724f75
6
+ metadata.gz: 196029dde69973fd778e821b416a6f2a70de085923f91c6c0d9d951d1be0c23ceddf4780383ae84edb6c545f29c5eaff5ed9ad502ccfcfa89e7ab9b536bd844e
7
+ data.tar.gz: 041858553ce31abf38933f568b385c4ac0adf53add4dba35fd9c66301fe96d5f7aba1d2d1623f48c194b77dd0d41455179d876617bbe9301ee0c8744f340acb0
@@ -61,11 +61,10 @@ module Baiduserp
61
61
  def search(date=Date.today)
62
62
  htmls = model_htmls(date)
63
63
  serps = model_serps(date)
64
- p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count)
64
+ p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count, :format => '%t (%c/%C) %a %E |%w')
65
65
  @keywords.each do |k|
66
66
  htmls.find_or_create(:keyword_id => k[:id]) {|r| r.content = Baiduserp.get_search_html(k[:term]) }
67
67
  serps.find_or_create(:keyword_id => k[:id]) {|r| r.content = YAML.dump(Baiduserp.parse(htmls.where(:keyword_id => k[:id]).first[:content])) }
68
- p.log k.to_hash
69
68
  p.increment
70
69
  end
71
70
  end
@@ -73,21 +72,20 @@ module Baiduserp
73
72
  def regenerate_serps(date=Date.today)
74
73
  htmls = model_htmls(date)
75
74
  serps = model_serps(date)
76
- p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count)
75
+ p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count, :format => '%t (%c/%C) %a %E |%w')
77
76
  htmls.each do |html|
78
77
  keyword_id = html[:keyword_id]
79
78
  html = html[:content]
80
79
  r = serps.find_or_create(:keyword_id => keyword_id)
81
80
  r.update(:content => YAML.dump(Baiduserp.parse(html)))
82
81
 
83
- p.log keyword_id
84
82
  p.increment
85
83
  end
86
84
  end
87
85
 
88
86
  def generate_weights(date=Date.today)
89
87
  serps = model_serps(date)
90
- p = ProgressBar.create(:title => "Generating Weights", :total => serps.count)
88
+ p = ProgressBar.create(:title => "Generating Weights", :total => serps.count, :format => '%t (%c/%C) %a %E |%w')
91
89
  serps.each do |s|
92
90
  keyword_id = s[:keyword_id]
93
91
  serp = YAML.load(s[:content])
@@ -115,7 +113,6 @@ module Baiduserp
115
113
  r.normalized_weight = normalized_weight
116
114
  end
117
115
  end
118
- p.log keyword_id
119
116
  p.increment
120
117
  end
121
118
  end
@@ -57,24 +57,26 @@ module Baiduserp
57
57
  response = self.class.get_serp(url)
58
58
  end
59
59
 
60
- if response.headers['Content-Length'].nil?
61
- response = self.class.get_serp(url,retries)
62
- end
63
-
64
- if response.headers['Content-Length'].to_i != response.body.bytesize
65
- issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
66
- open(issue_file,'w').puts(response.body)
67
- puts "Notice:"
68
- puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
69
- puts "Please see file #{issue_file} for body content."
70
- puts "Sleep 10s and retry"
71
- sleep(10)
72
- response = self.class.get_serp(url)
73
- end
60
+ ## Baidu stopped sending Content-Length in its response headers, so the size checks below are disabled.
61
+ #if response.headers['Content-Length'].nil?
62
+ # puts "Can't read Content-Length from response, retry."
63
+ # response = self.class.get_serp(url,retries-1)
64
+ #end
65
+ #
66
+ #if response.headers['Content-Length'].to_i != response.body.bytesize
67
+ # issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
68
+ # open(issue_file,'w').puts(response.body)
69
+ # puts "Notice:"
70
+ # puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
71
+ # puts "Please see file #{issue_file} for body content."
72
+ # puts "Sleep 10s and retry"
73
+ # sleep(10)
74
+ # response = self.class.get_serp(url)
75
+ #end
74
76
 
75
- return response
77
+ response
76
78
  else
77
- return nil
79
+ nil
78
80
  end
79
81
  end
80
82
 
@@ -48,7 +48,7 @@ module Baiduserp
48
48
  def get_search_html(keyword,page=1)
49
49
  keyword = keyword.gsub(" ","+")
50
50
  page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
51
- serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
51
+ serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8&inputT=#{1000+rand(1000)}")
52
52
  # serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
53
53
  Client.get_serp(serp_url).body
54
54
  end
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "2.5.2"
2
+ VERSION = "2.5.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.2
4
+ version: 2.5.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - MingQian Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-17 00:00:00.000000000 Z
11
+ date: 2014-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri