baiduserp 2.5.2 → 2.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89ad01defa5be250a6f32c0e32d2ad9cb64044ac
4
- data.tar.gz: 06d8a6914183c1630f10b036d0f1dacc70f67899
3
+ metadata.gz: 8c607f8765a54bd7ad35fb07d98b99e9fd81c525
4
+ data.tar.gz: e942c9538d47efe080a438401c7dc71e713b184b
5
5
  SHA512:
6
- metadata.gz: 0c52c60cd473b2b7dba88f0c1039311d1a811d6f27690c6c0ceb4350c5e608c995e56d5bc69e94982a43404aaf10dad756749bcad96af2163258f08818a3a3c2
7
- data.tar.gz: f1b21e87c779f24d2b9bd5780e68330071c8f59463d5dfebd369831036f8f909532442092dc656074140540bd5b69c2c6d9e078f2b7347ba698601685b724f75
6
+ metadata.gz: 196029dde69973fd778e821b416a6f2a70de085923f91c6c0d9d951d1be0c23ceddf4780383ae84edb6c545f29c5eaff5ed9ad502ccfcfa89e7ab9b536bd844e
7
+ data.tar.gz: 041858553ce31abf38933f568b385c4ac0adf53add4dba35fd9c66301fe96d5f7aba1d2d1623f48c194b77dd0d41455179d876617bbe9301ee0c8744f340acb0
@@ -61,11 +61,10 @@ module Baiduserp
61
61
  def search(date=Date.today)
62
62
  htmls = model_htmls(date)
63
63
  serps = model_serps(date)
64
- p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count)
64
+ p = ProgressBar.create(:title => "Searching Keywords", :total => @keywords.all.count, :format => '%t (%c/%C) %a %E |%w')
65
65
  @keywords.each do |k|
66
66
  htmls.find_or_create(:keyword_id => k[:id]) {|r| r.content = Baiduserp.get_search_html(k[:term]) }
67
67
  serps.find_or_create(:keyword_id => k[:id]) {|r| r.content = YAML.dump(Baiduserp.parse(htmls.where(:keyword_id => k[:id]).first[:content])) }
68
- p.log k.to_hash
69
68
  p.increment
70
69
  end
71
70
  end
@@ -73,21 +72,20 @@ module Baiduserp
73
72
  def regenerate_serps(date=Date.today)
74
73
  htmls = model_htmls(date)
75
74
  serps = model_serps(date)
76
- p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count)
75
+ p = ProgressBar.create(:title => "ReGenerating SERPS", :total => htmls.count, :format => '%t (%c/%C) %a %E |%w')
77
76
  htmls.each do |html|
78
77
  keyword_id = html[:keyword_id]
79
78
  html = html[:content]
80
79
  r = serps.find_or_create(:keyword_id => keyword_id)
81
80
  r.update(:content => YAML.dump(Baiduserp.parse(html)))
82
81
 
83
- p.log keyword_id
84
82
  p.increment
85
83
  end
86
84
  end
87
85
 
88
86
  def generate_weights(date=Date.today)
89
87
  serps = model_serps(date)
90
- p = ProgressBar.create(:title => "Generating Weights", :total => serps.count)
88
+ p = ProgressBar.create(:title => "Generating Weights", :total => serps.count, :format => '%t (%c/%C) %a %E |%w')
91
89
  serps.each do |s|
92
90
  keyword_id = s[:keyword_id]
93
91
  serp = YAML.load(s[:content])
@@ -115,7 +113,6 @@ module Baiduserp
115
113
  r.normalized_weight = normalized_weight
116
114
  end
117
115
  end
118
- p.log keyword_id
119
116
  p.increment
120
117
  end
121
118
  end
@@ -57,24 +57,26 @@ module Baiduserp
57
57
  response = self.class.get_serp(url)
58
58
  end
59
59
 
60
- if response.headers['Content-Length'].nil?
61
- response = self.class.get_serp(url,retries)
62
- end
63
-
64
- if response.headers['Content-Length'].to_i != response.body.bytesize
65
- issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
66
- open(issue_file,'w').puts(response.body)
67
- puts "Notice:"
68
- puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
69
- puts "Please see file #{issue_file} for body content."
70
- puts "Sleep 10s and retry"
71
- sleep(10)
72
- response = self.class.get_serp(url)
73
- end
60
+ ##Baidu Stopped response Content-Length in headers...
61
+ #if response.headers['Content-Length'].nil?
62
+ # puts "Can't read Content-Length from response, retry."
63
+ # response = self.class.get_serp(url,retries-1)
64
+ #end
65
+ #
66
+ #if response.headers['Content-Length'].to_i != response.body.bytesize
67
+ # issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
68
+ # open(issue_file,'w').puts(response.body)
69
+ # puts "Notice:"
70
+ # puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
71
+ # puts "Please see file #{issue_file} for body content."
72
+ # puts "Sleep 10s and retry"
73
+ # sleep(10)
74
+ # response = self.class.get_serp(url)
75
+ #end
74
76
 
75
- return response
77
+ response
76
78
  else
77
- return nil
79
+ nil
78
80
  end
79
81
  end
80
82
 
@@ -48,7 +48,7 @@ module Baiduserp
48
48
  def get_search_html(keyword,page=1)
49
49
  keyword = keyword.gsub(" ","+")
50
50
  page = page.to_i > 1 ? "&pn=#{page.to_i-1}0" : ""
51
- serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000+rand(1000)}")
51
+ serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8&inputT=#{1000+rand(1000)}")
52
52
  # serp_url = URI.escape("http://www.baidu.com/s?wd=#{keyword}#{page}&ie=utf-8")
53
53
  Client.get_serp(serp_url).body
54
54
  end
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "2.5.2"
2
+ VERSION = "2.5.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.2
4
+ version: 2.5.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - MingQian Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-17 00:00:00.000000000 Z
11
+ date: 2014-01-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri