baiduserp 0.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ require 'httparty'
2
+
3
+ module Baiduserp
4
+ class Client
5
+ include HTTParty
6
+ base_uri 'www.baidu.com'
7
+ follow_redirects false
8
+ end
9
+ end
@@ -0,0 +1,14 @@
1
+ module Baiduserp
2
+ module Helper
3
+ class << self
4
+ # safely get the content from a nokogiri search result
5
+ def get_content_safe(noko)
6
+ return nil if noko.nil?
7
+ return nil if noko.empty?
8
+ noko.first.content
9
+ end
10
+
11
+
12
+ end
13
+ end
14
+ end
@@ -1,17 +1,26 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'nokogiri'
3
3
  require 'uri'
4
- require 'open-uri'
4
+ require 'baiduserp/client'
5
+ require 'baiduserp/helper'
5
6
 
6
7
  module Baiduserp
7
8
  class Parser
9
+ Dir[File.expand_path('../parser/*.rb', __FILE__)].each{|f| require f}
10
+
8
11
  def parse(html)
9
- @html = html
10
- @doc = Nokogiri::HTML(@html)
11
- @results = []
12
+ @file = Hash.new
12
13
  @serp = Hash.new
13
- parse_serp_results
14
- parse_serp_attrs
14
+
15
+ @file[:html] = html
16
+ @file[:doc] = Nokogiri::HTML(html)
17
+
18
+ self.class.constants.each do |m|
19
+ #puts m
20
+ eval "@serp[:#{m.downcase}] = #{m}.parse @file"
21
+ #p @serp.keys
22
+ end
23
+
15
24
  @serp
16
25
  end
17
26
 
@@ -21,144 +30,12 @@ module Baiduserp
21
30
 
22
31
  def parse_file(file_path)
23
32
  if File.exists? file_path
24
- html = open(file_path)
33
+ html = open(file_path).read
25
34
  else
26
- html = open(URI.escape(file_path))
35
+ html = Client.get(URI.escape(file_path)).body
27
36
  end
28
- html = html.read.encode!('UTF-8','UTF-8',:invalid => :replace)
37
+ html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
29
38
  parse html
30
39
  end
31
-
32
- private
33
- def get_content_safe(noko)
34
- return nil if noko.nil?
35
- return nil if noko.empty?
36
- noko.first.content
37
- end
38
-
39
- def parse_serp_results
40
- # left side results
41
- @doc.search("//table").each do |table|
42
- id = table['id'].to_i
43
- parse_serp_table(id,table) if id > 0
44
- end
45
- # right side ads
46
- parse_right_side_ads
47
- @serp[:serp_results] = @results
48
- end
49
-
50
- def parse_right_side_ads
51
- @doc.search("//div[@class='EC_fr EC_PP']").each do |table|
52
- id = table['id'].to_s.sub('bdfs','').to_i
53
- rank = id + 1
54
- url = @doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1' and @color='#008000']").first.content
55
- title = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//a"))
56
- content = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1']"))
57
- @results << {:paid => 2, :rank => rank, :url => url, :title => title, :content => content}
58
- end
59
- end
60
-
61
- def get_url_part_from_string(str)
62
- str.split(/( |\s)/).each do |s|
63
- return s if s.include? '.'
64
- end
65
- nil
66
- end
67
-
68
- def parse_serp_url(table_id)
69
- id = table_id
70
- url = nil
71
- if id > 3000
72
- link_types = ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
73
- else
74
- link_types = ["//table[@id='#{id}']//span[@class='g']",
75
- "//table[@id='#{id}']//font[@color='#008000']",
76
- "//table[@id='#{id}']//span[@style='color:#008000']",
77
- "//table[@id='#{id}']//span[@style='color:#008000;']",
78
- "//table[@id='#{id}']//span[@color='#008000']",
79
- "//table[@id='#{id}']//p[@class='g']",
80
- "//table[@id='#{id}']//cite[@color='#008000']",
81
- "//table[@id='#{id}']//cite",
82
- "//table[@id='#{id}']//span[@id='ala_img_desc']"
83
- ]
84
- end
85
- link_types.each do |link_type|
86
- link_search = @doc.search(link_type)
87
- url2 = nil
88
- url2 = get_url_part_from_string(link_search[0].content) if link_search.size > 0
89
- if url.nil? && (not url2.nil?)
90
- url = url2
91
- end
92
- end
93
- url
94
- end
95
-
96
- def parse_serp_content(id)
97
- get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
98
- end
99
-
100
- def parse_serp_table(id,table)
101
- result = Hash.new
102
- result[:rank] = id
103
- result[:url] = parse_serp_url(id)
104
- result[:title] = get_content_safe(table.css('h3'))
105
- result[:content] = parse_serp_content(id)
106
-
107
- if id >= 3000 # sem ads
108
- result[:paid] = 1
109
- else # organic results
110
- result[:paid] = 0
111
-
112
- # baidu open
113
- table.css('a').each do |link|
114
- result[:baiduopen] = 1 if link['href'].to_s.include? 'open.baidu.com'
115
- end
116
-
117
- # baidu table mu attr (for maps,baike)
118
- result[:mu] = table['mu'] unless table['mu'].nil?
119
-
120
- end
121
-
122
- @results << result
123
- end
124
-
125
- # parse baidu serp attrs : result_num, baidubrand, related_keywords
126
- def parse_serp_attrs
127
- @serp[:result_num] = parse_serp_result_num
128
- @serp[:baidubrand] = parse_serp_baidu_brand
129
- @serp[:related_keywords] = parse_serp_related_search
130
- end
131
-
132
- def parse_serp_related_search
133
- result = []
134
- @doc.search('div[@id="rs"]').each do |rs|
135
- rs.css('a').each do |link|
136
- result << link.content
137
- end
138
- end
139
- result
140
- end
141
-
142
- def parse_serp_baidu_brand
143
- if @html.include? 'bs.baidu.com/adcoup-mat'
144
- result = 1
145
- else
146
- result = 0
147
- end
148
- result
149
- end
150
-
151
- def parse_serp_result_num
152
- str = @html.scan(/找到相关结果(.*)个/).join
153
- str = str.gsub('约','')
154
- if str.include?('万')
155
- parts = str.split('万')
156
- return parts[0].to_i * 10000 + parts[1].to_i
157
- end
158
- str.gsub(',', '').to_i
159
- end
160
-
161
40
  end
162
41
  end
163
-
164
-
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::Ads_Left
2
+ def self.parse(file)
3
+
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::Ads_Right
2
+ def self.parse(file)
3
+
4
+ end
5
+ end
@@ -0,0 +1,28 @@
1
+ module Baiduserp::Parser::Organic
2
+ def self.parse(file)
3
+ result = []
4
+ file[:doc].search("//table").each do |table|
5
+ id = table['id'].to_i
6
+ next unless id > 0
7
+ r = Hash.new
8
+
9
+ url = table.search("h3/a").first['href']
10
+ url = Baiduserp::Client.get(url).headers['location'] if url.include?('http://www.baidu.com/link?')
11
+ r[:url] = url
12
+
13
+ r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
14
+
15
+ r[:content] = Baiduserp::Helper.get_content_safe(table.search("div[@class='c-abstract']"))
16
+
17
+ r[:mu] = table['mu']
18
+
19
+ table.search('a').each do |link|
20
+ r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
21
+ end
22
+ r[:baiduopen] = false if r[:baiduopen].nil?
23
+
24
+ result << r
25
+ end
26
+ result
27
+ end
28
+ end
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::PinPaiZhuanQu
2
+ def self.parse(file)
3
+ file[:html].include? 'bs.baidu.com/adcoup-mat'
4
+ end
5
+ end
@@ -0,0 +1,11 @@
1
+ module Baiduserp::Parser::Related_Keywords
2
+ def self.parse(file)
3
+ result = []
4
+ file[:doc].search('div[@id="rs"]').each do |rs|
5
+ rs.css('a').each do |link|
6
+ result << link.content
7
+ end
8
+ end
9
+ result
10
+ end
11
+ end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+
3
+ module Baiduserp::Parser::Result_Num
4
+ def self.parse(file)
5
+ html = file[:html]
6
+ str = html.scan(/找到相关结果(.*)个/).join
7
+ str = str.gsub('约','')
8
+ if str.include?('万')
9
+ parts = str.split('万')
10
+ result = parts[0].to_i * 10000 + parts[1].to_i
11
+ else
12
+ result = str.gsub(',', '').to_i
13
+ end
14
+
15
+ result
16
+ end
17
+ end
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "0.1.1"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 2.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-21 00:00:00.000000000 Z
12
+ date: 2013-06-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httparty
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
30
46
  description: Parse Baidu SERP result page.
31
47
  email:
32
48
  - zmingqian@qq.com
@@ -35,6 +51,14 @@ executables:
35
51
  extensions: []
36
52
  extra_rdoc_files: []
37
53
  files:
54
+ - lib/baiduserp/client.rb
55
+ - lib/baiduserp/helper.rb
56
+ - lib/baiduserp/parser/ads_left.rb
57
+ - lib/baiduserp/parser/ads_right.rb
58
+ - lib/baiduserp/parser/organic.rb
59
+ - lib/baiduserp/parser/pinpaizhuanqu.rb
60
+ - lib/baiduserp/parser/related_keywords.rb
61
+ - lib/baiduserp/parser/result_num.rb
38
62
  - lib/baiduserp/parser.rb
39
63
  - lib/baiduserp/version.rb
40
64
  - lib/baiduserp.rb