baiduserp 0.1.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,9 @@
1
+ require 'httparty'
2
+
3
+ module Baiduserp
4
+ class Client
5
+ include HTTParty
6
+ base_uri 'www.baidu.com'
7
+ follow_redirects false
8
+ end
9
+ end
@@ -0,0 +1,14 @@
1
+ module Baiduserp
2
+ module Helper
3
+ class << self
4
+ # safely get the content of the first node in a nokogiri search result (nil when the result is nil or empty)
5
+ def get_content_safe(noko)
6
+ return nil if noko.nil?
7
+ return nil if noko.empty?
8
+ noko.first.content
9
+ end
10
+
11
+
12
+ end
13
+ end
14
+ end
@@ -1,17 +1,26 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'nokogiri'
3
3
  require 'uri'
4
- require 'open-uri'
4
+ require 'baiduserp/client'
5
+ require 'baiduserp/helper'
5
6
 
6
7
  module Baiduserp
7
8
  class Parser
9
+ Dir[File.expand_path('../parser/*.rb', __FILE__)].each{|f| require f}
10
+
8
11
  def parse(html)
9
- @html = html
10
- @doc = Nokogiri::HTML(@html)
11
- @results = []
12
+ @file = Hash.new
12
13
  @serp = Hash.new
13
- parse_serp_results
14
- parse_serp_attrs
14
+
15
+ @file[:html] = html
16
+ @file[:doc] = Nokogiri::HTML(html)
17
+
18
+ self.class.constants.each do |m|
19
+ #puts m
20
+ eval "@serp[:#{m.downcase}] = #{m}.parse @file"
21
+ #p @serp.keys
22
+ end
23
+
15
24
  @serp
16
25
  end
17
26
 
@@ -21,144 +30,12 @@ module Baiduserp
21
30
 
22
31
  def parse_file(file_path)
23
32
  if File.exists? file_path
24
- html = open(file_path)
33
+ html = open(file_path).read
25
34
  else
26
- html = open(URI.escape(file_path))
35
+ html = Client.get(URI.escape(file_path)).body
27
36
  end
28
- html = html.read.encode!('UTF-8','UTF-8',:invalid => :replace)
37
+ html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
29
38
  parse html
30
39
  end
31
-
32
- private
33
- def get_content_safe(noko)
34
- return nil if noko.nil?
35
- return nil if noko.empty?
36
- noko.first.content
37
- end
38
-
39
- def parse_serp_results
40
- # left side results
41
- @doc.search("//table").each do |table|
42
- id = table['id'].to_i
43
- parse_serp_table(id,table) if id > 0
44
- end
45
- # right side ads
46
- parse_right_side_ads
47
- @serp[:serp_results] = @results
48
- end
49
-
50
- def parse_right_side_ads
51
- @doc.search("//div[@class='EC_fr EC_PP']").each do |table|
52
- id = table['id'].to_s.sub('bdfs','').to_i
53
- rank = id + 1
54
- url = @doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1' and @color='#008000']").first.content
55
- title = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//a"))
56
- content = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1']"))
57
- @results << {:paid => 2, :rank => rank, :url => url, :title => title, :content => content}
58
- end
59
- end
60
-
61
- def get_url_part_from_string(str)
62
- str.split(/( |\s)/).each do |s|
63
- return s if s.include? '.'
64
- end
65
- nil
66
- end
67
-
68
- def parse_serp_url(table_id)
69
- id = table_id
70
- url = nil
71
- if id > 3000
72
- link_types = ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
73
- else
74
- link_types = ["//table[@id='#{id}']//span[@class='g']",
75
- "//table[@id='#{id}']//font[@color='#008000']",
76
- "//table[@id='#{id}']//span[@style='color:#008000']",
77
- "//table[@id='#{id}']//span[@style='color:#008000;']",
78
- "//table[@id='#{id}']//span[@color='#008000']",
79
- "//table[@id='#{id}']//p[@class='g']",
80
- "//table[@id='#{id}']//cite[@color='#008000']",
81
- "//table[@id='#{id}']//cite",
82
- "//table[@id='#{id}']//span[@id='ala_img_desc']"
83
- ]
84
- end
85
- link_types.each do |link_type|
86
- link_search = @doc.search(link_type)
87
- url2 = nil
88
- url2 = get_url_part_from_string(link_search[0].content) if link_search.size > 0
89
- if url.nil? && (not url2.nil?)
90
- url = url2
91
- end
92
- end
93
- url
94
- end
95
-
96
- def parse_serp_content(id)
97
- get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
98
- end
99
-
100
- def parse_serp_table(id,table)
101
- result = Hash.new
102
- result[:rank] = id
103
- result[:url] = parse_serp_url(id)
104
- result[:title] = get_content_safe(table.css('h3'))
105
- result[:content] = parse_serp_content(id)
106
-
107
- if id >= 3000 # sem ads
108
- result[:paid] = 1
109
- else # organic results
110
- result[:paid] = 0
111
-
112
- # baidu open
113
- table.css('a').each do |link|
114
- result[:baiduopen] = 1 if link['href'].to_s.include? 'open.baidu.com'
115
- end
116
-
117
- # baidu table mu attr (for maps,baike)
118
- result[:mu] = table['mu'] unless table['mu'].nil?
119
-
120
- end
121
-
122
- @results << result
123
- end
124
-
125
- # parse baidu serp attrs : result_num, baidubrand, related_keywords
126
- def parse_serp_attrs
127
- @serp[:result_num] = parse_serp_result_num
128
- @serp[:baidubrand] = parse_serp_baidu_brand
129
- @serp[:related_keywords] = parse_serp_related_search
130
- end
131
-
132
- def parse_serp_related_search
133
- result = []
134
- @doc.search('div[@id="rs"]').each do |rs|
135
- rs.css('a').each do |link|
136
- result << link.content
137
- end
138
- end
139
- result
140
- end
141
-
142
- def parse_serp_baidu_brand
143
- if @html.include? 'bs.baidu.com/adcoup-mat'
144
- result = 1
145
- else
146
- result = 0
147
- end
148
- result
149
- end
150
-
151
- def parse_serp_result_num
152
- str = @html.scan(/找到相关结果(.*)个/).join
153
- str = str.gsub('约','')
154
- if str.include?('万')
155
- parts = str.split('万')
156
- return parts[0].to_i * 10000 + parts[1].to_i
157
- end
158
- str.gsub(',', '').to_i
159
- end
160
-
161
40
  end
162
41
  end
163
-
164
-
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::Ads_Left
2
+ def self.parse(file)
3
+
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::Ads_Right
2
+ def self.parse(file)
3
+
4
+ end
5
+ end
@@ -0,0 +1,28 @@
1
+ module Baiduserp::Parser::Organic
2
+ def self.parse(file)
3
+ result = []
4
+ file[:doc].search("//table").each do |table|
5
+ id = table['id'].to_i
6
+ next unless id > 0
7
+ r = Hash.new
8
+
9
+ url = table.search("h3/a").first['href']
10
+ url = Baiduserp::Client.get(url).headers['location'] if url.include?('http://www.baidu.com/link?')
11
+ r[:url] = url
12
+
13
+ r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
14
+
15
+ r[:content] = Baiduserp::Helper.get_content_safe(table.search("div[@class='c-abstract']"))
16
+
17
+ r[:mu] = table['mu']
18
+
19
+ table.search('a').each do |link|
20
+ r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
21
+ end
22
+ r[:baiduopen] = false if r[:baiduopen].nil?
23
+
24
+ result << r
25
+ end
26
+ result
27
+ end
28
+ end
@@ -0,0 +1,5 @@
1
+ module Baiduserp::Parser::PinPaiZhuanQu
2
+ def self.parse(file)
3
+ file[:html].include? 'bs.baidu.com/adcoup-mat'
4
+ end
5
+ end
@@ -0,0 +1,11 @@
1
+ module Baiduserp::Parser::Related_Keywords
2
+ def self.parse(file)
3
+ result = []
4
+ file[:doc].search('div[@id="rs"]').each do |rs|
5
+ rs.css('a').each do |link|
6
+ result << link.content
7
+ end
8
+ end
9
+ result
10
+ end
11
+ end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+
3
+ module Baiduserp::Parser::Result_Num
4
+ def self.parse(file)
5
+ html = file[:html]
6
+ str = html.scan(/找到相关结果(.*)个/).join
7
+ str = str.gsub('约','')
8
+ if str.include?('万')
9
+ parts = str.split('万')
10
+ result = parts[0].to_i * 10000 + parts[1].to_i
11
+ else
12
+ result = str.gsub(',', '').to_i
13
+ end
14
+
15
+ result
16
+ end
17
+ end
@@ -1,3 +1,3 @@
1
1
  module Baiduserp
2
- VERSION = "0.1.1"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baiduserp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 2.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-21 00:00:00.000000000 Z
12
+ date: 2013-06-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httparty
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
30
46
  description: Parse Baidu SERP result page.
31
47
  email:
32
48
  - zmingqian@qq.com
@@ -35,6 +51,14 @@ executables:
35
51
  extensions: []
36
52
  extra_rdoc_files: []
37
53
  files:
54
+ - lib/baiduserp/client.rb
55
+ - lib/baiduserp/helper.rb
56
+ - lib/baiduserp/parser/ads_left.rb
57
+ - lib/baiduserp/parser/ads_right.rb
58
+ - lib/baiduserp/parser/organic.rb
59
+ - lib/baiduserp/parser/pinpaizhuanqu.rb
60
+ - lib/baiduserp/parser/related_keywords.rb
61
+ - lib/baiduserp/parser/result_num.rb
38
62
  - lib/baiduserp/parser.rb
39
63
  - lib/baiduserp/version.rb
40
64
  - lib/baiduserp.rb