baiduserp 0.1.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baiduserp/client.rb +9 -0
- data/lib/baiduserp/helper.rb +14 -0
- data/lib/baiduserp/parser.rb +18 -141
- data/lib/baiduserp/parser/ads_left.rb +5 -0
- data/lib/baiduserp/parser/ads_right.rb +5 -0
- data/lib/baiduserp/parser/organic.rb +28 -0
- data/lib/baiduserp/parser/pinpaizhuanqu.rb +5 -0
- data/lib/baiduserp/parser/related_keywords.rb +11 -0
- data/lib/baiduserp/parser/result_num.rb +17 -0
- data/lib/baiduserp/version.rb +1 -1
- metadata +26 -2
data/lib/baiduserp/parser.rb
CHANGED
@@ -1,17 +1,26 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
-
require '
|
4
|
+
require 'baiduserp/client'
|
5
|
+
require 'baiduserp/helper'
|
5
6
|
|
6
7
|
module Baiduserp
|
7
8
|
class Parser
|
9
|
+
Dir[File.expand_path('../parser/*.rb', __FILE__)].each{|f| require f}
|
10
|
+
|
8
11
|
def parse(html)
|
9
|
-
@
|
10
|
-
@doc = Nokogiri::HTML(@html)
|
11
|
-
@results = []
|
12
|
+
@file = Hash.new
|
12
13
|
@serp = Hash.new
|
13
|
-
|
14
|
-
|
14
|
+
|
15
|
+
@file[:html] = html
|
16
|
+
@file[:doc] = Nokogiri::HTML(html)
|
17
|
+
|
18
|
+
self.class.constants.each do |m|
|
19
|
+
#puts m
|
20
|
+
eval "@serp[:#{m.downcase}] = #{m}.parse @file"
|
21
|
+
#p @serp.keys
|
22
|
+
end
|
23
|
+
|
15
24
|
@serp
|
16
25
|
end
|
17
26
|
|
@@ -21,144 +30,12 @@ module Baiduserp
|
|
21
30
|
|
22
31
|
def parse_file(file_path)
|
23
32
|
if File.exists? file_path
|
24
|
-
html = open(file_path)
|
33
|
+
html = open(file_path).read
|
25
34
|
else
|
26
|
-
html =
|
35
|
+
html = Client.get(URI.escape(file_path)).body
|
27
36
|
end
|
28
|
-
html = html.
|
37
|
+
html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
|
29
38
|
parse html
|
30
39
|
end
|
31
|
-
|
32
|
-
private
|
33
|
-
def get_content_safe(noko)
|
34
|
-
return nil if noko.nil?
|
35
|
-
return nil if noko.empty?
|
36
|
-
noko.first.content
|
37
|
-
end
|
38
|
-
|
39
|
-
def parse_serp_results
|
40
|
-
# left side results
|
41
|
-
@doc.search("//table").each do |table|
|
42
|
-
id = table['id'].to_i
|
43
|
-
parse_serp_table(id,table) if id > 0
|
44
|
-
end
|
45
|
-
# right side ads
|
46
|
-
parse_right_side_ads
|
47
|
-
@serp[:serp_results] = @results
|
48
|
-
end
|
49
|
-
|
50
|
-
def parse_right_side_ads
|
51
|
-
@doc.search("//div[@class='EC_fr EC_PP']").each do |table|
|
52
|
-
id = table['id'].to_s.sub('bdfs','').to_i
|
53
|
-
rank = id + 1
|
54
|
-
url = @doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1' and @color='#008000']").first.content
|
55
|
-
title = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//a"))
|
56
|
-
content = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1']"))
|
57
|
-
@results << {:paid => 2, :rank => rank, :url => url, :title => title, :content => content}
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def get_url_part_from_string(str)
|
62
|
-
str.split(/( |\s)/).each do |s|
|
63
|
-
return s if s.include? '.'
|
64
|
-
end
|
65
|
-
nil
|
66
|
-
end
|
67
|
-
|
68
|
-
def parse_serp_url(table_id)
|
69
|
-
id = table_id
|
70
|
-
url = nil
|
71
|
-
if id > 3000
|
72
|
-
link_types = ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
|
73
|
-
else
|
74
|
-
link_types = ["//table[@id='#{id}']//span[@class='g']",
|
75
|
-
"//table[@id='#{id}']//font[@color='#008000']",
|
76
|
-
"//table[@id='#{id}']//span[@style='color:#008000']",
|
77
|
-
"//table[@id='#{id}']//span[@style='color:#008000;']",
|
78
|
-
"//table[@id='#{id}']//span[@color='#008000']",
|
79
|
-
"//table[@id='#{id}']//p[@class='g']",
|
80
|
-
"//table[@id='#{id}']//cite[@color='#008000']",
|
81
|
-
"//table[@id='#{id}']//cite",
|
82
|
-
"//table[@id='#{id}']//span[@id='ala_img_desc']"
|
83
|
-
]
|
84
|
-
end
|
85
|
-
link_types.each do |link_type|
|
86
|
-
link_search = @doc.search(link_type)
|
87
|
-
url2 = nil
|
88
|
-
url2 = get_url_part_from_string(link_search[0].content) if link_search.size > 0
|
89
|
-
if url.nil? && (not url2.nil?)
|
90
|
-
url = url2
|
91
|
-
end
|
92
|
-
end
|
93
|
-
url
|
94
|
-
end
|
95
|
-
|
96
|
-
def parse_serp_content(id)
|
97
|
-
get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
|
98
|
-
end
|
99
|
-
|
100
|
-
def parse_serp_table(id,table)
|
101
|
-
result = Hash.new
|
102
|
-
result[:rank] = id
|
103
|
-
result[:url] = parse_serp_url(id)
|
104
|
-
result[:title] = get_content_safe(table.css('h3'))
|
105
|
-
result[:content] = parse_serp_content(id)
|
106
|
-
|
107
|
-
if id >= 3000 # sem ads
|
108
|
-
result[:paid] = 1
|
109
|
-
else # organic results
|
110
|
-
result[:paid] = 0
|
111
|
-
|
112
|
-
# baidu open
|
113
|
-
table.css('a').each do |link|
|
114
|
-
result[:baiduopen] = 1 if link['href'].to_s.include? 'open.baidu.com'
|
115
|
-
end
|
116
|
-
|
117
|
-
# baidu table mu attr (for maps,baike)
|
118
|
-
result[:mu] = table['mu'] unless table['mu'].nil?
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
@results << result
|
123
|
-
end
|
124
|
-
|
125
|
-
# parse baidu serp attrs : result_num, baidubrand, related_keywords
|
126
|
-
def parse_serp_attrs
|
127
|
-
@serp[:result_num] = parse_serp_result_num
|
128
|
-
@serp[:baidubrand] = parse_serp_baidu_brand
|
129
|
-
@serp[:related_keywords] = parse_serp_related_search
|
130
|
-
end
|
131
|
-
|
132
|
-
def parse_serp_related_search
|
133
|
-
result = []
|
134
|
-
@doc.search('div[@id="rs"]').each do |rs|
|
135
|
-
rs.css('a').each do |link|
|
136
|
-
result << link.content
|
137
|
-
end
|
138
|
-
end
|
139
|
-
result
|
140
|
-
end
|
141
|
-
|
142
|
-
def parse_serp_baidu_brand
|
143
|
-
if @html.include? 'bs.baidu.com/adcoup-mat'
|
144
|
-
result = 1
|
145
|
-
else
|
146
|
-
result = 0
|
147
|
-
end
|
148
|
-
result
|
149
|
-
end
|
150
|
-
|
151
|
-
def parse_serp_result_num
|
152
|
-
str = @html.scan(/找到相关结果(.*)个/).join
|
153
|
-
str = str.gsub('约','')
|
154
|
-
if str.include?('万')
|
155
|
-
parts = str.split('万')
|
156
|
-
return parts[0].to_i * 10000 + parts[1].to_i
|
157
|
-
end
|
158
|
-
str.gsub(',', '').to_i
|
159
|
-
end
|
160
|
-
|
161
40
|
end
|
162
41
|
end
|
163
|
-
|
164
|
-
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Baiduserp
  class Parser
    # Collects the organic (non-ad) result entries of a Baidu result page.
    #
    # Every <table> whose "id" attribute converts to a positive integer is
    # treated as one result entry.
    module Organic
      # Links starting with this prefix are Baidu redirect links; the real
      # target is read from the "location" header of the redirect response.
      REDIRECT_PREFIX = 'http://www.baidu.com/link?'.freeze

      # @param file [Hash] parser context; only file[:doc] (the parsed
      #   document, anything responding to #search) is read here.
      # @return [Array<Hash>] one hash per result table, in document order,
      #   with the keys :url, :title, :content, :mu and :baiduopen.
      def self.parse(file)
        file[:doc].search("//table").each_with_object([]) do |table, results|
          # Tables without a numeric id are page layout, not results.
          next unless table['id'].to_i > 0

          results << build_result(table)
        end
      end

      # Builds the result hash for a single result table.
      def self.build_result(table)
        {
          :url       => resolve_url(table),
          :title     => Baiduserp::Helper.get_content_safe(table.search('h3')),
          :content   => Baiduserp::Helper.get_content_safe(table.search("div[@class='c-abstract']")),
          :mu        => table['mu'],
          # true when any link inside the table points at open.baidu.com
          :baiduopen => table.search('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
        }
      end

      # Returns the target URL of the table's headline link, or nil when the
      # table has no "h3/a" link or the link carries no href. Previously a
      # single such table raised NoMethodError and aborted the whole parse.
      #
      # NOTE: a Baidu redirect link costs one request through
      # Baiduserp::Client per result.
      def self.resolve_url(table)
        link = table.search("h3/a").first
        url = link && link['href']
        return url unless url.to_s.include?(REDIRECT_PREFIX)

        Baiduserp::Client.get(url).headers['location']
      end

      private_class_method :build_result, :resolve_url
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module Baiduserp
  class Parser
    # Extracts the total result count from the "找到相关结果…个" counter
    # text of a Baidu result page.
    module Result_Num
      # @param file [Hash] parser context; only file[:html] (the raw page
      #   source) is read here.
      # @return [Integer] the result count, or 0 when the counter text is
      #   not present in the page.
      def self.parse(file)
        html = file[:html].to_s

        # Take the first counter only, and match non-greedily so that a
        # later "个" on the same line is not swallowed into the capture.
        # Then drop the "约" (about) marker and thousands separators.
        count = html[/找到相关结果(.*?)个/, 1].to_s.delete('约,')
        return count.to_i unless count.include?('万')

        # "万" is the ten-thousand unit: "1230万" => 12_300_000 and
        # "3万5000" => 35_000. Separators are already stripped above, so
        # "1,230万" no longer truncates to 1 * 10_000.
        wan, remainder = count.split('万', 2)
        wan.to_i * 10_000 + remainder.to_i
      end
    end
  end
end
|
data/lib/baiduserp/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: httparty
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
description: Parse Baidu SERP result page.
|
31
47
|
email:
|
32
48
|
- zmingqian@qq.com
|
@@ -35,6 +51,14 @@ executables:
|
|
35
51
|
extensions: []
|
36
52
|
extra_rdoc_files: []
|
37
53
|
files:
|
54
|
+
- lib/baiduserp/client.rb
|
55
|
+
- lib/baiduserp/helper.rb
|
56
|
+
- lib/baiduserp/parser/ads_left.rb
|
57
|
+
- lib/baiduserp/parser/ads_right.rb
|
58
|
+
- lib/baiduserp/parser/organic.rb
|
59
|
+
- lib/baiduserp/parser/pinpaizhuanqu.rb
|
60
|
+
- lib/baiduserp/parser/related_keywords.rb
|
61
|
+
- lib/baiduserp/parser/result_num.rb
|
38
62
|
- lib/baiduserp/parser.rb
|
39
63
|
- lib/baiduserp/version.rb
|
40
64
|
- lib/baiduserp.rb
|