baiduserp 0.1.1 → 2.0.0
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- data/lib/baiduserp/client.rb +9 -0
- data/lib/baiduserp/helper.rb +14 -0
- data/lib/baiduserp/parser.rb +18 -141
- data/lib/baiduserp/parser/ads_left.rb +5 -0
- data/lib/baiduserp/parser/ads_right.rb +5 -0
- data/lib/baiduserp/parser/organic.rb +28 -0
- data/lib/baiduserp/parser/pinpaizhuanqu.rb +5 -0
- data/lib/baiduserp/parser/related_keywords.rb +11 -0
- data/lib/baiduserp/parser/result_num.rb +17 -0
- data/lib/baiduserp/version.rb +1 -1
- metadata +26 -2
data/lib/baiduserp/parser.rb
CHANGED
@@ -1,17 +1,26 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
-
require '
|
4
|
+
require 'baiduserp/client'
|
5
|
+
require 'baiduserp/helper'
|
5
6
|
|
6
7
|
module Baiduserp
|
7
8
|
class Parser
|
9
|
+
Dir[File.expand_path('../parser/*.rb', __FILE__)].each{|f| require f}
|
10
|
+
|
8
11
|
def parse(html)
|
9
|
-
@html = html
|
10
|
-
@doc = Nokogiri::HTML(@html)
|
11
|
-
@results = []
|
12
|
+
@file = Hash.new
|
12
13
|
@serp = Hash.new
|
13
|
-
|
14
|
-
|
14
|
+
|
15
|
+
@file[:html] = html
|
16
|
+
@file[:doc] = Nokogiri::HTML(html)
|
17
|
+
|
18
|
+
self.class.constants.each do |m|
|
19
|
+
#puts m
|
20
|
+
eval "@serp[:#{m.downcase}] = #{m}.parse @file"
|
21
|
+
#p @serp.keys
|
22
|
+
end
|
23
|
+
|
15
24
|
@serp
|
16
25
|
end
|
17
26
|
|
@@ -21,144 +30,12 @@ module Baiduserp
|
|
21
30
|
|
22
31
|
def parse_file(file_path)
|
23
32
|
if File.exists? file_path
|
24
|
-
html = open(file_path)
|
33
|
+
html = open(file_path).read
|
25
34
|
else
|
26
|
-
html =
|
35
|
+
html = Client.get(URI.escape(file_path)).body
|
27
36
|
end
|
28
|
-
html = html.
|
37
|
+
html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
|
29
38
|
parse html
|
30
39
|
end
|
31
|
-
|
32
|
-
private
|
33
|
-
def get_content_safe(noko)
|
34
|
-
return nil if noko.nil?
|
35
|
-
return nil if noko.empty?
|
36
|
-
noko.first.content
|
37
|
-
end
|
38
|
-
|
39
|
-
def parse_serp_results
|
40
|
-
# left side results
|
41
|
-
@doc.search("//table").each do |table|
|
42
|
-
id = table['id'].to_i
|
43
|
-
parse_serp_table(id,table) if id > 0
|
44
|
-
end
|
45
|
-
# right side ads
|
46
|
-
parse_right_side_ads
|
47
|
-
@serp[:serp_results] = @results
|
48
|
-
end
|
49
|
-
|
50
|
-
def parse_right_side_ads
|
51
|
-
@doc.search("//div[@class='EC_fr EC_PP']").each do |table|
|
52
|
-
id = table['id'].to_s.sub('bdfs','').to_i
|
53
|
-
rank = id + 1
|
54
|
-
url = @doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1' and @color='#008000']").first.content
|
55
|
-
title = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//a"))
|
56
|
-
content = get_content_safe(@doc.search("//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']//font[@size='-1']"))
|
57
|
-
@results << {:paid => 2, :rank => rank, :url => url, :title => title, :content => content}
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def get_url_part_from_string(str)
|
62
|
-
str.split(/( |\s)/).each do |s|
|
63
|
-
return s if s.include? '.'
|
64
|
-
end
|
65
|
-
nil
|
66
|
-
end
|
67
|
-
|
68
|
-
def parse_serp_url(table_id)
|
69
|
-
id = table_id
|
70
|
-
url = nil
|
71
|
-
if id > 3000
|
72
|
-
link_types = ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
|
73
|
-
else
|
74
|
-
link_types = ["//table[@id='#{id}']//span[@class='g']",
|
75
|
-
"//table[@id='#{id}']//font[@color='#008000']",
|
76
|
-
"//table[@id='#{id}']//span[@style='color:#008000']",
|
77
|
-
"//table[@id='#{id}']//span[@style='color:#008000;']",
|
78
|
-
"//table[@id='#{id}']//span[@color='#008000']",
|
79
|
-
"//table[@id='#{id}']//p[@class='g']",
|
80
|
-
"//table[@id='#{id}']//cite[@color='#008000']",
|
81
|
-
"//table[@id='#{id}']//cite",
|
82
|
-
"//table[@id='#{id}']//span[@id='ala_img_desc']"
|
83
|
-
]
|
84
|
-
end
|
85
|
-
link_types.each do |link_type|
|
86
|
-
link_search = @doc.search(link_type)
|
87
|
-
url2 = nil
|
88
|
-
url2 = get_url_part_from_string(link_search[0].content) if link_search.size > 0
|
89
|
-
if url.nil? && (not url2.nil?)
|
90
|
-
url = url2
|
91
|
-
end
|
92
|
-
end
|
93
|
-
url
|
94
|
-
end
|
95
|
-
|
96
|
-
def parse_serp_content(id)
|
97
|
-
get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
|
98
|
-
end
|
99
|
-
|
100
|
-
def parse_serp_table(id,table)
|
101
|
-
result = Hash.new
|
102
|
-
result[:rank] = id
|
103
|
-
result[:url] = parse_serp_url(id)
|
104
|
-
result[:title] = get_content_safe(table.css('h3'))
|
105
|
-
result[:content] = parse_serp_content(id)
|
106
|
-
|
107
|
-
if id >= 3000 # sem ads
|
108
|
-
result[:paid] = 1
|
109
|
-
else # organic results
|
110
|
-
result[:paid] = 0
|
111
|
-
|
112
|
-
# baidu open
|
113
|
-
table.css('a').each do |link|
|
114
|
-
result[:baiduopen] = 1 if link['href'].to_s.include? 'open.baidu.com'
|
115
|
-
end
|
116
|
-
|
117
|
-
# baidu table mu attr (for maps,baike)
|
118
|
-
result[:mu] = table['mu'] unless table['mu'].nil?
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
@results << result
|
123
|
-
end
|
124
|
-
|
125
|
-
# parse baidu serp attrs : result_num, baidubrand, related_keywords
|
126
|
-
def parse_serp_attrs
|
127
|
-
@serp[:result_num] = parse_serp_result_num
|
128
|
-
@serp[:baidubrand] = parse_serp_baidu_brand
|
129
|
-
@serp[:related_keywords] = parse_serp_related_search
|
130
|
-
end
|
131
|
-
|
132
|
-
def parse_serp_related_search
|
133
|
-
result = []
|
134
|
-
@doc.search('div[@id="rs"]').each do |rs|
|
135
|
-
rs.css('a').each do |link|
|
136
|
-
result << link.content
|
137
|
-
end
|
138
|
-
end
|
139
|
-
result
|
140
|
-
end
|
141
|
-
|
142
|
-
def parse_serp_baidu_brand
|
143
|
-
if @html.include? 'bs.baidu.com/adcoup-mat'
|
144
|
-
result = 1
|
145
|
-
else
|
146
|
-
result = 0
|
147
|
-
end
|
148
|
-
result
|
149
|
-
end
|
150
|
-
|
151
|
-
def parse_serp_result_num
|
152
|
-
str = @html.scan(/找到相关结果(.*)个/).join
|
153
|
-
str = str.gsub('约','')
|
154
|
-
if str.include?('万')
|
155
|
-
parts = str.split('万')
|
156
|
-
return parts[0].to_i * 10000 + parts[1].to_i
|
157
|
-
end
|
158
|
-
str.gsub(',', '').to_i
|
159
|
-
end
|
160
|
-
|
161
40
|
end
|
162
41
|
end
|
163
|
-
|
164
|
-
|
data/lib/baiduserp/parser/organic.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
module Baiduserp::Parser::Organic
|
2
|
+
def self.parse(file)
|
3
|
+
result = []
|
4
|
+
file[:doc].search("//table").each do |table|
|
5
|
+
id = table['id'].to_i
|
6
|
+
next unless id > 0
|
7
|
+
r = Hash.new
|
8
|
+
|
9
|
+
url = table.search("h3/a").first['href']
|
10
|
+
url = Baiduserp::Client.get(url).headers['location'] if url.include?('http://www.baidu.com/link?')
|
11
|
+
r[:url] = url
|
12
|
+
|
13
|
+
r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
|
14
|
+
|
15
|
+
r[:content] = Baiduserp::Helper.get_content_safe(table.search("div[@class='c-abstract']"))
|
16
|
+
|
17
|
+
r[:mu] = table['mu']
|
18
|
+
|
19
|
+
table.search('a').each do |link|
|
20
|
+
r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
|
21
|
+
end
|
22
|
+
r[:baiduopen] = false if r[:baiduopen].nil?
|
23
|
+
|
24
|
+
result << r
|
25
|
+
end
|
26
|
+
result
|
27
|
+
end
|
28
|
+
end
|
data/lib/baiduserp/parser/result_num.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module Baiduserp::Parser::Result_Num
|
4
|
+
def self.parse(file)
|
5
|
+
html = file[:html]
|
6
|
+
str = html.scan(/找到相关结果(.*)个/).join
|
7
|
+
str = str.gsub('约','')
|
8
|
+
if str.include?('万')
|
9
|
+
parts = str.split('万')
|
10
|
+
result = parts[0].to_i * 10000 + parts[1].to_i
|
11
|
+
else
|
12
|
+
result = str.gsub(',', '').to_i
|
13
|
+
end
|
14
|
+
|
15
|
+
result
|
16
|
+
end
|
17
|
+
end
|
data/lib/baiduserp/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: baiduserp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: httparty
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
description: Parse Baidu SERP result page.
|
31
47
|
email:
|
32
48
|
- zmingqian@qq.com
|
@@ -35,6 +51,14 @@ executables:
|
|
35
51
|
extensions: []
|
36
52
|
extra_rdoc_files: []
|
37
53
|
files:
|
54
|
+
- lib/baiduserp/client.rb
|
55
|
+
- lib/baiduserp/helper.rb
|
56
|
+
- lib/baiduserp/parser/ads_left.rb
|
57
|
+
- lib/baiduserp/parser/ads_right.rb
|
58
|
+
- lib/baiduserp/parser/organic.rb
|
59
|
+
- lib/baiduserp/parser/pinpaizhuanqu.rb
|
60
|
+
- lib/baiduserp/parser/related_keywords.rb
|
61
|
+
- lib/baiduserp/parser/result_num.rb
|
38
62
|
- lib/baiduserp/parser.rb
|
39
63
|
- lib/baiduserp/version.rb
|
40
64
|
- lib/baiduserp.rb
|