baiduserp 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +29 -0
- data/bin/baiduserp +53 -0
- data/lib/baiduserp.rb +16 -0
- data/lib/baiduserp/parser.rb +170 -0
- data/lib/baiduserp/version.rb +3 -0
- metadata +68 -0
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Baiduserp
|
2
|
+
|
3
|
+
Parse Baidu search engine result pages (SERPs) into structured Ruby data.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'baiduserp'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install baiduserp
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
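From Ruby, `Baiduserp.search('keyword')` fetches and parses a live result page, `Baiduserp.parse_file('path/to/page.html')` parses a saved page, and `Baiduserp.parse(html)` parses an HTML string. From the command line, run `baiduserp -s 'keyword'` to search (add `-j` to print JSON or `-o output.json` to save the result to a file), or `baiduserp -f 'file path'` to parse a local file.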
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/bin/baiduserp
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'baiduserp'
|
4
|
+
require 'optparse'
|
5
|
+
require 'json'
|
6
|
+
require 'pp'
|
7
|
+
|
8
|
+
# Command line front end for the Baiduserp library.
#
# Either searches Baidu for a keyword (-s) or parses a saved result page (-f),
# then pretty-prints the parsed result, prints it as JSON (-j) or writes it as
# JSON to a file (-o).
usage = "Usage:
1. baiduserp -s 'keyword' # search 'keyword' and print parse result
2. baiduserp -s 'keyword' -o output.json # -o means save result to a file
3. baiduserp -f 'file path' # parse html source code from file
4. baiduserp -s 'keyword' -j # search 'keyword' and print parse result in JSON format
"

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-s Keyword", "--search Keyword", "Search Keyword & Parse SERP") do |v|
    options[:keyword] = v
  end

  opts.on("-j","--jsonprint","Print result in JSON format") do |v|
    options[:jsonprint] = v
  end

  opts.on("-o Output", "--output Output", "Save Result to File in JSON format") do |v|
    options[:output] = v
  end

  opts.on("-f File", "--file File", "Parse Local File") do |v|
    options[:file] = v
  end
end
option_parser.parse!

# Without a keyword or a file there is nothing to parse; show the help text
# instead of silently searching for an empty keyword.
if options[:file].nil? && options[:keyword].nil?
  warn option_parser.help
  exit 1
end

# A local file (-f) takes precedence over a keyword search (-s).
result = if options[:file].nil?
           Baiduserp.search(options[:keyword])
         else
           Baiduserp.parse_file(options[:file])
         end

if options[:output]
  # Block form closes the output file even if serialization raises.
  File.open(options[:output], 'w') { |file| file.puts result.to_json }
elsif options[:jsonprint]
  puts result.to_json
else
  pp result
end
|
53
|
+
|
data/lib/baiduserp.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "baiduserp/version"
|
2
|
+
require 'baiduserp/parser'
|
3
|
+
|
4
|
+
# Convenience entry points: each call builds a fresh Parser and delegates to
# the parser method of the same name.
module Baiduserp
  class << self
    # Search for +keyword+ and return the parsed result page.
    def search(keyword)
      Parser.new.search(keyword)
    end

    # Parse an HTML string of a result page.
    def parse(html)
      Parser.new.parse(html)
    end

    # Parse the result page found at +file_path+.
    def parse_file(file_path)
      Parser.new.parse_file(file_path)
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'uri'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
module Baiduserp
  # Parses a Baidu search engine result page (SERP) into a plain Hash.
  #
  # The Hash returned by #parse, #search and #parse_file has these keys:
  #   :serp_results     - Array of result Hashes with :rank, :url, :title,
  #                       :content and :paid (0 organic, 1 ad table, 2 right
  #                       side ad); organic results may also carry :baiduopen
  #                       and :mu
  #   :result_num       - Integer result count read from the page text
  #   :baidubrand       - 1 when the page references 'bs.baidu.com/adcoup-mat',
  #                       otherwise 0
  #   :related_keywords - Array of link texts found inside div#rs
  class Parser
    # Captures the text between "找到相关结果" and the next "个". Non-greedy so
    # a later "个" on the same line is not swallowed into the capture.
    BAIDU_RESULT = /找到相关结果(.*?)个/

    # Parse an HTML string.
    #
    # html - String containing the page source.
    #
    # Returns the result Hash described on the class.
    def parse(html)
      @html = html
      @doc = Nokogiri::HTML(@html)
      @results = []
      @serp = {}
      parse_serp_results
      parse_serp_attrs
      @serp
    end

    # Fetch the Baidu result page for +keyword+ and parse it.
    #
    # The keyword is form-encoded so characters such as '&', '+' or '=' stay
    # part of the keyword instead of altering the query string.
    def search(keyword)
      query = URI.encode_www_form_component(keyword.to_s)
      parse fetch_url("http://www.baidu.com/s?wd=#{query}").encode('UTF-8')
    end

    # Parse a page read from a local file or, when no such file exists, from
    # +file_path+ treated as a URL.
    def parse_file(file_path)
      html = if File.exist?(file_path)
               File.read(file_path)
             else
               # URI.escape was removed in Ruby 3.0; the default parser still
               # provides the same escaping.
               fetch_url(URI::DEFAULT_PARSER.escape(file_path))
             end
      parse html.encode('UTF-8')
    end

    private

    # Download +url+ through open-uri and return the response body.
    # Kernel#open stopped accepting URLs in Ruby 3.0, so the URI object is
    # opened directly; the block form closes the underlying IO.
    def fetch_url(url)
      URI.parse(url).open { |io| io.read }
    end

    # Text content of the first node in +noko+, or nil when +noko+ is nil or
    # empty.
    def get_content_safe(noko)
      return nil if noko.nil? || noko.empty?
      noko.first.content
    end

    # Collect every result on the page into @results and store the list under
    # @serp[:serp_results].
    def parse_serp_results
      # left side results: tables whose id is a positive integer
      @doc.search("//table").each do |table|
        id = table['id'].to_i
        parse_serp_table(id, table) if id > 0
      end
      # right side ads
      parse_right_side_ads
      @serp[:serp_results] = @results
    end

    # Append one result (:paid => 2) per div with class 'EC_fr EC_PP'.
    # The rank is the numeric part of the div id ('bdfsN') plus one.
    def parse_right_side_ads
      @doc.search("//div[@class='EC_fr EC_PP']").each do |ad|
        id = ad['id'].to_s.sub('bdfs','').to_i
        scope = "//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']"
        @results << {
          :paid => 2,
          :rank => id + 1,
          # nil-safe: an ad without the green URL line must not abort parsing
          :url => get_content_safe(@doc.search("#{scope}//font[@size='-1' and @color='#008000']")),
          :title => get_content_safe(@doc.search("#{scope}//a")),
          :content => get_content_safe(@doc.search("#{scope}//font[@size='-1']"))
        }
      end
    end

    # First whitespace-separated token of +str+ that contains a '.', or nil.
    # Splits on Unicode whitespace so a non-breaking space also ends the token.
    def get_url_part_from_string(str)
      str.split(/[[:space:]]+/).find { |token| token.include?('.') }
    end

    # Displayed URL for the result table with id +table_id+.
    #
    # Candidate selectors are tried in order; the first one whose first match
    # contains a URL-looking token wins. Returns nil when none does.
    def parse_serp_url(table_id)
      scope = "//table[@id='#{table_id}']"
      selectors = if table_id > 3000
                    ["#{scope}//font[@size='-1' and @color='#008000']"]
                  else
                    ["#{scope}//span[@class='g']",
                     "#{scope}//font[@color='#008000']",
                     "#{scope}//span[@style='color:#008000']",
                     "#{scope}//span[@style='color:#008000;']",
                     "#{scope}//span[@color='#008000']",
                     "#{scope}//p[@class='g']",
                     "#{scope}//cite[@color='#008000']",
                     "#{scope}//cite",
                     "#{scope}//span[@id='ala_img_desc']"]
                  end

      selectors.each do |selector|
        node = @doc.search(selector).first
        next if node.nil?
        url = get_url_part_from_string(node.content)
        return url unless url.nil?
      end
      nil
    end

    # Unimplemented placeholder that always returns nil; titles are read in
    # parse_serp_table.
    def parse_serp_title(id)
    end

    # Text of the first font[size=-1] element inside table +id+, or nil.
    def parse_serp_content(id)
      get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
    end

    # Build the result Hash for one result table and append it to @results.
    #
    # id    - Integer table id; ids above 3000 are treated as ads.
    # table - the table node.
    def parse_serp_table(id,table)
      result = {
        :rank => id,
        :url => parse_serp_url(id),
        # nil-safe: a table without an h3 yields a nil title
        :title => get_content_safe(table.css('h3')),
        :content => parse_serp_content(id)
      }

      if id > 3000 # sem ads
        result[:paid] = 1
      else # organic results
        result[:paid] = 0

        # baidu open
        opens_baidu = table.css('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
        result[:baiduopen] = 1 if opens_baidu

        # baidu table mu attr (for maps,baike)
        result[:mu] = table['mu'] unless table['mu'].nil?
      end

      @results << result
    end

    # parse baidu serp attrs : result_num, baidubrand, related_keywords
    def parse_serp_attrs
      @serp[:result_num] = parse_serp_result_num
      @serp[:baidubrand] = parse_serp_baidu_brand
      @serp[:related_keywords] = parse_serp_related_search
    end

    # Link texts found inside div#rs.
    def parse_serp_related_search
      @doc.search('div[@id="rs"]').flat_map do |rs|
        rs.css('a').map { |link| link.content }
      end
    end

    # 1 when the raw HTML references 'bs.baidu.com/adcoup-mat', otherwise 0.
    def parse_serp_baidu_brand
      @html.include?('bs.baidu.com/adcoup-mat') ? 1 : 0
    end

    # Result count as an Integer, taken from the first BAIDU_RESULT match in
    # the raw HTML; 0 when the page has no such text.
    #
    # "约" and thousands separators are stripped before conversion, and a
    # "万" suffix multiplies the part before it by 10000.
    def parse_serp_result_num
      str = @html[BAIDU_RESULT, 1]
      return 0 if str.nil?

      str = str.gsub(/[约,]/, '')
      if str.include?('万')
        wan, rest = str.split('万', 2)
        return wan.to_i * 10000 + rest.to_i
      end
      str.to_i
    end

  end
end
|
169
|
+
|
170
|
+
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baiduserp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- MingQian Zhang
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: Parse Baidu SERP result page.
|
31
|
+
email:
|
32
|
+
- zmingqian@qq.com
|
33
|
+
executables:
|
34
|
+
- baiduserp
|
35
|
+
extensions: []
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- lib/baiduserp/parser.rb
|
39
|
+
- lib/baiduserp/version.rb
|
40
|
+
- lib/baiduserp.rb
|
41
|
+
- bin/baiduserp
|
42
|
+
- README.md
|
43
|
+
homepage: http://www.baidu.com/
|
44
|
+
licenses: []
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.8.24
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Baidu SERP
|
67
|
+
test_files: []
|
68
|
+
has_rdoc:
|