baiduserp 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +29 -0
- data/bin/baiduserp +53 -0
- data/lib/baiduserp.rb +16 -0
- data/lib/baiduserp/parser.rb +170 -0
- data/lib/baiduserp/version.rb +3 -0
- metadata +68 -0
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Baiduserp
|
2
|
+
|
3
|
+
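Baiduserp parses a Baidu search engine results page (SERP) into a Ruby hash containing the individual results, the reported result count and the related keywords.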
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'baiduserp'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install baiduserp
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
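From the command line, run `baiduserp -s 'keyword'` to search and print the parsed result, or `baiduserp -f 'file path'` to parse a saved page; add `-j` to print JSON or `-o output.json` to save JSON to a file. From Ruby, call `Baiduserp.search(keyword)`, `Baiduserp.parse(html)` or `Baiduserp.parse_file(file_path)`.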
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/bin/baiduserp
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'baiduserp'
|
4
|
+
require 'optparse'
|
5
|
+
require 'json'
|
6
|
+
require 'pp'
|
7
|
+
|
8
|
+
# Command-line front end for Baiduserp.
#
# Either searches Baidu for a keyword (-s) or parses a saved page (-f), then
# pretty-prints the result, prints it as JSON (-j), or saves it as JSON to a
# file (-o).
usage = "Usage:
1. baiduserp -s 'keyword' # search 'keyword' and print parse result
2. baiduserp -s 'keyword' -o output.json # -o means save result to a file
3. baiduserp -f 'file path' # parse html source code from file
4. baiduserp -s 'keyword' -j # search 'keyword' and print parse result in JSON format
"

options = {}
OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-s Keyword", "--search Keyword", "Search Keyword & Parse SERP") do |v|
    options[:keyword] = v
  end

  opts.on("-j","--jsonprint","Print result in JSON format") do |v|
    options[:jsonprint] = v
  end

  opts.on("-o Output", "--output Output", "Save Result to File in JSON format") do |v|
    options[:output] = v
  end

  opts.on("-f File", "--file File", "Parse Local File") do |v|
    options[:file] = v
  end
end.parse!

# Without a keyword or a file there is nothing to parse; show the usage text
# instead of searching for an empty keyword.
abort usage if options[:keyword].nil? && options[:file].nil?

# A file path takes precedence over a keyword when both are supplied.
result =
  if options[:file].nil?
    Baiduserp.search options[:keyword]
  else
    Baiduserp.parse_file options[:file]
  end

if options[:output].nil?
  if options[:jsonprint].nil?
    pp result
  else
    puts result.to_json
  end
else
  # File.open with a block closes (and therefore flushes) the file, and unlike
  # Kernel#open it never treats a path starting with "|" as a command to run.
  File.open(options[:output], 'w') { |file| file.puts result.to_json }
end
|
53
|
+
|
data/lib/baiduserp.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "baiduserp/version"
|
2
|
+
require 'baiduserp/parser'
|
3
|
+
|
4
|
+
# Module-level convenience API: every call builds a fresh Parser and forwards
# its single argument to the Parser method of the same name.
module Baiduserp
  class << self
    # Forwards +keyword+ to Parser#search and returns its result.
    def search(keyword)
      Parser.new.search(keyword)
    end

    # Forwards the +html+ string to Parser#parse and returns its result.
    def parse(html)
      Parser.new.parse(html)
    end

    # Forwards +file_path+ to Parser#parse_file and returns its result.
    def parse_file(file_path)
      Parser.new.parse_file(file_path)
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'uri'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
module Baiduserp
|
7
|
+
class Parser
|
8
|
+
BAIDU_RESULT = /找到相关结果(.*)个/
|
9
|
+
|
10
|
+
# Parses the SERP markup in +html+ and returns the populated @serp hash.
#
# All per-parse state (@html, @doc, @results, @serp) is reset first, then
# parse_serp_results and parse_serp_attrs fill @serp in that order.
def parse(html)
  @html, @results, @serp = html, [], {}
  @doc = Nokogiri::HTML(@html)

  parse_serp_results
  parse_serp_attrs

  @serp
end
|
19
|
+
|
20
|
+
# Searches Baidu for +keyword+ and returns the parsed result page.
#
# The keyword is form-encoded before it is placed in the query string, so
# characters such as "&", "#", "+" and "=" are sent as part of the keyword
# instead of changing the structure of the URL.
def search(keyword)
  query = URI.encode_www_form_component(keyword.to_s)
  parse(read_remote("http://www.baidu.com/s?wd=#{query}").encode('UTF-8'))
end

# Parses the page found at +file_path+ and returns the parsed result.
#
# A path that exists on disk is read as a local file; anything else is treated
# as a URL, percent-escaped and downloaded. The content is converted to UTF-8
# before it is handed to #parse.
def parse_file(file_path)
  html =
    if File.exist?(file_path) # File.exists? was removed in Ruby 3.2
      File.read(file_path)    # reads and closes the file
    else
      # URI::RFC2396_Parser#escape is the surviving equivalent of URI.escape,
      # which was removed in Ruby 3.0.
      read_remote(URI::RFC2396_Parser.new.escape(file_path))
    end
  parse(html.encode('UTF-8'))
end

# Downloads +url+ through open-uri and returns the response body as a String.
# The block form lets open-uri close the underlying IO once it has been read.
def read_remote(url)
  if URI.respond_to?(:open)
    URI.open(url, &:read)
  else
    open(url, &:read) # older Rubies only offer open-uri through Kernel#open
  end
end
private :read_remote
|
33
|
+
|
34
|
+
private
|
35
|
+
# Returns the +content+ of the first element of +noko+, or nil when +noko+ is
# nil or empty.
def get_content_safe(noko)
  if noko.nil? || noko.empty?
    nil
  else
    noko.first.content
  end
end
|
40
|
+
|
41
|
+
# Collects every result on the page into @results and stores that array in
# @serp[:serp_results].
def parse_serp_results
  # Left side results: only tables whose id converts to a positive integer
  # are handed to parse_serp_table.
  @doc.search("//table").each do |node|
    rank = node['id'].to_i
    next unless rank > 0

    parse_serp_table(rank, node)
  end

  # Right side ads.
  parse_right_side_ads

  @serp[:serp_results] = @results
end
|
51
|
+
|
52
|
+
# Appends one entry to @results for every right-hand ad block, i.e. every div
# whose class is exactly 'EC_fr EC_PP'.
#
# The numeric part of the div id ("bdfs<N>") gives the rank as N + 1. Each
# entry is marked with :paid => 2. :url, :title and :content are nil when the
# corresponding node is missing, so an ad without the green URL line no
# longer raises NoMethodError.
def parse_right_side_ads
  @doc.search("//div[@class='EC_fr EC_PP']").each do |ad|
    id = ad['id'].to_s.sub('bdfs', '').to_i
    scope = "//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']"

    @results << {
      :paid => 2,
      :rank => id + 1,
      :url => get_content_safe(@doc.search("#{scope}//font[@size='-1' and @color='#008000']")),
      :title => get_content_safe(@doc.search("#{scope}//a")),
      :content => get_content_safe(@doc.search("#{scope}//font[@size='-1']"))
    }
  end
end
|
62
|
+
|
63
|
+
# Returns the first whitespace-separated token of +str+ that contains a dot,
# or nil when no token does.
#
# [[:space:]] matches Unicode whitespace such as the no-break space and the
# ideographic space as well as everything \s matches, and it spells the
# separators out instead of relying on an invisible literal character in the
# pattern.
def get_url_part_from_string(str)
  str.split(/[[:space:]]+/).find { |token| token.include?('.') }
end
|
69
|
+
|
70
|
+
# Returns the displayed URL text for the result table with id +table_id+, or
# nil when none of the known markup variants yields one.
#
# Tables with an id above 3000 are looked up with a single selector; all other
# tables are tried against a list of selectors in order. For each selector the
# first matching node's text is passed to get_url_part_from_string, and the
# first non-nil answer is returned immediately, so later XPath queries are
# skipped once a URL has been found.
def parse_serp_url(table_id)
  scope = "//table[@id='#{table_id}']"
  selectors =
    if table_id > 3000
      ["//font[@size='-1' and @color='#008000']"]
    else
      [
        "//span[@class='g']",
        "//font[@color='#008000']",
        "//span[@style='color:#008000']",
        "//span[@style='color:#008000;']",
        "//span[@color='#008000']",
        "//p[@class='g']",
        "//cite[@color='#008000']",
        "//cite",
        "//span[@id='ala_img_desc']"
      ]
    end

  selectors.each do |selector|
    node = @doc.search(scope + selector).first
    next if node.nil?

    url = get_url_part_from_string(node.content)
    return url unless url.nil?
  end
  nil
end
|
97
|
+
|
98
|
+
# Unimplemented placeholder: ignores +id+ and always returns nil.
def parse_serp_title(id)
  nil
end
|
101
|
+
|
102
|
+
# Returns the text of the first <font size="-1"> node inside the table whose
# id is +id+, or nil when there is no such node.
def parse_serp_content(id)
  snippet_nodes = @doc.search("//table[@id='#{id}']//font[@size='-1']")
  get_content_safe(snippet_nodes)
end
|
105
|
+
|
106
|
+
# Builds the result hash for one left-hand result table and appends it to
# @results.
#
# +id+ is the table's numeric id and is stored as :rank; +table+ is the table
# node itself. :title is the text of the table's first <h3>, or nil when the
# table has none (previously that case raised NoMethodError).
def parse_serp_table(id, table)
  result = {
    :rank => id,
    :url => parse_serp_url(id),
    :title => get_content_safe(table.css('h3')),
    :content => parse_serp_content(id)
  }

  if id > 3000 # sem ads
    result[:paid] = 1
  else # organic results
    result[:paid] = 0

    # baidu open
    opens_baidu = table.css('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
    result[:baiduopen] = 1 if opens_baidu

    # baidu table mu attr (for maps,baike)
    result[:mu] = table['mu'] unless table['mu'].nil?
  end

  @results << result
end
|
130
|
+
|
131
|
+
# Stores the page-level attributes in @serp under :result_num, :baidubrand
# and :related_keywords, computed in that order by the matching helpers.
def parse_serp_attrs
  @serp.store(:result_num, parse_serp_result_num)
  @serp.store(:baidubrand, parse_serp_baidu_brand)
  @serp.store(:related_keywords, parse_serp_related_search)
end
|
137
|
+
|
138
|
+
# Returns an array with the text of every link found inside the div(s) whose
# id is "rs"; the array is empty when there is no such div.
def parse_serp_related_search
  @doc.search('div[@id="rs"]').flat_map do |related_box|
    related_box.css('a').map { |anchor| anchor.content }
  end
end
|
147
|
+
|
148
|
+
# Returns 1 when the raw page source contains 'bs.baidu.com/adcoup-mat',
# otherwise 0.
def parse_serp_baidu_brand
  @html.include?('bs.baidu.com/adcoup-mat') ? 1 : 0
end
|
156
|
+
|
157
|
+
# Returns the number of results the page reports, as an Integer (0 when the
# BAIDU_RESULT pattern does not match the page source).
#
# Only the first match of BAIDU_RESULT is used; joining several matches would
# run unrelated digits together. The '约' prefix and thousands separators are
# removed before any arithmetic, so "1,234万" is read as 1234 * 10000 rather
# than 1 * 10000. A value written with '万' is the part before it times
# 10000 plus the part after it.
def parse_serp_result_num
  digits = @html[BAIDU_RESULT, 1].to_s.delete('约,')
  return digits.to_i unless digits.include?('万')

  wan, remainder = digits.split('万', 2)
  wan.to_i * 10_000 + remainder.to_i
end
|
166
|
+
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baiduserp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- MingQian Zhang
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: Parse Baidu SERP result page.
|
31
|
+
email:
|
32
|
+
- zmingqian@qq.com
|
33
|
+
executables:
|
34
|
+
- baiduserp
|
35
|
+
extensions: []
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- lib/baiduserp/parser.rb
|
39
|
+
- lib/baiduserp/version.rb
|
40
|
+
- lib/baiduserp.rb
|
41
|
+
- bin/baiduserp
|
42
|
+
- README.md
|
43
|
+
homepage: http://www.baidu.com/
|
44
|
+
licenses: []
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.8.24
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Baidu SERP
|
67
|
+
test_files: []
|
68
|
+
has_rdoc:
|