baiduserp 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,29 @@
1
+ # Baiduserp
2
+
3
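+ Baiduserp parses Baidu search engine result pages (SERPs) into a Ruby Hash, either from a live keyword search or from saved HTML.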
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'baiduserp'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install baiduserp
18
+
19
+ ## Usage
20
+
21
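+ From Ruby:
+
+     require 'baiduserp'
+
+     Baiduserp.search('keyword')        # fetch the Baidu result page for a keyword and parse it
+     Baiduserp.parse_file('page.html')  # parse a saved result page
+     Baiduserp.parse(html)              # parse an HTML string
+
+ From the command line:
+
+     $ baiduserp -s 'keyword'                 # search and pretty-print the parsed result
+     $ baiduserp -s 'keyword' -j              # print the parsed result as JSON
+     $ baiduserp -s 'keyword' -o output.json  # save the parsed result to a file as JSON
+     $ baiduserp -f page.html                 # parse HTML source from a local file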
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'baiduserp'
4
+ require 'optparse'
5
+ require 'json'
6
+ require 'pp'
7
+
8
# Command line front end for Baiduserp.
#
# Exactly one input is used: -f (a saved page) takes precedence over -s (a
# live search). The parsed result is written to the -o file as JSON when -o
# is given, otherwise printed as JSON (-j) or pretty-printed (default).
usage = "Usage:
1. baiduserp -s 'keyword' # search 'keyword' and print parse result
2. baiduserp -s 'keyword' -o output.json # -o means save result to a file
3. baiduserp -f 'file path' # parse html source code from file
4. baiduserp -s 'keyword' -j # search 'keyword' and print parse result in JSON format
"

options = {}
parser = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-s Keyword", "--search Keyword", "Search Keyword & Parse SERP") do |v|
    options[:keyword] = v
  end

  opts.on("-j", "--jsonprint", "Print result in JSON format") do |v|
    options[:jsonprint] = v
  end

  opts.on("-o Output", "--output Output", "Save Result to File in JSON format") do |v|
    options[:output] = v
  end

  opts.on("-f File", "--file File", "Parse Local File") do |v|
    options[:file] = v
  end
end
parser.parse!

# Without a keyword or a file there is nothing to parse; show the usage text
# instead of silently running a search for an empty keyword.
abort parser.help if options[:keyword].nil? && options[:file].nil?

result = if options[:file]
           Baiduserp.parse_file(options[:file])
         else
           Baiduserp.search(options[:keyword])
         end

if options[:output]
  # Block form closes (and therefore flushes) the file even if serialising
  # fails; File.open also never treats a leading "|" as a command to run.
  File.open(options[:output], 'w') { |file| file.puts result.to_json }
elsif options[:jsonprint]
  puts result.to_json
else
  pp result
end
+
@@ -0,0 +1,16 @@
1
+ require "baiduserp/version"
2
+ require 'baiduserp/parser'
3
+
4
# Module-level convenience API. Each call builds a fresh Parser and forwards
# its single argument to the parser method of the same name, returning
# whatever that method returns.
module Baiduserp
  class << self
    # Forwards +keyword+ to Parser#search.
    def search(keyword)
      Parser.new.search(keyword)
    end

    # Forwards +html+ to Parser#parse.
    def parse(html)
      Parser.new.parse(html)
    end

    # Forwards +file_path+ to Parser#parse_file.
    def parse_file(file_path)
      Parser.new.parse_file(file_path)
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'open-uri'
5
+
6
module Baiduserp
  # Parses a Baidu search engine result page (SERP) into a plain Hash.
  #
  # #parse, #search and #parse_file all return a Hash with these keys:
  #   :serp_results     - Array of result Hashes, each with :rank, :url,
  #                       :title, :content and :paid (0 = organic result,
  #                       1 = ad whose table id is above 3000, 2 = right side
  #                       ad); organic results may also carry :baiduopen
  #                       and :mu.
  #   :result_num       - Integer parsed from the "找到相关结果...个" text,
  #                       0 when that text is absent.
  #   :baidubrand       - 1 when the page references
  #                       'bs.baidu.com/adcoup-mat', otherwise 0.
  #   :related_keywords - Array of link texts found inside div#rs.
  #
  # A Parser keeps the state of the last parsed page in instance variables,
  # so a single instance should not be shared between concurrent parses.
  class Parser
    # Captures the text between "找到相关结果" and the nearest following "个".
    BAIDU_RESULT = /找到相关结果(.*?)个/

    # Search endpoint; the URL-encoded keyword is appended to it.
    SEARCH_URL = 'http://www.baidu.com/s?wd='.freeze

    # Strings starting with one of these schemes are fetched over the network
    # by #parse_file; anything else is treated as a local path.
    REMOTE_URL = %r{\A(?:https?|ftp)://}i

    # Characters percent-encoded by #escape_url: everything outside the URI
    # reserved/unreserved sets, plus any "%" that does not already start a
    # percent-escape (so encoded URLs are not double-encoded).
    UNSAFE_URL_CHARS = /%(?![0-9A-Fa-f]{2})|[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/

    # XPath suffixes (relative to a result table) tried in order to locate the
    # displayed URL of an ad table (id above 3000).
    PAID_URL_SELECTORS = [
      "//font[@size='-1' and @color='#008000']"
    ].freeze

    # Same as PAID_URL_SELECTORS, for organic result tables.
    ORGANIC_URL_SELECTORS = [
      "//span[@class='g']",
      "//font[@color='#008000']",
      "//span[@style='color:#008000']",
      "//span[@style='color:#008000;']",
      "//span[@color='#008000']",
      "//p[@class='g']",
      "//cite[@color='#008000']",
      "//cite",
      "//span[@id='ala_img_desc']"
    ].freeze

    # Parses an HTML string.
    #
    # html - String containing the source of a Baidu result page.
    #
    # Returns the SERP Hash described on the class.
    def parse(html)
      @html = html
      @doc = Nokogiri::HTML(@html)
      @results = []
      @serp = {}
      parse_serp_results
      parse_serp_attrs
      @serp
    end

    # Fetches the Baidu result page for a keyword and parses it.
    #
    # keyword - search term; it is query-encoded, so characters such as
    #           "&", "#" or "+" stay part of the keyword.
    #
    # Returns the SERP Hash described on the class.
    def search(keyword)
      query = URI.encode_www_form_component(keyword.to_s)
      parse fetch_remote("#{SEARCH_URL}#{query}").encode('UTF-8')
    end

    # Parses a page stored in a local file or reachable at a URL.
    #
    # file_path - path of an existing file, or an http/https/ftp URL.
    #
    # Returns the SERP Hash described on the class.
    # Raises Errno::ENOENT when file_path is neither an existing file nor a
    # URL with one of the supported schemes.
    def parse_file(file_path)
      html = if File.exist?(file_path) || file_path !~ REMOTE_URL
               # File.read closes the handle; a missing path raises ENOENT.
               File.read(file_path)
             else
               fetch_remote(escape_url(file_path))
             end
      parse html.encode('UTF-8')
    end

    private

    # Downloads +url+ through open-uri and returns the response body.
    def fetch_remote(url)
      URI.parse(url).read
    end

    # Percent-encodes the characters of +url+ matched by UNSAFE_URL_CHARS
    # (for example spaces and non-ASCII text), byte by byte.
    def escape_url(url)
      url.gsub(UNSAFE_URL_CHARS) do |char|
        char.bytes.map { |byte| format('%%%02X', byte) }.join
      end
    end

    # Returns the text content of the first node in +noko+, or nil when
    # +noko+ is nil or empty.
    def get_content_safe(noko)
      return nil if noko.nil?
      return nil if noko.empty?
      noko.first.content
    end

    # Collects every result into @results and stores them under
    # @serp[:serp_results]: tables with a positive numeric id first, then
    # the right side ads.
    def parse_serp_results
      # left side results
      @doc.search("//table").each do |table|
        id = table['id'].to_i
        parse_serp_table(id, table) if id > 0
      end
      # right side ads
      parse_right_side_ads
      @serp[:serp_results] = @results
    end

    # Appends one result (:paid => 2) per div with class 'EC_fr EC_PP'.
    # The rank is the number in the div id ("bdfs<N>") plus one.
    def parse_right_side_ads
      @doc.search("//div[@class='EC_fr EC_PP']").each do |ad|
        id = ad['id'].to_s.sub('bdfs', '').to_i
        scope = "//div[@id='bdfs#{id}' and @class='EC_fr EC_PP']"
        @results << {
          :paid => 2,
          :rank => id + 1,
          # Looked up through get_content_safe so an ad without the green
          # URL line yields nil instead of raising NoMethodError.
          :url => get_content_safe(@doc.search("#{scope}//font[@size='-1' and @color='#008000']")),
          :title => get_content_safe(@doc.search("#{scope}//a")),
          :content => get_content_safe(@doc.search("#{scope}//font[@size='-1']"))
        }
      end
    end

    # Returns the first whitespace-separated token of +str+ that contains a
    # dot, or nil when there is none. [[:space:]] also matches non-breaking
    # spaces, which would otherwise glue the URL to the text after it.
    def get_url_part_from_string(str)
      str.to_s.split(/[[:space:]]+/).find { |token| token.include?('.') }
    end

    # Returns the displayed URL of the result table with id +table_id+: the
    # first selector whose first matching node contains a URL-like token
    # wins. Returns nil when no selector yields one.
    def parse_serp_url(table_id)
      scope = "//table[@id='#{table_id}']"
      selectors = table_id > 3000 ? PAID_URL_SELECTORS : ORGANIC_URL_SELECTORS
      selectors.each do |selector|
        node = @doc.search("#{scope}#{selector}").first
        next if node.nil?
        url = get_url_part_from_string(node.content)
        return url unless url.nil?
      end
      nil
    end

    # Returns the text of the first h3 of a result table, or nil when the
    # table has none. The heading is searched inside +table+ when given,
    # otherwise inside the table whose id attribute equals +id+.
    def parse_serp_title(id, table = nil)
      headings = table ? table.css('h3') : @doc.search("//table[@id='#{id}']//h3")
      get_content_safe(headings)
    end

    # Returns the text of the first font[size=-1] node inside the table with
    # id +id+, or nil when there is none.
    def parse_serp_content(id)
      get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
    end

    # Builds the result Hash for one left side table and appends it to
    # @results.
    def parse_serp_table(id, table)
      result = {}
      result[:rank] = id
      result[:url] = parse_serp_url(id)
      result[:title] = parse_serp_title(id, table)
      result[:content] = parse_serp_content(id)

      if id > 3000 # sem ads
        result[:paid] = 1
      else # organic results
        result[:paid] = 0

        # baidu open
        opens_baidu = table.css('a').any? do |link|
          link['href'].to_s.include?('open.baidu.com')
        end
        result[:baiduopen] = 1 if opens_baidu

        # baidu table mu attr (for maps,baike)
        result[:mu] = table['mu'] unless table['mu'].nil?
      end

      @results << result
    end

    # parse baidu serp attrs : result_num, baidubrand, related_keywords
    def parse_serp_attrs
      @serp[:result_num] = parse_serp_result_num
      @serp[:baidubrand] = parse_serp_baidu_brand
      @serp[:related_keywords] = parse_serp_related_search
    end

    # Returns the text of every link inside div#rs, in document order.
    def parse_serp_related_search
      @doc.search('div[@id="rs"]').flat_map do |rs|
        rs.css('a').map { |link| link.content }
      end
    end

    # Returns 1 when the raw HTML mentions 'bs.baidu.com/adcoup-mat',
    # otherwise 0.
    def parse_serp_baidu_brand
      @html.include?('bs.baidu.com/adcoup-mat') ? 1 : 0
    end

    # Returns the result count announced on the page as an Integer, or 0
    # when the announcement is missing.
    #
    # The leading "约" and thousands separators are removed before any
    # conversion, and "万" multiplies the number before it by 10,000
    # ("1,234万" => 12_340_000, "12.5万" => 125_000).
    def parse_serp_result_num
      announced = @html.to_s[BAIDU_RESULT, 1]
      return 0 if announced.nil?

      digits = announced.delete('约,').strip
      if digits.include?('万')
        wan, remainder = digits.split('万', 2)
        return (wan.to_f * 10_000).round + remainder.to_i
      end
      digits.to_i
    end
  end
end
169
+
170
+
@@ -0,0 +1,3 @@
1
module Baiduserp
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: baiduserp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - MingQian Zhang
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Parse Baidu SERP result page.
31
+ email:
32
+ - zmingqian@qq.com
33
+ executables:
34
+ - baiduserp
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/baiduserp/parser.rb
39
+ - lib/baiduserp/version.rb
40
+ - lib/baiduserp.rb
41
+ - bin/baiduserp
42
+ - README.md
43
+ homepage: http://www.baidu.com/
44
+ licenses: []
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubyforge_project:
63
+ rubygems_version: 1.8.24
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: Baidu SERP
67
+ test_files: []
68
+ has_rdoc: