baiduserp 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ # Baiduserp
2
+
3
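+ Baiduserp fetches and parses Baidu search engine result pages (SERP), returning the organic results, paid ads, estimated result count and related keywords as a Ruby hash.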
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'baiduserp'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install baiduserp
18
+
19
+ ## Usage
20
+
21
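+ From Ruby, call `Baiduserp.search('keyword')`, `Baiduserp.parse(html)` or `Baiduserp.parse_file('path/to/file.html')`. From the command line, run `baiduserp -s 'keyword'` to search and print the parsed result, add `-j` to print it as JSON or `-o output.json` to save it as JSON, or run `baiduserp -f 'file path'` to parse a saved HTML page.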
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'baiduserp'
4
+ require 'optparse'
5
+ require 'json'
6
+ require 'pp'
7
+
8
# Command-line front end for Baiduserp.
#
# Either searches Baidu for a keyword (-s) or parses a saved HTML page (-f),
# then pretty-prints the parsed result, prints it as JSON (-j), or writes it
# as JSON to a file (-o).
usage = "Usage:
1. baiduserp -s 'keyword' # search 'keyword' and print parse result
2. baiduserp -s 'keyword' -o output.json # -o means save result to a file
3. baiduserp -f 'file path' # parse html source code from file
4. baiduserp -s 'keyword' -j # search 'keyword' and print parse result in JSON format
"

options = {}
OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-s Keyword", "--search Keyword", "Search Keyword & Parse SERP") do |v|
    options[:keyword] = v
  end

  opts.on("-j", "--jsonprint", "Print result in JSON format") do |v|
    options[:jsonprint] = v
  end

  opts.on("-o Output", "--output Output", "Save Result to File in JSON format") do |v|
    options[:output] = v
  end

  opts.on("-f File", "--file File", "Parse Local File") do |v|
    options[:file] = v
  end
end.parse!

# Without a keyword or a file there is nothing to parse; show the usage text
# instead of silently searching for an empty keyword.
abort usage if options[:keyword].nil? && options[:file].nil?

# A file given with -f takes precedence over a keyword given with -s.
result = if options[:file]
           Baiduserp.parse_file(options[:file])
         else
           Baiduserp.search(options[:keyword])
         end

if options[:output]
  # Block form closes (and flushes) the output file as soon as it is written.
  File.open(options[:output], 'w') { |file| file.puts result.to_json }
elsif options[:jsonprint]
  puts result.to_json
else
  pp result
end
53
+
@@ -0,0 +1,16 @@
1
+ require "baiduserp/version"
2
+ require 'baiduserp/parser'
3
+
4
# Module-level convenience API. Every call builds a fresh Parser and
# delegates to the instance method of the same name.
module Baiduserp
  class << self
    # Delegates to Parser#search with +keyword+ and returns its result.
    def search(keyword)
      parser = Parser.new
      parser.search(keyword)
    end

    # Delegates to Parser#parse with +html+ and returns its result.
    def parse(html)
      parser = Parser.new
      parser.parse(html)
    end

    # Delegates to Parser#parse_file with +file_path+ and returns its result.
    def parse_file(file_path)
      parser = Parser.new
      parser.parse_file(file_path)
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'open-uri'
5
+
6
module Baiduserp
  # Fetches and parses Baidu search engine result pages (SERPs).
  #
  # #parse returns a Hash with these keys:
  #   :serp_results     - Array of result Hashes (:rank, :url, :title,
  #                       :content, :paid, and for organic results optionally
  #                       :baiduopen and :mu)
  #   :result_num       - Integer result count read from the page text
  #   :baidubrand       - 1 or 0
  #   :related_keywords - Array of link texts found under div#rs
  class Parser
    # Captures the text between "找到相关结果" and the nearest following "个".
    # Non-greedy, so a later "个" on the same line is not swallowed.
    BAIDU_RESULT = /找到相关结果(.*?)个/

    # Parses SERP HTML.
    #
    # @param html [String] page source
    # @return [Hash] parsed page, see the class comment for the keys
    def parse(html)
      @html = html
      @doc = Nokogiri::HTML(@html)
      @results = []
      @serp = {}
      parse_serp_results
      parse_serp_attrs
      @serp
    end

    # Downloads the Baidu result page for +keyword+ and parses it.
    #
    # The keyword is form-encoded, so characters such as "&", "#" and "+"
    # stay part of the query instead of altering the URL.
    #
    # @param keyword [String] search terms
    # @return [Hash] same structure as #parse
    def search(keyword)
      url = "http://www.baidu.com/s?wd=#{URI.encode_www_form_component(keyword)}"
      parse(fetch_url(url).encode('UTF-8'))
    end

    # Parses a page read from a local file or, when no such file exists,
    # downloaded from +file_path+ treated as a URL.
    #
    # @param file_path [String] local path or URL
    # @return [Hash] same structure as #parse
    # @raise [Errno::ENOENT] if +file_path+ is neither an existing file nor
    #   a URL that can be downloaded
    def parse_file(file_path)
      html = if File.exist?(file_path)
               # File.read closes the handle once the content is loaded.
               File.read(file_path)
             else
               # Escape unsafe characters (spaces, non-ASCII) like the removed
               # URI.escape did.
               fetch_url(URI::DEFAULT_PARSER.escape(file_path))
             end
      parse(html.encode('UTF-8'))
    end

    private

    # Downloads an already escaped URL through open-uri and returns the body.
    # Strings that are not downloadable URLs are reported as a missing file,
    # which is what opening them as a path would do.
    def fetch_url(url)
      raise Errno::ENOENT, url unless url =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      uri = URI.parse(url)
      raise Errno::ENOENT, url unless uri.respond_to?(:read)
      uri.read
    end

    # Returns the content of the first node in +noko+, or nil when +noko+ is
    # nil or empty.
    def get_content_safe(noko)
      return nil if noko.nil? || noko.empty?
      noko.first.content
    end

    # Tables whose id is above 3000 are treated as SEM (paid) ads; the other
    # positive ids are organic results.
    def sem_ad?(id)
      id > 3000
    end

    # Collects the left-side results (tables with a positive numeric id) and
    # the right-side ads, then stores them under @serp[:serp_results].
    def parse_serp_results
      @doc.search('//table').each do |table|
        id = table['id'].to_i
        parse_serp_table(id, table) if id > 0
      end
      parse_right_side_ads
      @serp[:serp_results] = @results
    end

    # Appends one result (:paid => 2) per right-side ad block. The rank is the
    # number in the block's "bdfsN" id plus one. Fields are looked up inside
    # the matched block, and any missing piece is left as nil.
    def parse_right_side_ads
      @doc.search("//div[@class='EC_fr EC_PP']").each do |ad|
        index = ad['id'].to_s.sub('bdfs', '').to_i
        @results << {
          :paid => 2,
          :rank => index + 1,
          :url => get_content_safe(ad.search(".//font[@size='-1' and @color='#008000']")),
          :title => get_content_safe(ad.search('.//a')),
          :content => get_content_safe(ad.search(".//font[@size='-1']"))
        }
      end
    end

    # Returns the first whitespace-separated token of +str+ that contains a
    # dot, or nil. [[:space:]] also covers non-breaking and ideographic spaces.
    def get_url_part_from_string(str)
      str.split(/[[:space:]]+/).find { |token| token.include?('.') }
    end

    # Returns the display URL of the result table with the given id, or nil.
    # Candidate XPath expressions are tried in order and the search stops at
    # the first one that yields a URL-like token.
    def parse_serp_url(table_id)
      id = table_id
      link_types = if sem_ad?(id)
                     ["//table[@id='#{id}']//font[@size='-1' and @color='#008000']"]
                   else
                     ["//table[@id='#{id}']//span[@class='g']",
                      "//table[@id='#{id}']//font[@color='#008000']",
                      "//table[@id='#{id}']//span[@style='color:#008000']",
                      "//table[@id='#{id}']//span[@style='color:#008000;']",
                      "//table[@id='#{id}']//span[@color='#008000']",
                      "//table[@id='#{id}']//p[@class='g']",
                      "//table[@id='#{id}']//cite[@color='#008000']",
                      "//table[@id='#{id}']//cite",
                      "//table[@id='#{id}']//span[@id='ala_img_desc']"]
                   end
      link_types.each do |link_type|
        node = @doc.search(link_type).first
        next if node.nil?
        url = get_url_part_from_string(node.content)
        return url unless url.nil?
      end
      nil
    end

    # Not implemented: always returns nil. Titles are read in
    # #parse_serp_table from the table's first h3.
    def parse_serp_title(id)
    end

    # Returns the text of the first <font size="-1"> inside the result table
    # with the given id, or nil.
    def parse_serp_content(id)
      get_content_safe(@doc.search("//table[@id='#{id}']//font[@size='-1']"))
    end

    # Builds the result Hash for one left-side table and appends it to
    # @results. :paid is 1 for SEM ads and 0 for organic results.
    def parse_serp_table(id, table)
      result = {
        :rank => id,
        :url => parse_serp_url(id),
        # nil rather than a crash when the table has no <h3>
        :title => get_content_safe(table.css('h3')),
        :content => parse_serp_content(id)
      }

      if sem_ad?(id)
        result[:paid] = 1
      else
        result[:paid] = 0

        # baidu open: any link pointing at open.baidu.com
        opens = table.css('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
        result[:baiduopen] = 1 if opens

        # baidu table mu attr (for maps, baike)
        result[:mu] = table['mu'] unless table['mu'].nil?
      end

      @results << result
    end

    # Fills the page-level attributes: result_num, baidubrand, related_keywords.
    def parse_serp_attrs
      @serp[:result_num] = parse_serp_result_num
      @serp[:baidubrand] = parse_serp_baidu_brand
      @serp[:related_keywords] = parse_serp_related_search
    end

    # Returns the text of every link inside div#rs.
    def parse_serp_related_search
      @doc.search('div[@id="rs"]').flat_map do |rs|
        rs.css('a').map { |link| link.content }
      end
    end

    # Returns 1 when the page source mentions 'bs.baidu.com/adcoup-mat',
    # otherwise 0.
    def parse_serp_baidu_brand
      @html.include?('bs.baidu.com/adcoup-mat') ? 1 : 0
    end

    # Returns the result count from the first BAIDU_RESULT match, or 0 when
    # the page has none. "约" and thousands separators are dropped, and
    # "N万M" is read as N * 10000 + M.
    def parse_serp_result_num
      str = @html[BAIDU_RESULT, 1].to_s.delete('约,')
      if str.include?('万')
        wan, rest = str.split('万', 2)
        return wan.to_i * 10_000 + rest.to_i
      end
      str.to_i
    end
  end
end
169
+
170
+
@@ -0,0 +1,3 @@
1
module Baiduserp
  # Gem version string. Frozen so the shared constant cannot be mutated
  # in place by callers.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: baiduserp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - MingQian Zhang
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Parse Baidu SERP result page.
31
+ email:
32
+ - zmingqian@qq.com
33
+ executables:
34
+ - baiduserp
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/baiduserp/parser.rb
39
+ - lib/baiduserp/version.rb
40
+ - lib/baiduserp.rb
41
+ - bin/baiduserp
42
+ - README.md
43
+ homepage: http://www.baidu.com/
44
+ licenses: []
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubyforge_project:
63
+ rubygems_version: 1.8.24
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: Baidu SERP
67
+ test_files: []
68
+ has_rdoc: