serper 0.1.0
- checksums.yaml +7 -0
- data/README.md +1 -0
- data/bin/serper +59 -0
- data/lib/serper.rb +26 -0
- data/lib/serper/analyser.rb +112 -0
- data/lib/serper/baidu/crawler.rb +7 -0
- data/lib/serper/baidu/parser.rb +185 -0
- data/lib/serper/baidu/weight.rb +144 -0
- data/lib/serper/crawler.rb +84 -0
- data/lib/serper/helper.rb +79 -0
- data/lib/serper/parser.rb +77 -0
- data/lib/serper/user_agents.yml +183 -0
- data/lib/serper/version.rb +3 -0
- metadata +141 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 149badc447dec8ed55714a9ed6768e3f85a9b94e
  data.tar.gz: 8204acacee0b068b61421c0bc59507ea30717895
SHA512:
  metadata.gz: d329ccb1dbf584a4a100bf945740a5ff9ffc391a79f34aa598776cc5db901fdefdf0e04e26920060784c302fd9eb935dcf0a2a623a0acdda8b76bdc61c4235b2
  data.tar.gz: 36a41f00ff396bde0e3c18587bd58359300591572ff346135f340520b16145272c622f228709d0fe72d770a2174f5bf48cc6c9e98e75fc3b22eef78e8ea14f41
data/README.md
ADDED
@@ -0,0 +1 @@
Parses Baidu search engine result pages (SERPs) and returns structured data for further analysis.
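A minimal library-level usage sketch (illustrative only, not part of the gem's files), based on the entry points in data/lib/serper.rb and data/lib/serper/parser.rb below; the keyword is a placeholder and the call performs a live request to Baidu:

require 'serper'

# Serper.search returns a Serper::Parser whose #result holds the structured
# parse of the SERP and whose #weights returns the weighted rows.
serp = Serper.search(:baidu, 'example keyword')
serp.result[:ranks].each { |r| puts "#{r[:rank]}  #{r[:url]}" }                       # organic results
serp.weights.each { |w| puts "#{w[:side]}/#{w[:part]}  #{w[:site]}  #{w[:weight]}" }  # weighted rows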
data/bin/serper
ADDED
@@ -0,0 +1,59 @@
#!/usr/bin/env ruby

require 'serper'
require 'optparse'
require 'json'
require 'pp'
require 'docopt'

cmd = File.basename(__FILE__)

doc = <<DOCOPT
1. serper -s 'keyword'                 # search 'keyword' and print the parse result
2. serper -s 'keyword' -o output.json  # -o saves the result to a file
3. serper -f 'file path'               # parse html source code from a file
4. serper -s 'keyword' -j              # search 'keyword' and print the parse result in JSON format

Usage:
  #{cmd} [options]

Options:
  -h --help            show this help message and exit
  -v --version         show version and exit
  -a --analyse Name    analyse as the given name
  --keywords File      use with -a, import the given keywords File before searching
  -s --search Keyword  search Keyword and show the result
  -f --file File       parse a local file or the given url
  -j --json            print JSON output
  -o --output File     output the JSON result to File

DOCOPT

begin
  options = Docopt::docopt(doc, version: Serper::VERSION)
  # pp options
rescue Docopt::Exit => e
  puts e.message
  exit
end

result = ''
if options['--analyse']
  analyse = Serper.analyse(options['--analyse'])
  analyse.import_keywords(options['--keywords'])
  analyse.search
  result = 'Analyse finished!'
elsif options['--search']
  result = Serper.search(:baidu, options['--search'])
elsif options['--file']
  result = Serper.parse_file options['--file']
else
  puts "Give at least one of -a/-s/-f"
end

if options['--json']
  puts result.to_json
else
  pp result
end

open(options['--output'], 'w').puts result.to_json if options['--output']
data/lib/serper.rb
ADDED
@@ -0,0 +1,26 @@
require "serper/version"
require "serper/parser"
require "serper/analyser"

[:baidu].each do |engine_name|
  %w{crawler parser weight}.each do |part|
    require File.expand_path("../serper/#{engine_name}/#{part}.rb", __FILE__)
  end
end

module Serper
  ENGINES = {
    :baidu => Baidu
  }

  def self.search(engine_name, keyword, page = 1)
    serp = Parser.new(engine_name, keyword, page)
    serp.search
    serp
  end

  def self.analyse(connection)
    Analyser.new(connection)
  end
end
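The ENGINES hash above is the lookup table Parser uses to pick an engine; the engine object is then driven purely by convention. A short sketch of that contract from the caller's side (the method list reflects what Parser and the Baidu files below actually call):

# The class registered under :baidu is expected to respond to:
#   serp_url(keyword, page)  -> URL string handed to Crawler
#   _parse_*(file)           -> one method per SERP feature, collected by Parser#parse
#   weight_config            -> parts and multipliers used by Parser#weights
engine = Serper::ENGINES[:baidu].new
puts engine.respond_to?(:weight_config)   # => true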
data/lib/serper/analyser.rb
ADDED
@@ -0,0 +1,112 @@
require 'active_record'
require 'csv'
require 'date'
require 'yaml'
require 'ruby-progressbar'

module Serper
  class Analyser
    def initialize(connection)
      ActiveRecord::Base.establish_connection(connection)
    end

    def import_keywords(file)
      CSV.foreach(file) do |l|
        Keyword.find_or_create_by(:term => l[0]) do |r|
          r.pv = l[1]
          r.category = l[2]
          r.url_type = l[3]
          r.url_id = l[4]
        end
      end
    end

    def run(date = Date.today, skip = true)
      puts "Serper Analyser on #{date}"
      ENGINES.keys.each do |engine_name|
        puts engine_name
        search_engine(engine_name, date, skip)
      end
    end

    def search_engine(engine_name, date, skip = true)
      p = ProgressBar.create(:title => "Searching #{engine_name} - #{date}", :total => Keyword.all.count, :format => '%t (%c/%C) %a %E |%w')
      Keyword.all.each do |k|
        check_exists = Weight.where(:engine => engine_name, :date => date, :keyword_id => k.id)
        if check_exists.any?
          if skip
            next
          else
            check_exists.destroy_all
          end
        end

        serp = Serper.search(engine_name, k.term)
        serp.weights.each do |w|
          Weight.create(:date => date,
                        :keyword_id => k.id,
                        :engine => engine_name,
                        :side => w[:side],
                        :part => w[:part],
                        :source => w[:type],
                        :name => w[:name],
                        :site => w[:site],
                        :subdomain => w[:subdomain],
                        :path => w[:path],
                        :part_rank => w[:part_rank],
                        :side_rank => w[:side_rank],
                        :side_weight => w[:side_weight],
                        :weight => w[:weight])
        end

        p.increment
      end
    end

    def migrate!
      ActiveRecord::Schema.define do
        create_table :serper_keywords do |t|
          t.string :term
          t.integer :pv
          t.string :category
          t.string :url_type
          t.integer :url_id

          t.timestamps

          t.index :term
        end

        create_table :serper_weights do |t|
          t.date :date
          t.string :engine
          t.integer :keyword_id
          t.string :side      # Left / Right
          t.string :part
          t.string :source    # SEO / SEM / Special
          t.string :name
          t.string :site
          t.string :subdomain
          t.string :path
          t.integer :part_rank
          t.integer :side_rank
          t.float :side_weight
          t.float :weight

          t.timestamps

          t.index [:date, :engine, :keyword_id, :side, :side_rank], name: 'weights_pk_index'
        end
      end
    end

    class Keyword < ActiveRecord::Base
      self.table_name = 'serper_keywords'
    end

    class Weight < ActiveRecord::Base
      self.table_name = 'serper_weights'
    end
  end
end
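A sketch of a full analysis run wired together from the methods above. The sqlite3 connection spec and the CSV file name are assumptions; the CSV columns follow import_keywords (term, pv, category, url_type, url_id):

require 'serper'

analyser = Serper.analyse(adapter: 'sqlite3', database: 'serper.db')  # example connection spec
analyser.migrate!                         # creates serper_keywords and serper_weights
analyser.import_keywords('keywords.csv')  # rows: term,pv,category,url_type,url_id
analyser.run(Date.today, true)            # search every keyword, skipping already-stored dates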
data/lib/serper/baidu/parser.rb
ADDED
@@ -0,0 +1,185 @@
class Serper::Baidu
  # NOTE: Addressable::URI below comes from the addressable gem, which is not
  # declared in the gemspec; if the constant is missing, the trailing
  # `rescue ''` silently turns the parsed ad URL into an empty string.
  def _parse_ads_right(file)
    result = []
    rank = 0

    file[:doc].search('div#ec_im_container span a.c-icon.efc-cert').each do |div|
      rank += 1
      url = Addressable::URI.parse(Serper::Helper.parse_data_click(div['data-renzheng'])['identity']['a']['url']).query_values['wd'].to_s.sub('@v', '') rescue ''
      result << {url: url, rank: rank}
    end
    result
  end

  def _parse_ads_top(file)
    result = []
    rank = 0

    file[:doc].search('div#content_left').first.children.each do |div|
      break if div['id'].to_i > 0
      div.search('span a.c-icon.efc-cert').each do |a|
        rank += 1
        url = Addressable::URI.parse(Serper::Helper.parse_data_click(a['data-renzheng'])['identity']['a']['url']).query_values['wd'].to_s.sub('@v', '') rescue ''
        result << {url: url, rank: rank}
      end
    end
    result
  end

  def _parse_con_ar(file)
    result = []
    divs = file[:doc].search("div#content_right div#con-ar").first
    return [] if divs.nil?
    divs.children.each do |div|
      next unless div['class'].to_s.include?('result-op')
      result << {:tpl => div['tpl'],
                 :data_click => Serper::Helper.parse_data_click(div['data-click'])}
    end
    result
  end

  # def _parse_pinpaizhuanqu(file)
  #   part = file[:doc].search("div[@id='content_left']").first
  #   return false if part.nil?
  #
  #   part.children[2].name == 'script'
  # end

  def _parse_ranks(file)
    result = []
    part = file[:doc].search("div[@id='content_left']").first
    return result if part.nil?

    part.children.each do |table|
      next if table.nil?
      id = table['id'].to_i
      next unless id > 0 && id < 3000

      r = {:rank => id}

      r[:result_op] = table['class'].to_s.include?('result-op')

      r[:fk] = table['fk']

      r[:srcid] = table['srcid']

      r[:tpl] = table['tpl']

      r[:mu] = table['mu']

      url = table.search('h3/a').first
      unless url.nil?
        url = url['href']
        sleep(rand)
        url = Serper::Crawler.get_rank_url('http:' + url).headers['location'] if url.include?('//www.baidu.com/link?')
      end
      r[:url] = url

      r[:title] = Serper::Helper.get_content_safe(table.search('h3'))

      r[:content] = Serper::Helper.get_content_safe(table.search('div.c-abstract'))

      table.search('a').each do |link|
        r[:baiduopen] = true if link['href'].to_s.include?('open.baidu.com')
      end
      r[:baiduopen] = false if r[:baiduopen].nil?

      result << r
    end
    result
  end

  # def _parse_related_keywords(file)
  #   result = []
  #   file[:doc].search('div[@id="rs"]').each do |rs|
  #     rs.css('a').each do |link|
  #       result << link.content
  #     end
  #   end
  #   result
  # end

  # def _parse_result_num(file)
  #   html = file[:html]
  #   str = html.scan(/百度为您找到相关结果(.*)个/).join
  #   str = str.gsub('约', '')
  #   if str.include?('万')
  #     parts = str.split('万')
  #     result = parts[0].to_i * 10000 + parts[1].to_i
  #   else
  #     result = str.gsub(',', '').to_i
  #   end
  #
  #   result
  # end

  # def _parse_right_hotel(file)
  #   rh = file[:doc].search('div[@tpl="right_hotel"]')
  #   return nil if rh.nil?
  #
  #   rh = rh.first
  #   return nil if rh.nil?
  #   title = Serper::Helper.get_content_safe(rh.search('div.opr-hotel-title'))
  #
  #   {:title => title}
  # end

  # def _parse_right_personinfo(file)
  #   rp = file[:doc].search('div[@tpl="right_personinfo"]')
  #   return nil if rp.nil?
  #
  #   title = Serper::Helper.get_content_safe rp.search('span.opr-personinfo-subtitle-large')
  #   info_summary = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-summary')
  #   info = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-info')
  #   source = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-source a')
  #
  #   return nil if title.nil? && info.nil? && source.nil?
  #   {:title => title, :info_summary => info_summary, :info => info, :source => source}
  # end

  # def _parse_right_relaperson(file)
  #   relapersons = file[:doc].search('div[@tpl="right_relaperson"]')
  #   return nil if relapersons.nil?
  #
  #   result = []
  #   relapersons.each do |rr|
  #     title = rr.search('div.cr-title/span').first
  #     title = title.content unless title.nil?
  #     r = []
  #     rr.search('p.opr-relaperson-name/a').each do |p|
  #       r << p['title']
  #     end
  #     result << {:title => title, :names => r}
  #   end
  #   result
  # end

  # def _parse_right_weather(file)
  #   rw = file[:doc].search('div[@tpl="right_weather"]')
  #   return nil if rw.nil?
  #
  #   rw = rw.first
  #   return nil if rw.nil?
  #
  #   title = Serper::Helper.get_content_safe(rw.search('div.opr-weather-title'))
  #   week = rw.search('a.opr-weather-week').first['href']
  #
  #   {:title => title, :week => week}
  # end

  def _parse_zhixin(file)
    result = []
    file[:doc].search("div#content_left .result-zxl").each do |zxl|
      result << {:id => zxl['id'],
                 :srcid => zxl['srcid'],
                 :fk => zxl['fk'],
                 :tpl => zxl['tpl'],
                 :mu => zxl['mu'],
                 :data_click => Serper::Helper.parse_data_click(zxl['data-click'])}
    end
    result
  end
end
data/lib/serper/baidu/weight.rb
ADDED
@@ -0,0 +1,144 @@
class Serper::Baidu
  def weight_config
    {
      :left_parts => [:ads_top,
                      :zhixin,
                      :ranks],

      :right_parts => [:con_ar,
                       :ads_right],

      :left_part_weight => 8,

      :right_part_weight => 2,

      :zhixin_weight => 3.5,

      :baiduopen_weight => 3,

      :rank_special_weight => 2,

      :con_ar_weight => 2
    }
  end

  # _weight_of_*** functions
  # return an array of hashes;
  # each hash includes: type, name, site, subdomain, path, side_rank, part_rank, side_weight

  def _weight_of_ranks(serp_result, side_rank)
    result = []
    serp_result[:ranks].each.with_index do |rank, i|
      side_rank += 1

      url = rank[:url].to_s
      mu = rank[:mu].to_s

      type = 'SEO'
      type = 'Special' if rank[:baiduopen]

      unless mu.empty?
        url = mu
        type = 'Special'
      end

      site = Serper::Helper.parse_site(url)
      subdomain = Serper::Helper.parse_subdomain(url)
      path = Serper::Helper.parse_path(url)

      name = rank[:tpl].to_s

      weight = 1.0 / side_rank.to_f
      if type == 'Special'
        if rank[:baiduopen]
          weight = weight * weight_config[:baiduopen_weight].to_f
        else
          weight = weight * weight_config[:rank_special_weight].to_f
        end
      end

      part_rank = rank[:rank]

      result << {type: type, name: name, site: site, subdomain: subdomain, path: path, mu: mu, side_rank: side_rank, part_rank: part_rank, side_weight: weight}
    end
    [result, side_rank]
  end

  def _weight_of_ads_top(serp_result, side_rank)
    result = []
    serp_result[:ads_top].each.with_index do |ad, i|
      side_rank += 1

      url = ad[:url].to_s
      type = 'SEM'
      name = ''
      site = Serper::Helper.parse_site(url)
      subdomain = Serper::Helper.parse_subdomain(url)
      path = Serper::Helper.parse_path(url)

      part_rank = ad[:rank]

      weight = 1.0 / side_rank.to_f
      result << {type: type, name: name, site: site, subdomain: subdomain, path: path, side_rank: side_rank, part_rank: part_rank, side_weight: weight}
    end
    [result, side_rank]
  end

  def _weight_of_ads_right(serp_result, side_rank)
    result = []
    serp_result[:ads_right].each.with_index do |ad, i|
      side_rank += 1

      url = ad[:url].to_s
      type = 'SEM'
      name = ''
      site = Serper::Helper.parse_site(url)
      subdomain = Serper::Helper.parse_subdomain(url)
      path = Serper::Helper.parse_path(url)

      part_rank = ad[:rank]

      weight = 1.0 / side_rank.to_f
      result << {type: type, name: name, site: site, subdomain: subdomain, path: path, side_rank: side_rank, part_rank: part_rank, side_weight: weight}
    end
    [result, side_rank]
  end

  def _weight_of_con_ar(serp_result, side_rank)
    result = []
    serp_result[:con_ar].each.with_index do |con, i|
      side_rank += 1

      url = con[:data_click]['mu'].to_s
      type = 'Special'
      name = con[:tpl]
      site = Serper::Helper.parse_site(url)
      subdomain = Serper::Helper.parse_subdomain(url)
      path = Serper::Helper.parse_path(url)

      weight = 1.0 * weight_config[:con_ar_weight]
      result << {type: type, name: name, site: site, subdomain: subdomain, path: path, side_rank: side_rank, part_rank: i + 1, side_weight: weight}
    end
    [result, side_rank]
  end

  def _weight_of_zhixin(serp_result, side_rank)
    result = []
    serp_result[:zhixin].each.with_index do |zhixin, i|
      side_rank += 1

      url = zhixin[:mu].to_s
      type = 'Special'
      name = zhixin[:tpl]
      site = Serper::Helper.parse_site(url)
      subdomain = Serper::Helper.parse_subdomain(url)
      path = Serper::Helper.parse_path(url)
      weight = 1.0 * weight_config[:zhixin_weight]

      result << {type: type, name: name, site: site, subdomain: subdomain, path: path, side_rank: side_rank, part_rank: i + 1, side_weight: weight}
    end
    [result, side_rank]
  end
end
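To make the scoring concrete, here is a small worked example under the config above (the SERP layout is hypothetical; the arithmetic follows the methods above together with Parser#weights and Serper::Helper.normalize shown later):

# Hypothetical left-side SERP: two top ads followed by three organic results.
# _weight_of_ads_top and _weight_of_ranks assign side_weight = 1.0 / side_rank,
# with Special results boosted by the multipliers from weight_config:
#
#   side_rank 1  ads_top  SEM      side_weight = 1/1        = 1.0
#   side_rank 2  ads_top  SEM      side_weight = 1/2        = 0.5
#   side_rank 3  ranks    SEO      side_weight = 1/3        = 0.333
#   side_rank 4  ranks    Special  side_weight = (1/4) * 2  = 0.5   (rank_special_weight)
#   side_rank 5  ranks    Special  side_weight = (1/5) * 3  = 0.6   (baiduopen_weight)
#
# Parser#weights then scales each row by left_part_weight (8) or
# right_part_weight (2), and Serper::Helper.normalize turns the scaled
# side_weights into the final :weight column, which sums to 1.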
data/lib/serper/crawler.rb
ADDED
@@ -0,0 +1,84 @@
require 'httparty'
require 'yaml' # needed for YAML.load below

module Serper
  class Crawler
    AllUserAgents = YAML.load(open(File.expand_path('../user_agents.yml', __FILE__)))

    def self.rand_ua
      AllUserAgents[rand(AllUserAgents.size)]
    end

    include HTTParty
    base_uri 'www.baidu.com'
    follow_redirects false
    headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'

    def self.get_serp(url, retries = 3)
      self.new.get_serp(url, retries)
    end

    def self.get_rank_url(url)
      self.new.get_rank_url(url)
    end

    def get_rank_url(url)
      begin
        response = self.class.get(url)
      rescue StandardError => e
        puts e.class
        puts e.message
        sleep(10)
        retry
      end
      response
    end

    def get_serp(url, retries = 3)
      if retries > 0
        begin
          response = self.class.get(url)
        rescue StandardError => e
          puts e.class
          puts e.message
          sleep(10)
          retry
        end

        if response.code != 200
          puts response
          puts "Retry on URL: #{url}"
          sleep(rand(60) + 1200)
          response = self.class.get_serp(url, retries - 1)
        end

        if response.nil?
          puts "Still error after 3 tries, sleep 3600s now."
          sleep(3600)
          response = self.class.get_serp(url)
        end

        ## Baidu stopped returning Content-Length in the response headers...
        # if response.headers['Content-Length'].nil?
        #   puts "Can't read Content-Length from response, retry."
        #   response = self.class.get_serp(url, retries - 1)
        # end
        #
        # if response.headers['Content-Length'].to_i != response.body.bytesize
        #   issue_file = "/tmp/serper_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
        #   open(issue_file, 'w').puts(response.body)
        #   puts "Notice:"
        #   puts "Serper got an error while crawling the SERP: response size (#{response.headers['Content-Length']}) does not match the body size."
        #   puts "Please see file #{issue_file} for the body content."
        #   puts "Sleep 10s and retry"
        #   sleep(10)
        #   response = self.class.get_serp(url)
        # end

        response
      else
        nil
      end
    end

  end
end
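A small stand-alone usage sketch of the crawler (the /s?wd= query path is the usual Baidu search endpoint and is an assumption here; note that a non-200 response makes get_serp sleep for roughly 20 minutes before retrying):

require 'serper'

# The User-Agent is picked at random from user_agents.yml when the class loads.
response = Serper::Crawler.get_serp('/s?wd=ruby')  # assumed path, resolved against base_uri 'www.baidu.com'
puts response.code
puts response.body.bytesize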
data/lib/serper/helper.rb
ADDED
@@ -0,0 +1,79 @@
require 'domainatrix'
require 'json' # needed for JSON.parse in parse_data_click

module Serper
  module Helper
    class << self
      # safely get the content of a Nokogiri search result
      def get_content_safe(noko)
        return nil if noko.nil?
        return nil if noko.empty?
        noko.first.content.strip
      end

      # parse a data-click value from a Baidu div attribute,
      # which is a JSON-like format
      def parse_data_click(str)
        JSON.parse(str
          .gsub("'", '"')
          .gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
          #.gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
          #.gsub(/:'([^(',\")]*)'(,|})/,':"\1"\2')
        )
      end

      # normalize the weight column of the given data;
      # the data must be an array of hashes,
      # for example: [{a: 1, b: 2}, {a: 2, b: 3}]
      def normalize(data, weight_col = :weight, normalized_col = :normalized_weight)
        total_weight = data.reduce(0.0) { |sum, d| sum += d[weight_col].to_f }
        data.each do |d|
          d[normalized_col] = d[weight_col].to_f / total_weight
        end
        data
      end

      def parse_site(url)
        begin
          url = Domainatrix.parse(url.to_s)
          site = url.domain + '.' + url.public_suffix
        rescue Exception => e
          puts "parse_site from url error:"
          puts url
          puts e.class
          puts e.message
          site = ''
        end
        site
      end

      def parse_subdomain(url)
        begin
          url = Domainatrix.parse(url.to_s)
          subdomain = url.subdomain
        rescue Exception => e
          puts "parse_subdomain from url error:"
          puts url
          puts e.class
          puts e.message
          subdomain = ''
        end
        subdomain
      end

      def parse_path(url)
        begin
          url = Domainatrix.parse(url.to_s)
          path = url.path
        rescue Exception => e
          puts "parse_path from url error:"
          puts url
          puts e.class
          puts e.message
          path = ''
        end
        path
      end

    end
  end
end
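Two of these helpers are easiest to understand from their input and output; a short sketch (the data-click string is a hypothetical example of Baidu's attribute format):

require 'serper'

# parse_data_click: single quotes become double quotes and bare keys get quoted,
# so the JSON-like attribute value parses as real JSON.
Serper::Helper.parse_data_click("{'fm':'beha','title':'demo'}")
# => {"fm"=>"beha", "title"=>"demo"}

# normalize: adds a column whose values sum to 1 over the array.
rows = [{site: 'a.com', side_weight: 1.0}, {site: 'b.com', side_weight: 3.0}]
Serper::Helper.normalize(rows, :side_weight, :weight)
# => :weight becomes 0.25 and 0.75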
data/lib/serper/parser.rb
ADDED
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
require 'nokogiri'
require 'uri'
require 'json'
require 'serper/crawler'
require 'serper/helper'

module Serper
  class Parser
    attr_reader :engine_name, :keyword, :page, :html, :doc, :result

    def initialize(engine_name, keyword, page = 1)
      @engine_name = engine_name
      @engine = ENGINES[@engine_name].new
      @keyword = keyword
      @page = page
    end

    def serp_url
      @engine.serp_url(@keyword, @page)
    end

    def search
      html = Crawler.get_serp(serp_url).body
      parse html
    end

    def parse(html)
      html = html.encode!('UTF-8', 'UTF-8', :invalid => :replace)
      @file = Hash.new
      @result = Hash.new

      @file[:html] = html
      @file[:doc] = Nokogiri::HTML(html)

      @engine.methods.each do |m|
        next unless m =~ /^_parse_/
        begin
          @result[m.to_s.sub('_parse_', '').to_sym] = @engine.send m, @file
        rescue Exception => e
          issue_file = "/tmp/serper_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
          open(issue_file, 'w').puts(html)
          puts "Notice:"
          puts "The Serper gem has a bug, please email zmingqian@qq.com to report it."
          puts "Please attach file #{issue_file} and the error information below to the email, thanks!"
          puts e.message
          puts e.inspect
          puts e.backtrace
          raise "Serper Parser Got An Error!"
        end
      end

      @result
    end


    def weights
      result = []
      [:left, :right].each do |side|
        side_rank = 0

        @engine.weight_config["#{side}_parts".to_sym].each do |part|
          rs, side_rank = @engine.send("_weight_of_#{part}", @result, side_rank)

          rs.each do |r|
            r[:side] = side.to_s
            r[:part] = part

            # scale the raw side_weight by the side-level part weight before normalisation
            r[:side_weight] = r[:side_weight].to_f * @engine.weight_config["#{side}_part_weight".to_sym].to_f
            result << r
          end
        end
      end
      Serper::Helper.normalize(result, :side_weight, :weight)
    end
  end
end
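Because parse takes a raw HTML string, a saved SERP can be analysed offline without going through the crawler (sketch; the file path is a placeholder, and _parse_ranks may still hit the network to resolve baidu.com/link? redirects):

require 'serper'

parser = Serper::Parser.new(:baidu, 'ruby')
result = parser.parse(File.read('/tmp/baidu_serp.html'))
puts result.keys         # e.g. :ads_right, :ads_top, :con_ar, :ranks, :zhixin
puts parser.weights.size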
data/lib/serper/user_agents.yml
ADDED
@@ -0,0 +1,183 @@
---
- 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.6 (Change: )'
- Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.8.0
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8
- Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0
- Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6
- Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
- Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
- Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0
- Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1
- Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0
- Mozilla/5.0 (Windows NT 6.2; rv:19.0) Gecko/20121129 Firefox/19.0
- Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0
- Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0
- Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Maxthon 2.0)
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko) Maxthon/3.0.8.2 Safari/533.1
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Maxthon/4.0.0.2000 Chrome/22.0.1229.79 Safari/537.1
- Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)
- Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)
- Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0
- Opera/9.25 (Windows NT 6.0; U; en)
- Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10
- Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00
- Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01
- Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10
- Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00
- Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10
- Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
- Mozilla/5.0 (Windows; U; Windows NT 6.2; es-US ) AppleWebKit/540.0 (KHTML like Gecko) Version/6.0 Safari/8900.00
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12
- Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20120422 Firefox/12.0 SeaMonkey/2.9
- Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5
- Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1
- Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7
- Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5
- Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)
- Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
- Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
- Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre
- Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )
- Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)
- Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a
- Mozilla/2.02E (Win95; U)
- Mozilla/3.01Gold (Win95; I)
- Mozilla/4.8 [en] (Windows NT 5.1; U)
- Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)
- Opera/7.50 (Windows XP; U)
- Opera/7.50 (Windows ME; U) [en]
- Opera/7.51 (Windows NT 5.1; U) [en]
- Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0
- Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.2b) Gecko/20021001 Phoenix/0.2
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.23) Gecko/20090825 SeaMonkey/1.1.18
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Camino/2.2.1
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre Camino/2.2a1pre
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.79 Safari/537.4
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.31 (KHTML like Gecko) Chrome/26.0.1410.63 Safari/537.31
- Mozilla/5.0 (Macintosh; Intel Mac OS X 1083) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:20.0) Gecko/20100101 Firefox/20.0
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0.112941
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0
- Opera/9.20 (Macintosh; Intel Mac OS X; U; en)
- Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.6.30 Version/10.61
- Opera/9.80 (Macintosh; Intel Mac OS X 10.4.11; U; en) Presto/2.7.62 Version/11.00
- Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7; en-us) AppleWebKit/534.20.8 (KHTML, like Gecko) Version/5.1 Safari/534.20.8
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/536.26.17 (KHTML like Gecko) Version/6.0.2 Safari/536.26.17
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7
- 'Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 '
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0
metadata
ADDED
@@ -0,0 +1,141 @@
--- !ruby/object:Gem::Specification
name: serper
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- MingQian Zhang
autorequire:
bindir: bin
cert_chain: []
date: 2014-07-29 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: httparty
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: domainatrix
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: activerecord
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: docopt
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: ruby-progressbar
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
description: Parse SERP result page.
email:
- zmingqian@qq.com
executables:
- serper
extensions: []
extra_rdoc_files: []
files:
- lib/serper/analyser.rb
- lib/serper/baidu/crawler.rb
- lib/serper/baidu/parser.rb
- lib/serper/baidu/weight.rb
- lib/serper/crawler.rb
- lib/serper/helper.rb
- lib/serper/parser.rb
- lib/serper/version.rb
- lib/serper.rb
- bin/serper
- README.md
- lib/serper/user_agents.yml
homepage: https://github.com/semseo/serper
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.0.0
signing_key:
specification_version: 4
summary: SERP
test_files: []