serper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +1 -0
- data/bin/serper +59 -0
- data/lib/serper.rb +26 -0
- data/lib/serper/analyser.rb +112 -0
- data/lib/serper/baidu/crawler.rb +7 -0
- data/lib/serper/baidu/parser.rb +185 -0
- data/lib/serper/baidu/weight.rb +144 -0
- data/lib/serper/crawler.rb +84 -0
- data/lib/serper/helper.rb +79 -0
- data/lib/serper/parser.rb +77 -0
- data/lib/serper/user_agents.yml +183 -0
- data/lib/serper/version.rb +3 -0
- metadata +141 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 149badc447dec8ed55714a9ed6768e3f85a9b94e
|
4
|
+
data.tar.gz: 8204acacee0b068b61421c0bc59507ea30717895
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d329ccb1dbf584a4a100bf945740a5ff9ffc391a79f34aa598776cc5db901fdefdf0e04e26920060784c302fd9eb935dcf0a2a623a0acdda8b76bdc61c4235b2
|
7
|
+
data.tar.gz: 36a41f00ff396bde0e3c18587bd58359300591572ff346135f340520b16145272c622f228709d0fe72d770a2174f5bf48cc6c9e98e75fc3b22eef78e8ea14f41
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
解析百度的搜索结果页面, 并返回结构化数据以进行后续分析.
|
data/bin/serper
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'serper'
|
4
|
+
require 'optparse'
|
5
|
+
require 'json'
|
6
|
+
require 'pp'
|
7
|
+
require 'docopt'
|
8
|
+
|
9
|
+
# Command line entry point: parses the options described in the docopt text
# below, runs exactly one of the analyse / search / file modes, prints the
# result and optionally writes it to a JSON file.
cmd = File.basename(__FILE__)

# docopt needs at least two spaces between an option spec and its description.
doc = <<DOCOPT
1. serper -s 'keyword' # search 'keyword' and print parse result
2. serper -s 'keyword' -o output.json # -o means save result to a file
3. serper -f 'file path' # parse html source code from file
4. serper -s 'keyword' -j # search 'keyword' and print parse result in JSON format

Usage:
  #{cmd} [options]

Options:
  -h --help           show this help message and exit
  -v --version        show version and exit
  -a --analyse Name   analyse as the given name
  --keywords File     uses with -a, import give keywords File before search
  -s --search Keyword search Keyword and show result
  -f --file File      parse local file or given url
  -j --json           print JSON output
  -o --output File    output JSON result to File

DOCOPT

begin
  options = Docopt::docopt(doc, version: Serper::VERSION)
rescue Docopt::Exit => e
  # docopt raises for --help/--version and for usage errors; there are no
  # parsed options to work with afterwards, so stop here.
  puts e.message
  exit
end

result = ''
if options['--analyse']
  analyse = Serper.analyse(options['--analyse'])
  # --keywords is optional; only import when a file was given.
  analyse.import_keywords(options['--keywords']) if options['--keywords']
  # Analyser exposes #run (it has no #search method).
  analyse.run
  result = 'Analyse finished!'
elsif options['--search']
  # Serper.search requires the engine name; :baidu is the only registered
  # engine. The parsed data lives in Parser#result.
  result = Serper.search(:baidu, options['--search']).result
elsif options['--file']
  # NOTE(review): Serper.parse_file is not defined in lib/serper.rb as shipped
  # in this version -- confirm where it is expected to come from.
  result = Serper.parse_file options['--file']
else
  puts "At least given one of -a/-s/-f"
  exit 1
end

if options['--json']
  puts result.to_json
else
  pp result
end

# Block form closes (and flushes) the file handle.
if options['--output']
  File.open(options['--output'], 'w') { |out| out.puts result.to_json }
end
|
data/lib/serper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require "serper/version"
|
2
|
+
require "serper/parser"
|
3
|
+
require "serper/analyser"
|
4
|
+
|
5
|
+
# Load the crawler, parser and weight components of every supported engine.
[:baidu].each do |engine|
  engine_dir = File.expand_path("../serper/#{engine}", __FILE__)
  %w{crawler parser weight}.each do |component|
    require File.join(engine_dir, "#{component}.rb")
  end
end
|
10
|
+
|
11
|
+
# Public facade of the gem.
module Serper
  # Maps an engine name to the class implementing that engine.
  ENGINES = { :baidu => Baidu }

  class << self
    # Builds a Parser for the given engine/keyword/page, runs the search and
    # returns the parser (not the raw result hash).
    def search(engine_name, keyword, page = 1)
      Parser.new(engine_name, keyword, page).tap(&:search)
    end

    # Returns an Analyser bound to the given database connection settings.
    def analyse(connection)
      Analyser.new(connection)
    end
  end
end
|
26
|
+
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
require 'csv'
|
3
|
+
require 'date'
|
4
|
+
require 'yaml'
|
5
|
+
require 'ruby-progressbar'
|
6
|
+
|
7
|
+
module Serper
  # Stores keywords and the per-keyword SERP weights of every registered
  # engine through ActiveRecord.
  class Analyser
    # connection is handed unchanged to ActiveRecord::Base.establish_connection.
    def initialize(connection)
      ActiveRecord::Base.establish_connection(connection)
    end

    # Imports keywords from a CSV file whose columns are, in order:
    # term, pv, category, url_type, url_id. Terms that already exist are left
    # untouched (the block only runs for newly created records).
    def import_keywords(file)
      CSV.foreach(file) do |row|
        Keyword.find_or_create_by(:term => row[0]) do |record|
          record.pv = row[1]
          record.category = row[2]
          record.url_type = row[3]
          record.url_id = row[4]
        end
      end
    end

    # Searches every keyword on every engine in ENGINES for the given date.
    # skip is forwarded to #search_engine.
    def run(date=Date.today,skip=true)
      puts "Serper Analyser on #{date}"
      ENGINES.keys.each do |engine_name|
        puts engine_name
        search_engine(engine_name,date,skip)
      end
    end

    # Searches all keywords on one engine and stores one Weight row per
    # weighted SERP entry. When rows already exist for (engine, date, keyword)
    # they are kept if skip is true, otherwise destroyed and recomputed.
    def search_engine(engine_name,date,skip=true)
      keywords = Keyword.all
      progress = ProgressBar.create(:title => "Searching #{engine_name} - #{date}", :total => keywords.count, :format => '%t (%c/%C) %a %E |%w')
      keywords.each do |keyword|
        existing = Weight.where(:engine => engine_name, :date => date, :keyword_id => keyword.id)
        if existing.any?
          if skip
            # Skipped keywords still count as processed, otherwise the bar
            # never reaches its total.
            progress.increment
            next
          end
          existing.destroy_all
        end

        serp = Serper.search(engine_name,keyword.term)
        serp.weights.each do |w|
          Weight.create(:date => date,
                        :keyword_id => keyword.id,
                        :engine => engine_name,
                        :side => w[:side],
                        :part => w[:part],
                        :source => w[:type],
                        :name => w[:name],
                        :site => w[:site],
                        :subdomain => w[:subdomain],
                        :path => w[:path],
                        :part_rank => w[:part_rank],
                        :side_rank => w[:side_rank],
                        :side_weight => w[:side_weight],
                        :weight => w[:weight]
          )
        end

        progress.increment
      end
    end

    # Creates the serper_keywords and serper_weights tables.
    def migrate!
      ActiveRecord::Schema.define do
        create_table :serper_keywords do |t|
          t.string :term
          t.integer :pv
          t.string :category
          t.string :url_type
          t.integer :url_id

          t.timestamps

          t.index :term
        end

        create_table :serper_weights do |t|
          t.date :date
          t.string :engine
          t.integer :keyword_id
          t.string :side # Left Right
          t.string :part
          t.string :source # SEO SEM Special
          t.string :name
          t.string :site
          t.string :subdomain
          t.string :path
          t.integer :part_rank
          t.integer :side_rank
          t.float :side_weight
          t.float :weight

          t.timestamps

          t.index [:date, :engine, :keyword_id, :side, :side_rank], name: 'weights_pk_index'
        end
      end
    end

    # Row of serper_keywords.
    class Keyword < ActiveRecord::Base
      self.table_name = 'serper_keywords'
    end

    # Row of serper_weights.
    class Weight < ActiveRecord::Base
      self.table_name = 'serper_weights'
    end
  end
end
|
@@ -0,0 +1,185 @@
|
|
1
|
+
class Serper::Baidu
|
2
|
+
# Collects the certified-advertiser badges of the right-hand ad container.
# Returns an array of {url:, rank:} hashes, rank counting from 1; url is ''
# whenever the badge's data-renzheng payload cannot be decoded.
def _parse_ads_right(file)
  badges = file[:doc].search('div#ec_im_container span a.c-icon.efc-cert')
  badges.each_with_index.map do |badge, index|
    target =
      begin
        cert = Serper::Helper.parse_data_click(badge['data-renzheng'])
        query = Addressable::URI.parse(cert['identity']['a']['url']).query_values
        query['wd'].to_s.sub('@v','')
      rescue StandardError
        ''
      end
    {url: target, rank: index + 1}
  end
end
|
13
|
+
|
14
|
+
# Collects the certified-advertiser badges found above the first numbered
# result of div#content_left.
# Returns an array of {url:, rank:} hashes, rank counting from 1; url is ''
# whenever the badge's data-renzheng payload cannot be decoded. Returns []
# when the page has no div#content_left.
def _parse_ads_top(file)
  result = []
  container = file[:doc].search('div#content_left').first
  return result if container.nil?

  rank = 0
  container.children.each do |child|
    # A positive numeric id marks the first regular result; ads end there.
    break if child['id'].to_i > 0

    child.search('span a.c-icon.efc-cert').each do |badge|
      rank += 1
      url =
        begin
          cert = Serper::Helper.parse_data_click(badge['data-renzheng'])
          Addressable::URI.parse(cert['identity']['a']['url']).query_values['wd'].to_s.sub('@v', '')
        rescue StandardError
          # Best effort: an undecodable badge still counts as an ad slot.
          ''
        end
      result << {url: url, rank: rank}
    end
  end
  result
end
|
28
|
+
|
29
|
+
# Collects the "result-op" cards that are direct children of
# div#content_right div#con-ar.
# Returns an array of {:tpl, :data_click} hashes, or [] when the panel is
# absent.
def _parse_con_ar(file)
  panel = file[:doc].search("div#content_right div#con-ar").first
  return [] if panel.nil?

  panel.children.each_with_object([]) do |node, cards|
    next unless node['class'].to_s.include?('result-op')

    cards << {
      :tpl => node['tpl'],
      :data_click => Serper::Helper.parse_data_click(node['data-click'])
    }
  end
end
|
41
|
+
|
42
|
+
# def _parse_pinpaizhuanqu(file)
|
43
|
+
# part = file[:doc].search("div[@id='content_left']").first
|
44
|
+
# return false if part.nil?
|
45
|
+
#
|
46
|
+
# part.children[2].name == 'script'
|
47
|
+
# end
|
48
|
+
|
49
|
+
# Extracts the numbered results of div#content_left.
# Only children whose id attribute is an integer in 1...3000 are kept. Each
# entry is a hash with :rank, :result_op, :fk, :srcid, :tpl, :mu, :url,
# :title, :content and :baiduopen. Returns [] when the container is absent.
def _parse_ranks(file)
  result = []
  part = file[:doc].search("div[@id='content_left']").first
  return result if part.nil?

  part.children.each do |table|
    id = table['id'].to_i
    next unless id > 0 && id < 3000

    r = {:rank => id}
    r[:result_op] = table['class'].to_s.include?('result-op')
    r[:fk] = table['fk']
    r[:srcid] = table['srcid']
    r[:tpl] = table['tpl']
    r[:mu] = table['mu']

    link = table.search('h3/a').first
    url = link && link['href']
    if url && url.include?('//www.baidu.com/link?')
      # Redirect links are resolved through the crawler (redirects are not
      # followed, so the target is in the Location header). Throttle only
      # when a request is actually made.
      sleep(rand)
      # Add a scheme only to protocol-relative hrefs; an href that already
      # has one must not get a second "http:" prefix.
      target = url.start_with?('//') ? 'http:' + url : url
      url = Serper::Crawler.get_rank_url(target).headers['location']
    end
    r[:url] = url

    r[:title] = Serper::Helper.get_content_safe(table.search('h3'))
    r[:content] = Serper::Helper.get_content_safe(table.search('div.c-abstract'))

    # True when any link of the result points at open.baidu.com.
    r[:baiduopen] = table.search('a').any? { |a| a['href'].to_s.include?('open.baidu.com') }

    result << r
  end
  result
end
|
92
|
+
|
93
|
+
# def _parse_related_keywords(file)
|
94
|
+
# result = []
|
95
|
+
# file[:doc].search('div[@id="rs"]').each do |rs|
|
96
|
+
# rs.css('a').each do |link|
|
97
|
+
# result << link.content
|
98
|
+
# end
|
99
|
+
# end
|
100
|
+
# result
|
101
|
+
# end
|
102
|
+
|
103
|
+
# def _parse_result_num(file)
|
104
|
+
# html = file[:html]
|
105
|
+
# str = html.scan(/百度为您找到相关结果(.*)个/).join
|
106
|
+
# str = str.gsub('约','')
|
107
|
+
# if str.include?('万')
|
108
|
+
# parts = str.split('万')
|
109
|
+
# result = parts[0].to_i * 10000 + parts[1].to_i
|
110
|
+
# else
|
111
|
+
# result = str.gsub(',', '').to_i
|
112
|
+
# end
|
113
|
+
#
|
114
|
+
# result
|
115
|
+
# end
|
116
|
+
|
117
|
+
# def _parse_right_hotel(file)
|
118
|
+
# rh = file[:doc].search('div[@tpl="right_hotel"]')
|
119
|
+
# return nil if rh.nil?
|
120
|
+
#
|
121
|
+
# rh = rh.first
|
122
|
+
# return nil if rh.nil?
|
123
|
+
# title = Serper::Helper.get_content_safe(rh.search('div.opr-hotel-title'))
|
124
|
+
#
|
125
|
+
# {:title => title}
|
126
|
+
# end
|
127
|
+
|
128
|
+
# def _parse_right_personinfo(file)
|
129
|
+
# rp = file[:doc].search('div[@tpl="right_personinfo"]')
|
130
|
+
# return nil if rp.nil?
|
131
|
+
#
|
132
|
+
# title = Serper::Helper.get_content_safe rp.search('span.opr-personinfo-subtitle-large')
|
133
|
+
# info_summary = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-summary')
|
134
|
+
# info = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-info')
|
135
|
+
# source = Serper::Helper.get_content_safe rp.search('div.opr-personinfo-source a')
|
136
|
+
#
|
137
|
+
# return nil if title.nil? && info.nil? && source.nil?
|
138
|
+
# {:title => title, :info_summary => info_summary, :info => info, :source => source}
|
139
|
+
# end
|
140
|
+
|
141
|
+
# def _parse_right_relaperson(file)
|
142
|
+
# relapersons = file[:doc].search('div[@tpl="right_relaperson"]')
|
143
|
+
# return nil if relapersons.nil?
|
144
|
+
#
|
145
|
+
# result = []
|
146
|
+
# relapersons.each do |rr|
|
147
|
+
# title = rr.search('div.cr-title/span').first
|
148
|
+
# title = title.content unless title.nil?
|
149
|
+
# r = []
|
150
|
+
# rr.search('p.opr-relaperson-name/a').each do |p|
|
151
|
+
# r << p['title']
|
152
|
+
# end
|
153
|
+
# result << {:title => title, :names => r}
|
154
|
+
# end
|
155
|
+
# result
|
156
|
+
# end
|
157
|
+
|
158
|
+
# def _parse_right_weather(file)
|
159
|
+
# rw = file[:doc].search('div[@tpl="right_weather"]')
|
160
|
+
# return nil if rw.nil?
|
161
|
+
#
|
162
|
+
# rw = rw.first
|
163
|
+
# return nil if rw.nil?
|
164
|
+
#
|
165
|
+
# title = Serper::Helper.get_content_safe(rw.search('div.opr-weather-title'))
|
166
|
+
# week = rw.search('a.opr-weather-week').first['href']
|
167
|
+
#
|
168
|
+
# {:title => title, :week => week}
|
169
|
+
# end
|
170
|
+
|
171
|
+
# Collects every .result-zxl element under div#content_left.
# Returns an array of hashes holding the element's id, srcid, fk, tpl and mu
# attributes plus its decoded data-click payload.
def _parse_zhixin(file)
  file[:doc].search("div#content_left .result-zxl").map do |card|
    entry = {}
    %w{id srcid fk tpl mu}.each { |attr| entry[attr.to_sym] = card[attr] }
    entry[:data_click] = Serper::Helper.parse_data_click(card['data-click'])
    entry
  end
end
|
184
|
+
|
185
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
class Serper::Baidu
|
2
|
+
# Weighting settings of this engine: which parsed parts make up the left and
# right side of the page (in rank order) and the multipliers used by the
# _weight_of_* methods.
def weight_config
  {
    left_parts: [:ads_top, :zhixin, :ranks],
    right_parts: [:con_ar, :ads_right],
    left_part_weight: 8,
    right_part_weight: 2,
    zhixin_weight: 3.5,
    baiduopen_weight: 3,
    rank_special_weight: 2,
    con_ar_weight: 2
  }
end
|
26
|
+
|
27
|
+
# _weight_of_*** functions
|
28
|
+
# return a hash array
|
29
|
+
# each hash includes: type, name, site, weight
|
30
|
+
|
31
|
+
# Weights the regular results (serp_result[:ranks]).
# side_rank is the number of entries already placed on this side; every
# result advances it by one. An entry is 'Special' when it is a baiduopen
# result or carries a mu attribute (whose value then replaces the url),
# otherwise 'SEO'. The base weight 1/side_rank is multiplied by
# baiduopen_weight or rank_special_weight for Special entries.
# Returns [entries, updated_side_rank].
def _weight_of_ranks(serp_result,side_rank)
  entries = serp_result[:ranks].map do |item|
    side_rank += 1

    mu = item[:mu].to_s
    target = mu.empty? ? item[:url].to_s : mu
    special = item[:baiduopen] || !mu.empty?

    factor =
      if !special
        1.0
      elsif item[:baiduopen]
        weight_config[:baiduopen_weight].to_f
      else
        weight_config[:rank_special_weight].to_f
      end

    {
      type: special ? 'Special' : 'SEO',
      name: item[:tpl].to_s,
      site: Serper::Helper.parse_site(target),
      subdomain: Serper::Helper.parse_subdomain(target),
      path: Serper::Helper.parse_path(target),
      mu: mu,
      side_rank: side_rank,
      part_rank: item[:rank],
      side_weight: (1.0/side_rank.to_f) * factor
    }
  end
  [entries, side_rank]
end
|
68
|
+
|
69
|
+
# Weights the top ads (serp_result[:ads_top]) as 'SEM' entries.
# Every ad advances side_rank by one and weighs 1/side_rank.
# Returns [entries, updated_side_rank].
def _weight_of_ads_top(serp_result,side_rank)
  entries = serp_result[:ads_top].map do |ad|
    side_rank += 1
    link = ad[:url].to_s
    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0/side_rank.to_f
    }
  end
  [entries, side_rank]
end
|
88
|
+
|
89
|
+
# Weights the right-hand ads (serp_result[:ads_right]) as 'SEM' entries.
# Every ad advances side_rank by one and weighs 1/side_rank.
# Returns [entries, updated_side_rank].
def _weight_of_ads_right(serp_result,side_rank)
  entries = serp_result[:ads_right].map do |ad|
    side_rank += 1
    link = ad[:url].to_s
    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0/side_rank.to_f
    }
  end
  [entries, side_rank]
end
|
108
|
+
|
109
|
+
# Weights the right-hand cards (serp_result[:con_ar]) as 'Special' entries.
# The url comes from the card's data_click 'mu' value; every card advances
# side_rank by one and gets the flat con_ar_weight, independent of position.
# Returns [entries, updated_side_rank].
def _weight_of_con_ar(serp_result,side_rank)
  entries = serp_result[:con_ar].each_with_index.map do |card, index|
    side_rank += 1
    link = card[:data_click]['mu'].to_s
    {
      type: 'Special',
      name: card[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:con_ar_weight]
    }
  end
  [entries, side_rank]
end
|
126
|
+
|
127
|
+
# Weights the zhixin cards (serp_result[:zhixin]) as 'Special' entries.
# The url is the card's mu attribute; every card advances side_rank by one
# and gets the flat zhixin_weight, independent of position.
# Returns [entries, updated_side_rank].
def _weight_of_zhixin(serp_result,side_rank)
  entries = serp_result[:zhixin].each_with_index.map do |card, index|
    side_rank += 1
    link = card[:mu].to_s
    {
      type: 'Special',
      name: card[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:zhixin_weight]
    }
  end
  [entries, side_rank]
end
|
144
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
|
3
|
+
module Serper
  # HTTParty based client for www.baidu.com. Redirects are not followed so
  # callers can read the Location header themselves.
  class Crawler
    # User-Agent strings shipped next to this file. load_file opens and closes
    # the file itself, so no handle is leaked.
    AllUserAgents = YAML.load_file(File.expand_path('../user_agents.yml',__FILE__))

    # How many times one request is attempted when the transport raises,
    # before the error is passed on to the caller.
    MAX_NETWORK_ATTEMPTS = 5

    # Returns one of the bundled User-Agent strings at random.
    def self.rand_ua
      AllUserAgents.sample
    end

    include HTTParty
    base_uri 'www.baidu.com'
    follow_redirects false
    # Class-level defaults; the User-Agent is overridden per request in
    # #request_with_retry because this line is evaluated only once at load.
    headers "User-Agent" => self.rand_ua, "Referer" => 'http://www.baidu.com/'

    # Convenience wrapper around #get_serp on a fresh instance.
    def self.get_serp(url,retries = 3)
      self.new.get_serp(url,retries)
    end

    # Convenience wrapper around #get_rank_url on a fresh instance.
    def self.get_rank_url(url)
      self.new.get_rank_url(url)
    end

    # Fetches url and returns the raw response (status is not checked).
    # Raises the last transport error after MAX_NETWORK_ATTEMPTS failures.
    def get_rank_url(url)
      request_with_retry(url)
    end

    # Fetches a result page and returns the response.
    #
    # A non-200 answer is retried after a 20-21 minute pause, up to `retries`
    # times; when those are used up the method sleeps an hour and starts over
    # with a fresh retry budget. Returns nil only when called with
    # retries <= 0. Raises the last transport error after
    # MAX_NETWORK_ATTEMPTS consecutive transport failures.
    def get_serp(url, retries = 3)
      return nil unless retries > 0

      response = request_with_retry(url)

      if response.code != 200
        puts response
        puts "Retry on URL: #{url}"
        sleep(rand(60)+1200)
        response = self.class.get_serp(url,retries - 1)
      end

      if response.nil?
        puts "Still error after 3 tries, sleep 3600s now."
        sleep(3600)
        response = self.class.get_serp(url)
      end

      ##Baidu Stopped response Content-Length in headers...
      #if response.headers['Content-Length'].nil?
      #  puts "Can't read Content-Length from response, retry."
      #  response = self.class.get_serp(url,retries-1)
      #end
      #
      #if response.headers['Content-Length'].to_i != response.body.bytesize
      #  issue_file = "/tmp/serper_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
      #  open(issue_file,'w').puts(response.body)
      #  puts "Notice:"
      #  puts "Serper get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
      #  puts "Please see file #{issue_file} for body content."
      #  puts "Sleep 10s and retry"
      #  sleep(10)
      #  response = self.class.get_serp(url)
      #end

      response
    end

    private

    # Performs one GET with a freshly picked User-Agent. Transport errors are
    # logged and retried after 10 seconds, at most MAX_NETWORK_ATTEMPTS times
    # in total; the last error is re-raised so a permanently failing URL
    # cannot loop forever.
    def request_with_retry(url)
      attempts = 0
      begin
        attempts += 1
        self.class.get(url, :headers => {"User-Agent" => self.class.rand_ua})
      rescue StandardError => e
        puts e.class
        puts e.message
        raise if attempts >= MAX_NETWORK_ATTEMPTS
        sleep(10)
        retry
      end
    end

  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'domainatrix'
|
2
|
+
|
3
|
+
module Serper
  # Stateless helpers shared by the engine parsers and weight calculators.
  module Helper
    class << self
      # Returns the stripped text content of the first node of a search
      # result, or nil when the result is nil or empty.
      def get_content_safe(noko)
        return nil if noko.nil?
        return nil if noko.empty?
        noko.first.content.strip
      end

      # Parses the JSON-like value of a Baidu element attribute: single
      # quotes become double quotes and bare keys following "{" or "," are
      # quoted before the text is handed to JSON.parse.
      # Returns {} for a nil or blank value (attribute missing on the
      # element); malformed content still raises JSON::ParserError.
      def parse_data_click(str)
        return {} if str.nil? || str.strip.empty?

        JSON.parse(str
                     .gsub("'",'"')
                     .gsub(/({|,)([a-zA-Z0-9_]+):/, '\1"\2":')
                   #.gsub(/'*([a-zA-Z0-9_]+)'*:/, '"\1":')
                   #.gsub(/:'([^(',\")]*)'(,|})/,':"\1"\2')
        )
      end

      # Normalizes the weights of an array of hashes in place, e.g.
      # [{a: 1, b: 2}, {a: 2, b: 3}]: each hash gets
      # normalized_col = weight_col / sum of all weight_col values.
      # When the sum is zero every normalized value is 0.0 instead of NaN.
      # Returns data.
      def normalize(data,weight_col=:weight,normalized_col=:normalized_weight)
        total_weight = data.reduce(0.0) {|sum,d| sum + d[weight_col].to_f}
        data.each do |d|
          d[normalized_col] = total_weight.zero? ? 0.0 : d[weight_col].to_f/total_weight
        end
        data
      end

      # Returns "domain.public_suffix" of url, or '' when it cannot be parsed.
      def parse_site(url)
        parse_url_part(url, 'site') { |parsed| parsed.domain + '.' + parsed.public_suffix }
      end

      # Returns the subdomain of url, or '' when it cannot be parsed.
      def parse_subdomain(url)
        parse_url_part(url, 'subdomain') { |parsed| parsed.subdomain }
      end

      # Returns the path of url, or '' when it cannot be parsed.
      def parse_path(url)
        parse_url_part(url, 'path') { |parsed| parsed.path }
      end

      private

      # Parses url with Domainatrix and yields the parsed object; any
      # StandardError is reported on stdout and turned into ''.
      def parse_url_part(url, label)
        yield Domainatrix.parse(url.to_s)
      rescue StandardError => e
        puts "parse_#{label} from url error:"
        puts url
        puts e.class
        puts e.message
        ''
      end

    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'uri'
|
4
|
+
require 'json'
|
5
|
+
require 'serper/crawler'
|
6
|
+
require 'serper/helper'
|
7
|
+
|
8
|
+
module Serper
  # Fetches and parses one result page of a search engine and turns the
  # parsed parts into weighted entries.
  class Parser
    attr_reader :engine_name, :keyword, :page, :html, :doc, :result

    # engine_name must be a key of ENGINES; the engine class is instantiated
    # here.
    def initialize(engine_name,keyword,page=1)
      @engine_name = engine_name
      @engine = ENGINES[@engine_name].new
      @keyword = keyword
      @page = page
    end

    # URL of the result page, as built by the engine.
    def serp_url
      @engine.serp_url(@keyword,@page)
    end

    # Downloads the result page and parses it; returns the result hash.
    def search
      html = Crawler.get_serp(serp_url).body
      parse html
    end

    # Parses html (the string is re-encoded in place) by calling every
    # engine method whose name starts with "_parse_". The result hash is
    # keyed by the method name without that prefix. The html and doc readers
    # expose the parsed input afterwards.
    #
    # When an engine method raises, the html is dumped to a file under /tmp,
    # a report request is printed and a RuntimeError is raised.
    def parse(html)
      html = html.encode!('UTF-8','UTF-8',:invalid => :replace)
      @file = Hash.new
      @result = Hash.new

      @html = @file[:html] = html
      @doc = @file[:doc] = Nokogiri::HTML(html)

      @engine.methods.each do |m|
        next unless m =~ /^_parse_/
        begin
          @result[m.to_s.sub('_parse_','').to_sym] = @engine.send m,@file
        rescue StandardError => e
          issue_file = "/tmp/serper_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
          # Block form closes the dump file before the error is raised.
          File.open(issue_file,'w') { |dump| dump.puts(html) }
          puts "Notice:"
          puts "Serper gem have a bug, please email to zmingqian@qq.com to report it."
          puts "Please attach file #{issue_file} in the email and the error information below, thanks!"
          puts e.message
          puts e.inspect
          puts e.backtrace
          raise "Serper Parser Get An Error!"
        end
      end

      @result
    end


    # Builds the weighted entries of the last parsed page.
    # For each side (left, right) the parts listed in the engine's
    # weight_config are processed in order, threading side_rank through the
    # engine's _weight_of_<part> methods. Every entry is tagged with :side and
    # :part, and :weight is finally set to the entry's share of the summed
    # :side_weight values.
    def weights
      result = []
      [:left,:right].each do |side|
        side_rank = 0

        @engine.weight_config["#{side}_parts".to_sym].each do |part|
          rs,side_rank = @engine.send("_weight_of_#{part}",@result,side_rank)

          rs.each do |r|
            r[:side] = side.to_s
            r[:part] = part

            # NOTE(review): the _weight_of_* methods shipped with this gem set
            # :side_weight, not :weight, so this product is 0.0 and is then
            # overwritten by normalize below -- the left/right part weights
            # currently have no effect. Confirm the intended formula.
            r[:weight] = r[:weight].to_f * @engine.weight_config["#{side}_part_weight".to_sym].to_f
            result << r
          end
        end
      end
      Serper::Helper.normalize(result,:side_weight,:weight)
    end
  end
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
---
|
2
|
+
- 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/527 (KHTML, like Gecko,
|
3
|
+
Safari/419.3) Arora/0.6 (Change: )'
|
4
|
+
- Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3)
|
5
|
+
Arora/0.8.0
|
6
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser;
|
7
|
+
.NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET
|
8
|
+
CLR 3.0.04506.30)
|
9
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9
|
10
|
+
Chrome/17.0.939.0 Safari/535.8
|
11
|
+
- Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0
|
12
|
+
Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0
|
13
|
+
- Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1
|
14
|
+
Safari/536.3
|
15
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0
|
16
|
+
Safari/536.6
|
17
|
+
- Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0
|
18
|
+
Safari/536.6
|
19
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1
|
20
|
+
Safari/537.1
|
21
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0
|
22
|
+
Safari/537.36
|
23
|
+
- Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0
|
24
|
+
Safari/537.36
|
25
|
+
- Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0
|
26
|
+
- Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1
|
27
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1
|
28
|
+
- Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0
|
29
|
+
- Mozilla/5.0 (Windows NT 6.2; rv:19.0) Gecko/20121129 Firefox/19.0
|
30
|
+
- Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0
|
31
|
+
- Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0
|
32
|
+
- Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)
|
33
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR
|
34
|
+
2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Maxthon
|
35
|
+
2.0)
|
36
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko)
|
37
|
+
Maxthon/3.0.8.2 Safari/533.1
|
38
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Maxthon/4.0.0.2000
|
39
|
+
Chrome/22.0.1229.79 Safari/537.1
|
40
|
+
- Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
|
41
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
|
42
|
+
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727;
|
43
|
+
.NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)
|
44
|
+
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
|
45
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)
|
46
|
+
- Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
|
47
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)
|
48
|
+
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
|
49
|
+
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)
|
50
|
+
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)
|
51
|
+
- Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media
|
52
|
+
Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)
|
53
|
+
- Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
|
54
|
+
- Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)
|
55
|
+
- Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1;
|
56
|
+
.NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0
|
57
|
+
- Opera/9.25 (Windows NT 6.0; U; en)
|
58
|
+
- Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10
|
59
|
+
- Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00
|
60
|
+
- Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01
|
61
|
+
- Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10
|
62
|
+
- Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00
|
63
|
+
- Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
|
64
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like
|
65
|
+
Gecko) Version/4.0.4 Safari/531.21.10
|
66
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like
|
67
|
+
Gecko) Version/5.0.1 Safari/533.17.8
|
68
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like
|
69
|
+
Gecko) Version/5.0.2 Safari/533.18.5
|
70
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.2; es-US ) AppleWebKit/540.0 (KHTML like Gecko)
|
71
|
+
Version/6.0 Safari/8900.00
|
72
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like
|
73
|
+
Firefox/3.x) SeaMonkey/2.0.12
|
74
|
+
- Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1
|
75
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20120422 Firefox/12.0 SeaMonkey/2.9
|
76
|
+
- Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)
|
77
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko)
|
78
|
+
Chrome/4.0.249.0 Safari/532.5
|
79
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko)
|
80
|
+
Chrome/5.0.310.0 Safari/532.9
|
81
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko)
|
82
|
+
Chrome/7.0.514.0 Safari/534.7
|
83
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like
|
84
|
+
Gecko) Chrome/9.0.601.0 Safari/534.14
|
85
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like
|
86
|
+
Gecko) Chrome/10.0.601.0 Safari/534.14
|
87
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like
|
88
|
+
Gecko) Chrome/11.0.672.2 Safari/534.20
|
89
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0
|
90
|
+
Safari/534.27
|
91
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24
|
92
|
+
Safari/535.1
|
93
|
+
- Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120
|
94
|
+
Safari/535.2
|
95
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36
|
96
|
+
Safari/535.7
|
97
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421
|
98
|
+
Minefield/3.0.2pre
|
99
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10
|
100
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11
|
101
|
+
(.NET CLR 3.5.30729)
|
102
|
+
- Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6
|
103
|
+
GTB5
|
104
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8
|
105
|
+
( .NET CLR 3.5.30729; .NET4.0E)
|
106
|
+
- Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
|
107
|
+
- Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
|
108
|
+
- Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0
|
109
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2
|
110
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1
|
111
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1
|
112
|
+
- Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre
|
113
|
+
- Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )
|
114
|
+
- Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)
|
115
|
+
- Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a
|
116
|
+
- Mozilla/2.02E (Win95; U)
|
117
|
+
- Mozilla/3.01Gold (Win95; I)
|
118
|
+
- Mozilla/4.8 [en] (Windows NT 5.1; U)
|
119
|
+
- Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)
|
120
|
+
- Opera/7.50 (Windows XP; U)
|
121
|
+
- Opera/7.50 (Windows ME; U) [en]
|
122
|
+
- Opera/7.51 (Windows NT 5.1; U) [en]
|
123
|
+
- Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0
|
124
|
+
- Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.2b) Gecko/20021001 Phoenix/0.2
|
125
|
+
- Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.23) Gecko/20090825 SeaMonkey/1.1.18
|
126
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
|
127
|
+
Camino/2.2.1
|
128
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre
|
129
|
+
Camino/2.2a1pre
|
130
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko)
|
131
|
+
Chrome/19.0.1063.0 Safari/536.3
|
132
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.4 (KHTML like Gecko)
|
133
|
+
Chrome/22.0.1229.79 Safari/537.4
|
134
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.31 (KHTML like Gecko)
|
135
|
+
Chrome/26.0.1410.63 Safari/537.31
|
136
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 1083) AppleWebKit/537.36 (KHTML like Gecko)
|
137
|
+
Chrome/28.0.1469.0 Safari/537.36
|
138
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1
|
139
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0
|
140
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:20.0) Gecko/20100101 Firefox/20.0
|
141
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0
|
142
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US) AppleWebKit/528.16 (KHTML, like
|
143
|
+
Gecko, Safari/528.16) OmniWeb/v622.8.0.112941
|
144
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/528.16 (KHTML,
|
145
|
+
like Gecko, Safari/528.16) OmniWeb/v622.8.0
|
146
|
+
- Opera/9.20 (Macintosh; Intel Mac OS X; U; en)
|
147
|
+
- Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.6.30 Version/10.61
|
148
|
+
- Opera/9.80 (Macintosh; Intel Mac OS X 10.4.11; U; en) Presto/2.7.62 Version/11.00
|
149
|
+
- Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52
|
150
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML,
|
151
|
+
like Gecko) Version/4.0.4 Safari/531.21.10
|
152
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15 (KHTML,
|
153
|
+
like Gecko) Version/5.0.3 Safari/533.19.4
|
154
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML,
|
155
|
+
like Gecko) Version/5.0.4 Safari/533.20.27
|
156
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7; en-us) AppleWebKit/534.20.8 (KHTML,
|
157
|
+
like Gecko) Version/5.1 Safari/534.20.8
|
158
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like
|
159
|
+
Gecko) Version/5.1.3 Safari/534.53.10
|
160
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/536.26.17 (KHTML like
|
161
|
+
Gecko) Version/6.0.2 Safari/536.26.17
|
162
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; rv:10.0.1) Gecko/20100101 Firefox/10.0.1
|
163
|
+
SeaMonkey/2.7.1
|
164
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML,
|
165
|
+
like Gecko) Chrome/4.0.302.2 Safari/532.8
|
166
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML,
|
167
|
+
like Gecko) Chrome/6.0.464.0 Safari/534.3
|
168
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML,
|
169
|
+
like Gecko) Chrome/9.0.597.15 Safari/534.13
|
170
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko)
|
171
|
+
Chrome/14.0.835.186 Safari/535.1
|
172
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko)
|
173
|
+
Chrome/15.0.874.54 Safari/535.2
|
174
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko)
|
175
|
+
Chrome/16.0.912.36 Safari/535.7
|
176
|
+
- 'Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 '
|
177
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624
|
178
|
+
Firefox/3.5
|
179
|
+
- Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218
|
180
|
+
AlexaToolbar/alxf-2.0 Firefox/3.6.14
|
181
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
|
182
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0
|
183
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0
|
metadata
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: serper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- MingQian Zhang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: httparty
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: domainatrix
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: activerecord
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: docopt
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-progressbar
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Parse SERP result page.
|
98
|
+
email:
|
99
|
+
- zmingqian@qq.com
|
100
|
+
executables:
|
101
|
+
- serper
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- lib/serper/analyser.rb
|
106
|
+
- lib/serper/baidu/crawler.rb
|
107
|
+
- lib/serper/baidu/parser.rb
|
108
|
+
- lib/serper/baidu/weight.rb
|
109
|
+
- lib/serper/crawler.rb
|
110
|
+
- lib/serper/helper.rb
|
111
|
+
- lib/serper/parser.rb
|
112
|
+
- lib/serper/version.rb
|
113
|
+
- lib/serper.rb
|
114
|
+
- bin/serper
|
115
|
+
- README.md
|
116
|
+
- lib/serper/user_agents.yml
|
117
|
+
homepage: https://github.com/semseo/serper
|
118
|
+
licenses:
|
119
|
+
- MIT
|
120
|
+
metadata: {}
|
121
|
+
post_install_message:
|
122
|
+
rdoc_options: []
|
123
|
+
require_paths:
|
124
|
+
- lib
|
125
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - '>='
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
|
+
requirements:
|
132
|
+
- - '>='
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: '0'
|
135
|
+
requirements: []
|
136
|
+
rubyforge_project:
|
137
|
+
rubygems_version: 2.0.0
|
138
|
+
signing_key:
|
139
|
+
specification_version: 4
|
140
|
+
summary: SERP
|
141
|
+
test_files: []
|