sofi-searcher 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/searcher/bing.rb +41 -0
- data/lib/searcher/china_searcher.rb +78 -0
- data/lib/searcher/global.rb +86 -0
- data/lib/searcher/google.rb +31 -0
- data/lib/searcher.rb +55 -0
- metadata +49 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
|
2
|
+
require 'searcher/global'
|
3
|
+
# Queries the Bing Azure Datamarket web-search API for +keyword+ and
# returns the raw response body (an Atom/XML String).
#
# keyword - search term (String); percent-encoded before use.
# page    - accepted for interface compatibility with the other searchers;
#           NOTE(review): currently unused — the request always fetches the
#           first result page ($skip=0).
#
# SECURITY NOTE: the API account key is hard-coded below; it should be
# moved to configuration / an environment variable.
def search_from_bing(keyword, page = 2)
  account_key = 'Onm2ZtMfIJsKdKLopx6/VpyADuqrdJPhsacwUuez7Ds='

  # BUG FIX: URI.encode was removed in Ruby 3.0; use the supported
  # query-component escape instead.
  query = URI.encode_www_form_component(keyword)
  uri = URI('https://api.datamarket.azure.com/Bing/Search/Web?Query=%27' + query + '%27' + '&$skip=0')

  req = Net::HTTP::Get.new(uri.request_uri)
  # The Datamarket API authenticates with an empty user name and the
  # account key as the password.
  req.basic_auth('', account_key)

  res = Net::HTTP.start(uri.hostname, uri.port, :use_ssl => uri.scheme == 'https') { |http|
    http.request(req)
  }

  res.body
end
|
17
|
+
|
18
|
+
# Searches Bing for +keyword+ and extracts the result URLs.
#
# Returns an Array of [url, "bing"] pairs, one per <d:Url> element found in
# the API response.
def get_list_from_bing(keyword, page = 2)
  content = search_from_bing(keyword, page)

  super_link = []
  # Each result URL is wrapped in a <d:Url>...</d:Url> element.
  content.scan(/<d:Url.*?<\/d:Url>/).each do |node|
    # Pull the "http..."-prefixed URL out of the element text; the trailing
    # '<' captured by the lazy match is stripped.
    real_url = node.match(/http.*?</).to_s.delete('<')
    # BUG FIX: the original pushed the bare URL string *and* the
    # [url, "bing"] pair, leaving mixed element types in the result array.
    # Only the [url, source] pairs are kept now.
    super_link << [real_url, "bing"]
    #Global.save_link_info(real_url, 'bing')
  end
  super_link
end
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Driver for Chinese search engines (baidu / sogou / 360) that share a
# "base-url + keyword + paging-parameter" URL shape.
class Searcher::ChinaSearcher
  require 'searcher/global'

  # name      - searcher identifier, e.g. 'baidu' (String)
  # url       - base search URL ending just before the keyword (String)
  # page_no   - per-page increment of the paging parameter (String)
  # page_name - name of the paging query parameter (String)
  attr_accessor :name, :url, :page_no, :page_name

  # Default number of result pages to fetch.
  PAGE_NUM = 2

  def initialize(name, url, page_no = '1', page_name = 'pn')
    @name = name
    @url = url
    @page_no = page_no
    @page_name = page_name
  end

  # Fetches every page for +keyword+ and returns the concatenated response
  # bodies as a single String.
  def search_keywords(keyword, page = PAGE_NUM)
    res = ''
    keyword_urls(keyword, page).each do |url|
      res += Net::HTTP.get_response(URI.parse(url)).body
    end
    res
  end

  # Builds the list of page URLs for +keyword+.
  # Baidu counts results from 0; the other engines count pages from 1.
  def keyword_urls(keyword, page = PAGE_NUM)
    i = 'baidu' == self.name ? 0 : 1
    sites = []
    loop do
      # BUG FIX: URI.encode was removed in Ruby 3.0; use the supported
      # query-component escape instead.
      url_with_keyword = self.url + URI.encode_www_form_component(keyword) +
                         '&' + self.page_name + '=' + i.to_s
      sites << url_with_keyword
      i += page_no.to_i
      break if i > (page * self.page_no.to_i)
    end
    sites
  end

  # Fetches the result pages and extracts outgoing result links.
  # Returns an Array of [url, name] pairs.
  def get_list(keyword, page = PAGE_NUM)
    content = search_keywords(keyword, page)
    super_link = []

    #Global.save_to_file(content,self.name + '.html','/htmls')

    content.scan(/<a.*?href.*?<\/a>/).each do |anchor|
      # Result anchors highlight the keyword with <em>; skip the rest
      # (navigation, ads, ...).
      next unless anchor.index('<em>')
      url = /"http.*?"/.match(anchor)
      next unless url
      string_url = url.to_s.delete('"')
      # Resolve/validate the (possibly redirecting) result URL.
      redirect_url = Global.html_get_web_url(string_url)
      super_link << [redirect_url, self.name] if redirect_url
      #Global.save_link_info(redirect_url,self.name)
    end
    super_link
  end

  class << self
    # Collects the keyword URLs across several searcher instances into one
    # flat Array.
    def keyword_urls(names, keyword, page = PAGE_NUM)
      urls = []
      names.each do |searcher|
        searcher.keyword_urls(keyword, page).each do |url|
          urls << url
        end
      end
      urls
    end
  end
end
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Shared HTTP / persistence helpers used by the individual searchers.
module Global

  require 'ap' # gem install awesome_print
  require 'json'
  require 'nokogiri'
  require 'forkmanager' # gem install parallel-forkmanager
  require 'beanstalk-client'
  require 'net/http'
  require 'uri'
  Dir[ "./core/*.rb" ].each { |file| require(file) }

  UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'

  # Checks that +url+ is reachable and returns it on HTTP success, the
  # redirect target on 3xx, or nil on any other status or on error.
  #
  # redirect_limit guards the (currently disabled) recursive redirect
  # following below.
  def self.html_get_web_url(url, user_agent = UserAgent, timeout = 20, redirect_limit = 3)
    raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
    begin
      # NOTE(review): the original decoded the URL with URI.decode before
      # parsing; URI.decode was removed in Ruby 3.0, so the URL is now
      # parsed as-is — confirm no caller passes double-encoded URLs.
      response = Net::HTTP.get_response(URI.parse(url))

      case response
      when Net::HTTPSuccess then
        url
      when Net::HTTPRedirection then
        # Redirect following is deliberately disabled (see commented call);
        # the redirect target itself is returned instead.
        response['location']
        #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
      else
        nil
      end
    rescue
      # BUG FIX: the original returned e.message here — a truthy String —
      # which callers that only check for nil then treated as a valid URL.
      nil
    end
  end

  # Performs a GET for +url+ with the given User-Agent and read timeout and
  # returns the Net::HTTPResponse.
  def self.get_whole_response(url, user_agent = UserAgent, timeout = 20)
    uri = URI.parse(url)
    # BUG FIX: the original built "path + '?' + query", which produced an
    # empty/invalid request target for bare-host URLs and a trailing '?'
    # when there was no query string; request_uri handles both cases.
    req = Net::HTTP::Get.new(uri.request_uri)
    req.add_field('User-Agent', user_agent)
    Net::HTTP.start(uri.host, uri.port) do |http|
      http.read_timeout = timeout
      http.request(req)
    end
  end

  # Converts a response body to UTF-8, guessing the charset from the page's
  # <meta charset=...> tag and defaulting to GB18030 (these are mostly
  # Chinese search pages).
  def self.get_whole_html(res, user_agent = UserAgent, timeout = 20)
    encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
    encoding = encoding ? encoding[0].upcase : 'GB18030'
    # GB2312/GBK are subsets of GB18030, which Ruby can transcode directly.
    html = 'UTF-8'==encoding ? res.body : res.body.force_encoding('GB2312'==encoding || 'GBK'==encoding ? 'GB18030' : encoding).encode('UTF-8')
  end

  # Follows Location headers until a non-redirecting URL is reached and
  # returns that final URL.
  def self.get_final_url_from_response(url, user_agent = UserAgent, timeout = 20)
    res = get_whole_response(url, user_agent, timeout)
    location = res.header['location']
    # BUG FIX: the original recursed with the *same* url whenever a
    # Location header was present, recursing forever; recurse on the
    # redirect target instead.
    location ? get_final_url_from_response(location, user_agent, timeout) : url
  end

  # Appends +url+ to ./<path>/<info_type>.txt.
  def self.save_link_info(url, info_type = 'baidu', path = '/link_infos')
    save_to_file(url, "#{info_type}.txt", path)
    #into DB ... some code ...
  end

  # Appends +content+ to ./<path>/<file_name>, creating the directory on
  # first use.
  def self.save_to_file(content, file_name, path = '/link_infos')
    dir = ".#{path}/"
    Dir.mkdir(dir) if !Dir.exist?(dir)
    # Block form guarantees the handle is closed even if puts raises.
    File.open(dir + file_name, 'a') do |logfile|
      logfile.puts(content)
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'searcher/global'
|
2
|
+
|
3
|
+
# Queries the Google Custom Search JSON API for +keyword+ and collects the
# result links.
#
# keyword - search term (String); percent-encoded before use.
# page    - number of API requests to issue; the loop index is sent as the
#           'start' parameter. NOTE(review): Google's 'start' is a result
#           offset (1, 11, 21, ...), not a page number, so successive pages
#           may overlap — confirm against the API docs.
#
# Returns an Array of [url, "google"] pairs.
#
# SECURITY NOTE: the API key and search-engine id are hard-coded below and
# should be moved to configuration.
def search_from_google(keyword, page = 2)
  links = []
  (1..page).each do |pn|
    # BUG FIX: URI.encode was removed in Ruby 3.0; use the supported
    # query-component escape instead.
    url_with_keyword = 'https://www.googleapis.com/customsearch/v1?key=AIzaSyBvybq0NEaMtMkAkPUd7hhC-17AzcOc9x8&cx=013036536707430787589:_pqjad5hr1a&alt=json&fields=items(link)&q=' + URI.encode_www_form_component(keyword) + '&start=' + pn.to_s
    url = URI.parse(url_with_keyword)
    http = Net::HTTP.new(url.host, url.port)
    http.use_ssl = true
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    request = Net::HTTP::Get.new(url.request_uri)
    response = http.request(request)

    links_strings = JSON.parse(response.body) rescue nil
    # BUG FIX: the original called links_strings['items'].each directly and
    # crashed with NoMethodError whenever the body was not valid JSON
    # (links_strings is nil after the rescue) or carried no 'items' key;
    # such responses are skipped now.
    next unless links_strings && links_strings['items']
    links_strings['items'].each do |link|
      links << [link['link'], "google"]
      #Global.save_link_info(link['link'], 'google')
    end
  end
  links
end
|
23
|
+
|
24
|
+
|
25
|
+
# Public entry point used by Searcher for the 'google' backend; it simply
# delegates to search_from_google and returns its [url, "google"] pairs.
def get_list_from_google(keyword, page = 2)
  search_from_google(keyword, page)
end
|
30
|
+
|
31
|
+
|
data/lib/searcher.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Facade that fans a keyword search out over several search engines and
# merges the results.
class Searcher
  # Backends implemented as free functions (get_list_from_<name>).
  UsSearchers = ['google', 'bing'].freeze
  # Backends implemented via ChinaSearcher instances.
  ChinaSearchers = ['baidu', 'sogou', 'so360'].freeze
  AllSearchers = (UsSearchers + ChinaSearchers).freeze

  class << self
    # Runs +keywords+ through every searcher in +searchers+ and returns the
    # combined Array of [url, source] pairs. Prints the elapsed time.
    def get_infos_from_searches(keywords, page = 1, searchers = AllSearchers)
      infos = []
      start_time = Time.now
      searchers.each do |searcher|
        infos += send("get_info_from_" + searcher, keywords, page)
      end
      p "after search #{(Time.now - start_time).round(4)}"
      infos
    end

    # Define get_info_from_<name> for every backend: US backends dispatch to
    # the free get_list_from_<name> functions, Chinese backends to the
    # matching memoized ChinaSearcher instance below.
    AllSearchers.each do |search|
      define_method "get_info_from_#{search}" do |keywords, page = 1|
        if UsSearchers.include?(search)
          send("get_list_from_" + search, keywords, page)
        else
          send(search).get_list(keywords, page)
        end
      end
    end

    # BUG FIX: the three accessors below used plain assignment (@x = ...),
    # which rebuilt a fresh ChinaSearcher on every call despite caching the
    # result in an ivar; ||= makes the memoization actually take effect.
    def baidu
      @baidu ||= ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=', '10')
    end

    def sogou
      @sogou ||= ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1', 'page')
    end

    def so360
      @so360 ||= ChinaSearcher.new('360', 'http://www.so.com/s?&q=')
    end

    # the result from baidu contains many redirect urls, So when add it,
    # the run time will be longer and longer
    def china_searchers
      [baidu, sogou, so360]
      #[sogou,so360]
    end
  end
end
|
52
|
+
require 'searcher/china_searcher'
|
53
|
+
require 'searcher/bing'
|
54
|
+
require 'searcher/google'
|
55
|
+
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sofi-searcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Zhimeng Sun
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-09 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Just a simple Searcher
|
15
|
+
email: 532681765@qq.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/searcher.rb
|
21
|
+
- lib/searcher/bing.rb
|
22
|
+
- lib/searcher/china_searcher.rb
|
23
|
+
- lib/searcher/global.rb
|
24
|
+
- lib/searcher/google.rb
|
25
|
+
homepage: http://rubygems.org/gems/sofi-searcher
|
26
|
+
licenses: []
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
none: false
|
33
|
+
requirements:
|
34
|
+
- - ! '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ! '>='
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
requirements: []
|
44
|
+
rubyforge_project:
|
45
|
+
rubygems_version: 1.8.25
|
46
|
+
signing_key:
|
47
|
+
specification_version: 3
|
48
|
+
summary: Just a searcher
|
49
|
+
test_files: []
|