sofi-searcher 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/searcher/bing.rb +41 -0
- data/lib/searcher/china_searcher.rb +78 -0
- data/lib/searcher/global.rb +86 -0
- data/lib/searcher/google.rb +31 -0
- data/lib/searcher.rb +55 -0
- metadata +49 -0
data/lib/searcher/bing.rb
ADDED
@@ -0,0 +1,41 @@
+
+require 'searcher/global'
+def search_from_bing(keyword,page=2)
+  account_key = 'Onm2ZtMfIJsKdKLopx6/VpyADuqrdJPhsacwUuez7Ds='
+  bing_keyword = 'https://api.datamarket.azure.com/Bing/Search/Web?Query=%27' + URI.encode(keyword) + '%27' + '&$skip=0'
+  uri = URI(bing_keyword)
+
+  req = Net::HTTP::Get.new(uri.request_uri)
+  req.basic_auth('', account_key)
+
+  res = Net::HTTP.start(uri.hostname, uri.port, :use_ssl => uri.scheme == 'https') { |http|
+    http.request(req)
+  }
+
+  res.body
+end
+
+def get_list_from_bing(keyword,page=2)
+  content = search_from_bing(keyword,page)
+
+  super_link = Array.new
+  regex = /<d:Url.*?<\/d:Url>/
+  #Global.save_to_file(content,'bing.html','/htmls')
+
+  content.scan(regex).each do |n|
+    regex_http = /http.*?</
+    real_url = n.match(regex_http)
+    real_url = real_url.to_s.delete('<')
+    super_link.push(real_url)
+    #Global.save_link_info(real_url, 'bing')
+    super_link << [real_url,"bing"]
+  end
+  super_link
+end
+
+
+
+
+
+
+
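A minimal usage sketch for the file above, assuming a Ruby of the gem's era (URI.encode was removed in Ruby 3.0) and a valid Azure Datamarket account key; the Datamarket Bing API itself has since been retired, so this illustrates the call shape rather than a live query. Note that the scan loop pushes each result twice, once as a bare URL string and once as a [url, "bing"] pair, so callers receive a mixed-shape array:

require 'searcher'

results = get_list_from_bing('ruby', 2)   # 'ruby' is an illustrative keyword
results.each do |entry|
  if entry.is_a?(Array)
    url, engine = entry                   # the [url, "bing"] pairs
    puts "#{engine}: #{url}"
  else
    puts "bare: #{entry}"                 # the bare strings pushed just before
  end
end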
data/lib/searcher/china_searcher.rb
ADDED
@@ -0,0 +1,78 @@
+class Searcher::ChinaSearcher
+  require 'searcher/global'
+  attr_accessor :name,:url,:page_no,:page_name
+  PAGE_NUM = 2
+
+  def initialize(name,url,page_no='1',page_name='pn')
+
+    @name = name
+    @url = url
+    @page_no = page_no
+    @page_name = page_name
+
+  end
+
+  def search_keywords(keyword,page=PAGE_NUM)
+    res = ''
+    keyword_urls(keyword,page).each do |url|
+      res += Net::HTTP.get_response(URI.parse(url)).body
+    end
+    res
+  end
+
+
+  def keyword_urls (keyword,page=PAGE_NUM)
+    i = 'baidu' == self.name ? 0 : 1
+    sites = []
+    loop do
+      url_with_keyword = self.url + URI.encode(keyword) + '&' + self.page_name + '=' + i.to_s
+      sites << url_with_keyword
+      i += page_no.to_i
+      break if i > (page * self.page_no.to_i)
+    end
+    sites
+  end
+
+
+  def get_list(keyword,page=PAGE_NUM)
+    content = search_keywords(keyword,page)
+    super_link = Array.new
+    regex = /<a.*?href.*?<\/a>/
+
+    #Global.save_to_file(content,self.name + '.html','/htmls')
+
+    content.scan(regex).each do |n|
+      if n.index('<em>') != nil
+        url = /"http.*?"/.match(n)
+        if url != nil
+          string_url = url.to_s.delete('"')
+          redirect_url = Global.html_get_web_url(string_url)
+          if redirect_url != nil
+            super_link << [redirect_url,self.name]
+            #Global.save_link_info(redirect_url,self.name)
+          end
+        end
+      end
+    end
+    super_link
+  end
+
+  class << self
+    def keyword_urls(names,keyword,page=PAGE_NUM)
+      urls = []
+      names.each do |name|
+        name.keyword_urls(keyword,page).each do |url|
+          urls << url
+        end
+      end
+      urls
+    end
+  end
+
+end
+
+
+
+
+
+
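keyword_urls needs no network access, so its pagination behavior is easy to check in isolation (again on a pre-3.0 Ruby, since it calls URI.encode). Because baidu starts its counter at 0 and steps by its page_no of 10, the default two pages actually yield three URLs (pn=0, 10, 20), while sogou steps by 1 and yields exactly two. A sketch with an illustrative keyword, using the same constructor arguments as Searcher.baidu and Searcher.sogou below:

require 'searcher'

baidu = Searcher::ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=', '10')
sogou = Searcher::ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1', 'page')

p baidu.keyword_urls('ruby')
# => ["http://www.baidu.com/s?wd=ruby&pn=0",
#     "http://www.baidu.com/s?wd=ruby&pn=10",
#     "http://www.baidu.com/s?wd=ruby&pn=20"]
p sogou.keyword_urls('ruby')
# => ["http://www.sogou.com/web?query=ruby&page=1",
#     "http://www.sogou.com/web?query=ruby&page=2"]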
data/lib/searcher/global.rb
ADDED
@@ -0,0 +1,86 @@
+module Global
+
+
+  require 'ap' # gem install awesome_print
+  require 'json'
+  require 'nokogiri'
+  require 'forkmanager' # gem install parallel-forkmanager
+  require 'beanstalk-client'
+  require 'net/http'
+  require 'uri'
+  Dir[ "./core/*.rb" ].each { |file| require(file) }
+
+  UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
+
+  def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
+    # get_final_url_from_response().
+
+    raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
+    begin
+      response = Net::HTTP.get_response(URI.parse(URI.decode(url)))
+
+      case response
+      when Net::HTTPSuccess then
+        url
+      when Net::HTTPRedirection then
+        location = response['location']
+        #warn "redirected to #{location}"
+        #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
+      else
+        nil
+      end
+    rescue => e
+      e.message
+    end
+  end
+
+
+  def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
+    uri = URI.parse(url)
+    req = Net::HTTP::Get.new(uri.path + '?' + uri.query.to_s)
+    req.add_field('User-Agent', user_agent)
+    res = Net::HTTP.start(uri.host, uri.port) do |http|
+      http.read_timeout = timeout
+      http.request(req)
+    end
+  end
+
+
+  def self.get_whole_html(res,user_agent=UserAgent,timeout=20)
+
+    encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
+    encoding = encoding ? encoding[0].upcase : 'GB18030'
+    html = 'UTF-8'==encoding ? res.body : res.body.force_encoding('GB2312'==encoding || 'GBK'==encoding ? 'GB18030' : encoding).encode('UTF-8')
+
+  end
+
+
+
+  def self.get_final_url_from_response(url,user_agent=UserAgent,timeout=20)
+    res = get_whole_response(url,user_agent,timeout)
+    res.header['location'] ? get_final_url_from_response(url,user_agent,timeout) : url
+  end
+
+
+
+  def self.save_link_info(url,info_type='baidu',path='/link_infos')
+    save_to_file(url,"#{info_type}.txt",path)
+    #into DB ... some code ...
+  end
+
+
+  def self.save_to_file(content,file_name,path='/link_infos')
+    path = ".#{path}/"
+    Dir.mkdir(path) if !Dir.exist?(path)
+
+    logfile = File.open(path + file_name, 'a')
+    logfile.puts(content)
+    logfile.close
+  end
+
+
+
+
+
+
+end
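Two behaviors of Global are worth noting as written: html_get_web_url returns the original url on a 2xx response, returns the raw Location header value on a redirect (the recursive follow is commented out, so redirect_limit never decrements), and returns the exception message string rather than nil when the request raises. The file helpers are simpler; a short sketch, with the file contents purely illustrative:

require 'searcher'

# Appends the URL as one line to ./link_infos/baidu.txt, creating the
# ./link_infos directory on first use.
Global.save_link_info('http://example.com', 'baidu')

# The generic writer it delegates to; '/htmls' mirrors the commented-out
# Global.save_to_file calls in the searcher files.
Global.save_to_file('<html>...</html>', 'example.html', '/htmls')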
data/lib/searcher/google.rb
ADDED
@@ -0,0 +1,31 @@
+require 'searcher/global'
+
+def search_from_google(keyword,page=2)
+  res,links = '',[]
+  (1..page).each do |pn|
+    url_with_keyword = 'https://www.googleapis.com/customsearch/v1?key=AIzaSyBvybq0NEaMtMkAkPUd7hhC-17AzcOc9x8&cx=013036536707430787589:_pqjad5hr1a&alt=json&fields=items(link)&q=' + URI.encode(keyword) + '&start=' + pn.to_s
+    url = URI.parse(url_with_keyword)
+    http = Net::HTTP.new(url.host, url.port)
+    http.use_ssl = true
+    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    request = Net::HTTP::Get.new(url.request_uri)
+    response = http.request(request)
+    res += response.body
+    links_strings = JSON.parse(response.body) rescue nil
+    links_strings['items'].each do |link|
+      links << [link['link'],"google"]
+      #Global.save_link_info(link['link'], 'google')
+    end
+    #links
+  end
+  links
+end
+
+
+def get_list_from_google(keyword,page=2)
+  #content = search_from_google(keyword,page)
+  #Global.save_to_file(content,'google.html','/htmls')
+  search_from_google(keyword,page)
+end
+
+
data/lib/searcher.rb
ADDED
@@ -0,0 +1,55 @@
+class Searcher
+  UsSearchers = ['google','bing']
+  ChinaSearchers = ['baidu','sogou','so360']
+  AllSearchers = UsSearchers + ChinaSearchers
+  class << self
+    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
+      infos = []
+      start_time = Time.now
+      searchers.each do |searcher|
+        infos += send "get_info_from_" + searcher,keywords,page
+      end
+      p "after search #{(Time.now - start_time).round(4)}"
+      infos
+    end
+
+    AllSearchers.each do |search|
+      define_method "get_info_from_#{search}" do |keywords,page=1|
+        if UsSearchers.include?(search)
+          send "get_list_from_" + search,keywords,page
+        else
+          searcher = send(search)
+          return searcher.get_list(keywords,page)
+        end
+      end
+    end
+
+    def baidu
+      @baidu = ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
+    end
+
+    def sogou
+      @sogou = ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1','page')
+    end
+
+    def so360
+      @so360 = ChinaSearcher.new('360','http://www.so.com/s?&q=')
+    end
+
+    def china_searchers
+      [baidu,sogou,so360] # baidu results contain many redirect urls, so including it makes each run take longer
+      #[sogou,so360]
+    end
+
+    #Searcher.china_searchers.each do |searcher|
+    #define_method "get_info_from_#{searcher.name}" do |keywords,page=1|
+    #  searcher.get_list(keywords,page)
+    #end
+    #end
+  end
+
+end
+require 'searcher/china_searcher'
+require 'searcher/bing'
+require 'searcher/google'
+
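The class above is the gem's entry point: get_infos_from_searches dispatches to a generated get_info_from_* method per engine name. A sketch of the intended call, assuming network access, working API keys, and a pre-3.0 Ruby; the third argument defaults to AllSearchers:

require 'searcher'

# Query only the China engines; each entry comes back as [url, engine_name].
infos = Searcher.get_infos_from_searches('ruby', 1, Searcher::ChinaSearchers)
infos.each { |url, engine| puts "#{engine}: #{url}" }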
metadata
ADDED
@@ -0,0 +1,49 @@
+--- !ruby/object:Gem::Specification
+name: sofi-searcher
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+prerelease:
+platform: ruby
+authors:
+- Zhimeng Sun
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-04-09 00:00:00.000000000 Z
+dependencies: []
+description: Just a simple Searcher
+email: 532681765@qq.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/searcher.rb
+- lib/searcher/bing.rb
+- lib/searcher/china_searcher.rb
+- lib/searcher/global.rb
+- lib/searcher/google.rb
+homepage: http://rubygems.org/gems/sofi-searcher
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: Just a searcher
+test_files: []