sofi-searcher 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,41 @@
1
+
2
+ require 'searcher/global'
3
# Queries the Bing Search API (Azure Datamarket) for +keyword+ and returns
# the raw response body (an Atom/XML document).
#
# keyword - search term (URL-encoded before use)
# page    - accepted for interface symmetry with the other searchers;
#           NOTE(review): currently unused — $skip is fixed at 0, so only
#           the first result page is ever fetched. Confirm intent.
#
# Returns the HTTP response body as a String.
def search_from_bing(keyword, page = 2)
  # FIXME: API key hard-coded in source; move to configuration/ENV.
  account_key = 'Onm2ZtMfIJsKdKLopx6/VpyADuqrdJPhsacwUuez7Ds='

  # URI.encode was removed in Ruby 3.0; encode_www_form_component is the
  # supported way to escape a query value.
  bing_keyword = 'https://api.datamarket.azure.com/Bing/Search/Web?Query=%27' +
                 URI.encode_www_form_component(keyword) + '%27' + '&$skip=0'
  uri = URI(bing_keyword)

  req = Net::HTTP::Get.new(uri.request_uri)
  # Datamarket auth scheme: empty user name, account key as password.
  req.basic_auth('', account_key)

  res = Net::HTTP.start(uri.hostname, uri.port, :use_ssl => uri.scheme == 'https') do |http|
    http.request(req)
  end

  res.body
end
17
+
18
# Fetches Bing results for +keyword+ and extracts the result URLs.
#
# keyword - search term
# page    - forwarded to search_from_bing
#
# Returns an Array of [url, "bing"] pairs, one per <d:Url> element found
# in the API response.
def get_list_from_bing(keyword, page = 2)
  content = search_from_bing(keyword, page)

  super_link = Array.new
  # Each result URL is wrapped in a <d:Url>...</d:Url> element.
  regex = /<d:Url.*?<\/d:Url>/
  # Global.save_to_file(content,'bing.html','/htmls')

  content.scan(regex).each do |n|
    # Grab everything from "http" up to the closing '<' of the element,
    # then strip the '<'.
    real_url = n.match(/http.*?</).to_s.delete('<')
    # BUG FIX: the original pushed the bare URL string *and* the
    # [url, source] pair into super_link, yielding a mixed-type array with
    # duplicate entries; every other searcher returns only [url, source]
    # pairs, so only the pair is kept.
    super_link << [real_url, "bing"]
    # Global.save_link_info(real_url, 'bing')
  end
  super_link
end
35
+
36
+
37
+
38
+
39
+
40
+
41
+
@@ -0,0 +1,78 @@
1
class Searcher::ChinaSearcher
  require 'searcher/global'

  # A generic scraper for Chinese search engines (baidu/sogou/360).
  #
  # name      - engine name ("baidu", "sogou", "360")
  # url       - base search URL ending at the query parameter, e.g. ".../s?wd="
  # page_no   - paging step as a String (baidu pages by 10, others by 1)
  # page_name - name of the paging query parameter ("pn", "page", ...)
  attr_accessor :name, :url, :page_no, :page_name

  # Default number of result pages to fetch.
  PAGE_NUM = 2

  def initialize(name, url, page_no = '1', page_name = 'pn')
    @name = name
    @url = url
    @page_no = page_no
    @page_name = page_name
  end

  # Fetches every result page for +keyword+ and returns the concatenated
  # HTML bodies as a single String.
  def search_keywords(keyword, page = PAGE_NUM)
    keyword_urls(keyword, page).reduce('') do |res, url|
      res + Net::HTTP.get_response(URI.parse(url)).body
    end
  end

  # Builds the list of result-page URLs for +keyword+.
  # Baidu indexes results from 0; the other engines count pages from 1.
  # Returns an Array of Strings.
  def keyword_urls(keyword, page = PAGE_NUM)
    step = page_no.to_i
    # BUG FIX: a non-positive step left `i` unchanged, so the original
    # `loop` never hit its break condition and spun forever.
    step = 1 if step < 1
    i = name == 'baidu' ? 0 : 1
    sites = []
    loop do
      # URI.encode was removed in Ruby 3.0; use encode_www_form_component.
      sites << "#{url}#{URI.encode_www_form_component(keyword)}&#{page_name}=#{i}"
      i += step
      break if i > (page * step)
    end
    sites
  end

  # Scrapes result links out of the fetched HTML for +keyword+.
  # Only anchors containing <em> (highlighted hits) are considered; these
  # engines link through redirectors, so each href is resolved via
  # Global.html_get_web_url before being recorded.
  # Returns an Array of [url, engine_name] pairs.
  def get_list(keyword, page = PAGE_NUM)
    content = search_keywords(keyword, page)
    super_link = Array.new
    # Global.save_to_file(content,self.name + '.html','/htmls')
    content.scan(/<a.*?href.*?<\/a>/).each do |anchor|
      next unless anchor.index('<em>')
      href = /"http.*?"/.match(anchor)
      next unless href
      redirect_url = Global.html_get_web_url(href.to_s.delete('"'))
      if redirect_url
        super_link << [redirect_url, name]
        # Global.save_link_info(redirect_url,self.name)
      end
    end
    super_link
  end

  class << self
    # Collects the result-page URLs of several searcher instances into one
    # flat Array.  +searchers+ is an Array of ChinaSearcher instances.
    def keyword_urls(searchers, keyword, page = PAGE_NUM)
      searchers.flat_map { |searcher| searcher.keyword_urls(keyword, page) }
    end
  end
end
73
+
74
+
75
+
76
+
77
+
78
+
@@ -0,0 +1,86 @@
1
module Global

  require 'ap' # gem install awesome_print
  require 'json'
  require 'nokogiri'
  require 'forkmanager' # gem install parallel-forkmanager
  require 'beanstalk-client'
  require 'net/http'
  require 'uri'
  Dir[ "./core/*.rb" ].each { |file| require(file) }

  # Default User-Agent sent with scraping requests.
  UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'

  # Resolves +url+, following up to +redirect_limit+ HTTP redirects, and
  # returns the final reachable URL.  Returns nil when the response is
  # neither a success nor a redirect, when the request raises, or when the
  # redirect chain is too deep (the ArgumentError from the recursive call
  # is caught by the outer rescue).
  def self.html_get_web_url(url, user_agent = UserAgent, timeout = 20, redirect_limit = 3)
    raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
    begin
      # NOTE: the original wrapped url in URI.decode, which was removed in
      # Ruby 3.0; the raw URL string is parsed directly now.
      response = Net::HTTP.get_response(URI.parse(url))

      case response
      when Net::HTTPSuccess then
        url
      when Net::HTTPRedirection then
        # BUG FIX: the original assigned response['location'] and fell
        # through, returning the redirect *target* without ever following
        # it or decrementing the limit (the recursion was commented out).
        html_get_web_url(response['location'], user_agent, timeout, redirect_limit - 1)
      else
        nil
      end
    rescue
      # BUG FIX: the original returned e.message — a truthy String — which
      # callers checking `!= nil` then recorded as a valid URL.
      nil
    end
  end

  # Performs a GET on +url+ with the given User-Agent and read timeout and
  # returns the Net::HTTPResponse.
  def self.get_whole_response(url, user_agent = UserAgent, timeout = 20)
    uri = URI.parse(url)
    # BUG FIX: the original built `uri.path + '?' + uri.query.to_s`, which
    # produced a dangling "?" for URLs without a query string (and an empty
    # path for bare hosts); request_uri handles both correctly.
    req = Net::HTTP::Get.new(uri.request_uri)
    req.add_field('User-Agent', user_agent)
    Net::HTTP.start(uri.host, uri.port) do |http|
      http.read_timeout = timeout
      http.request(req)
    end
  end

  # Converts a response body to UTF-8.  The source encoding is sniffed from
  # the page's <meta charset> declaration, defaulting to GB18030 when none
  # is found (GB18030 is a superset of GB2312/GBK).
  def self.get_whole_html(res, user_agent = UserAgent, timeout = 20)
    encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
    encoding = encoding ? encoding[0].upcase : 'GB18030'
    return res.body if 'UTF-8' == encoding
    # Decode GB2312/GBK pages as GB18030 so the full character range maps.
    source = ('GB2312' == encoding || 'GBK' == encoding) ? 'GB18030' : encoding
    res.body.force_encoding(source).encode('UTF-8')
  end

  # Follows redirects reported by get_whole_response and returns the final
  # URL.  CAUTION: unlike html_get_web_url this has no depth limit.
  def self.get_final_url_from_response(url, user_agent = UserAgent, timeout = 20)
    res = get_whole_response(url, user_agent, timeout)
    location = res.header['location']
    # BUG FIX: the original recursed with the *original* url, so any
    # response carrying a Location header recursed forever; follow the
    # redirect target instead.
    location ? get_final_url_from_response(location, user_agent, timeout) : url
  end

  # Appends +url+ to ./<path>/<info_type>.txt, one link per line.
  def self.save_link_info(url, info_type = 'baidu', path = '/link_infos')
    save_to_file(url, "#{info_type}.txt", path)
    # into DB ... some code ...
  end

  # Appends +content+ as a line to ./<path>/<file_name>, creating the
  # directory on first use.
  def self.save_to_file(content, file_name, path = '/link_infos')
    path = ".#{path}/"
    Dir.mkdir(path) if !Dir.exist?(path)

    # BUG FIX: block form guarantees the handle is closed even if puts
    # raises; the original leaked the File object on error.
    File.open(path + file_name, 'a') { |logfile| logfile.puts(content) }
  end

end
@@ -0,0 +1,31 @@
1
+ require 'searcher/global'
2
+
3
# Queries the Google Custom Search JSON API for +keyword+ across +page+
# result pages and collects the returned links.
#
# Returns an Array of [url, "google"] pairs.
def search_from_google(keyword, page = 2)
  links = []
  (1..page).each do |pn|
    # FIXME: API key and cx are hard-coded in source; move to config/ENV.
    # NOTE(review): the API's `start` is a 1-based *result* index
    # (1, 11, 21, ...); passing the page number here requests overlapping
    # result windows — confirm intent.
    url_with_keyword = 'https://www.googleapis.com/customsearch/v1?key=AIzaSyBvybq0NEaMtMkAkPUd7hhC-17AzcOc9x8&cx=013036536707430787589:_pqjad5hr1a&alt=json&fields=items(link)&q=' +
                       URI.encode_www_form_component(keyword) + '&start=' + pn.to_s
    url = URI.parse(url_with_keyword)
    http = Net::HTTP.new(url.host, url.port)
    http.use_ssl = true
    # SECURITY: certificate verification is disabled; prefer VERIFY_PEER.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    request = Net::HTTP::Get.new(url.request_uri)
    response = http.request(request)
    links_strings = JSON.parse(response.body) rescue nil
    # BUG FIX: the original indexed links_strings['items'] without a nil
    # check, raising NoMethodError on unparsable responses and on result
    # pages with no 'items' key.
    next unless links_strings && links_strings['items']
    links_strings['items'].each do |link|
      links << [link['link'], "google"]
      # Global.save_link_info(link['link'], 'google')
    end
  end
  links
end
23
+
24
+
25
# Thin wrapper kept so every engine exposes the same get_list_from_*
# entry point; delegates straight to search_from_google.
def get_list_from_google(keyword, page = 2)
  # content = search_from_google(keyword,page)
  # Global.save_to_file(content,'google.html','/htmls')
  search_from_google(keyword, page)
end
30
+
31
+
data/lib/searcher.rb ADDED
@@ -0,0 +1,55 @@
1
class Searcher
  # Engines served by the free-standing get_list_from_* helper functions.
  UsSearchers = ['google','bing']
  # Engines scraped through ChinaSearcher instances.
  ChinaSearchers = ['baidu','sogou','so360']
  AllSearchers = UsSearchers + ChinaSearchers

  class << self
    # Runs +keywords+ through every engine in +searchers+ and returns the
    # combined Array of [url, engine_name] pairs.  Prints elapsed seconds.
    def get_infos_from_searches(keywords, page = 1, searchers = AllSearchers)
      infos = []
      start_time = Time.now
      searchers.each do |searcher|
        infos += send("get_info_from_#{searcher}", keywords, page)
      end
      p "after search #{(Time.now - start_time).round(4)}"
      infos
    end

    # Define get_info_from_<engine> for every engine: US engines dispatch
    # to the get_list_from_* helpers, Chinese engines to the matching
    # ChinaSearcher instance.
    AllSearchers.each do |search|
      define_method "get_info_from_#{search}" do |keywords, page = 1|
        if UsSearchers.include?(search)
          send("get_list_from_#{search}", keywords, page)
        else
          send(search).get_list(keywords, page)
        end
      end
    end

    # BUG FIX (all three accessors below): the originals assigned the ivar
    # with plain `=`, rebuilding a fresh ChinaSearcher on every call even
    # though the ivar clearly intended memoization; `||=` builds it once.
    def baidu
      @baidu ||= ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
    end

    def sogou
      @sogou ||= ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1','page')
    end

    def so360
      @so360 ||= ChinaSearcher.new('360','http://www.so.com/s?&q=')
    end

    def china_searchers
      [baidu,sogou,so360] # the result from baidu contains many redirect urls, So when add it , the run time will be longer and longer
      #[sogou,so360]
    end

    #Searcher.china_searchers.each do |searcher|
    #define_method "get_info_from_#{searcher.name}" do |keywords,page=1|
    #  searcher.get_list(keywords,page)
    #end
    #end
  end

end
52
+ require 'searcher/china_searcher'
53
+ require 'searcher/bing'
54
+ require 'searcher/google'
55
+
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sofi-searcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Zhimeng Sun
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-09 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Just a simple Searcher
15
+ email: 532681765@qq.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/searcher.rb
21
+ - lib/searcher/bing.rb
22
+ - lib/searcher/china_searcher.rb
23
+ - lib/searcher/global.rb
24
+ - lib/searcher/google.rb
25
+ homepage: http://rubygems.org/gems/sofi-searcher
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.25
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: Just a searcher
49
+ test_files: []