sofi-searcher 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+
2
+ require 'searcher/global'
3
# Queries the Bing Azure Datamarket Web Search API and returns the raw
# response body (Atom/XML) for +keyword+.
#
# keyword - String search term; percent-encoded before use.
# page    - accepted for interface parity with the other searchers, but
#           the request is currently pinned to the first page ($skip=0).
#           TODO: wire +page+ into the $skip offset.
#
# Returns the HTTP response body as a String.
def search_from_bing(keyword,page=2)
  # NOTE(review): API credential hard-coded in source — move to ENV/config.
  account_key = 'Onm2ZtMfIJsKdKLopx6/VpyADuqrdJPhsacwUuez7Ds='
  # Bug fix: URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape
  # is the drop-in replacement with the same escaping behavior.
  query = URI::DEFAULT_PARSER.escape(keyword)
  uri = URI('https://api.datamarket.azure.com/Bing/Search/Web?Query=%27' + query + '%27' + '&$skip=0')

  req = Net::HTTP::Get.new(uri.request_uri)
  # The Datamarket API authenticates with an empty user and the account
  # key as the password.
  req.basic_auth('', account_key)

  res = Net::HTTP.start(uri.hostname, uri.port, :use_ssl => uri.scheme == 'https') { |http|
    http.request(req)
  }

  res.body
end
17
+
18
# Extracts the result URLs from a Bing API response and returns them as
# an array of [url, "bing"] pairs (same shape as the other searchers).
#
# keyword - String search term, forwarded to search_from_bing.
# page    - forwarded to search_from_bing (currently unused there).
def get_list_from_bing(keyword,page=2)
  content = search_from_bing(keyword,page)

  super_link = []
  # Each result URL is wrapped in a <d:Url>...</d:Url> element of the
  # Atom payload.
  content.scan(/<d:Url.*?<\/d:Url>/).each do |node|
    match = node.match(/http.*?</)
    # Skip malformed elements instead of pushing an empty string.
    next if match.nil?
    real_url = match.to_s.delete('<')
    # Bug fix: the original pushed the bare URL string AND the
    # [url, engine] pair, producing a mixed-type result array; only the
    # pair form matches what ChinaSearcher#get_list returns.
    super_link << [real_url, "bing"]
  end
  super_link
end
35
+
36
+
37
+
38
+
39
+
40
+
41
+
@@ -0,0 +1,78 @@
1
+ class Searcher::ChinaSearcher
2
+ require 'searcher/global'
3
+ attr_accessor :name,:url,:page_no,:page_name
4
+ PAGE_NUM = 2
5
+
6
+ def initialize(name,url,page_no='1',page_name='pn')
7
+
8
+ @name = name
9
+ @url = url
10
+ @page_no = page_no
11
+ @page_name = page_name
12
+
13
+ end
14
+
15
+ def search_keywords(keyword,page=PAGE_NUM)
16
+ res = ''
17
+ keyword_urls(keyword,page).each do |url|
18
+ res += Net::HTTP.get_response(URI.parse(url)).body
19
+ end
20
+ res
21
+ end
22
+
23
+
24
+ def keyword_urls (keyword,page=PAGE_NUM)
25
+ i = 'baidu' == self.name ? 0 : 1
26
+ sites = []
27
+ loop do
28
+ url_with_keyword = self.url + URI.encode(keyword) + '&' + self.page_name + '=' + i.to_s
29
+ sites << url_with_keyword
30
+ i += page_no.to_i
31
+ break if i > (page * self.page_no.to_i)
32
+ end
33
+ sites
34
+ end
35
+
36
+
37
+ def get_list(keyword,page=PAGE_NUM)
38
+ content = search_keywords(keyword,page)
39
+ super_link = Array.new
40
+ regex = /<a.*?href.*?<\/a>/
41
+
42
+ #Global.save_to_file(content,self.name + '.html','/htmls')
43
+
44
+ content.scan(regex).each do |n|
45
+ if n.index('<em>') != nil
46
+ url =/"http.*?"/.match(n)
47
+ if url != nil
48
+ string_url = url.to_s.delete('"')
49
+ redirect_url = Global.html_get_web_url(string_url)
50
+ if redirect_url != nil
51
+ super_link << [redirect_url,self.name]
52
+ #Global.save_link_info(redirect_url,self.name)
53
+ end
54
+ end
55
+ end
56
+ end
57
+ super_link
58
+ end
59
+
60
+ class << self
61
+ def keyword_urls(names,keyword,page=PAGE_NUM)
62
+ urls = []
63
+ names.each do |name|
64
+ name.keyword_urls(keyword,page).each do |url|
65
+ urls << url
66
+ end
67
+ end
68
+ urls
69
+ end
70
+ end
71
+
72
+ end
73
+
74
+
75
+
76
+
77
+
78
+
@@ -0,0 +1,86 @@
1
# Shared HTTP / persistence helpers used by every searcher.
module Global

  require 'ap' # gem install awesome_print
  require 'json'
  require 'nokogiri'
  require 'forkmanager' # gem install parallel-forkmanager
  require 'beanstalk-client'
  require 'net/http'
  require 'uri'
  Dir[ "./core/*.rb" ].each { |file| require(file) }

  # User-Agent header sent with scraping requests.
  UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'

  # Resolves +url+ one step: returns +url+ itself on a 2xx response,
  # the Location target on a 3xx response, and nil on any other status
  # or on error.
  #
  # redirect_limit is a depth guard for the (currently disabled)
  # recursive follow; with recursion commented out it never decrements.
  #
  # Bug fix: the rescue clause used to return e.message — a truthy
  # String — so network errors were treated as valid URLs by callers
  # that only test for nil (e.g. ChinaSearcher#get_list).
  def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
    raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
    begin
      # Bug fix: URI.decode was removed in Ruby 3.0;
      # URI::DEFAULT_PARSER.unescape is the drop-in replacement.
      response = Net::HTTP.get_response(URI.parse(URI::DEFAULT_PARSER.unescape(url)))

      case response
      when Net::HTTPSuccess then
        url
      when Net::HTTPRedirection then
        # Return the redirect target (same value the original produced
        # implicitly via the assignment expression).
        response['location']
        #html_get_web_url(response['location'],user_agent,timeout,redirect_limit - 1)
      else
        nil
      end
    rescue
      nil
    end
  end

  # Performs a GET for +url+ with the scraping User-Agent and returns
  # the Net::HTTPResponse.
  def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
    uri = URI.parse(url)
    # Bug fix: the original built path + '?' + query.to_s, which
    # appended a stray '?' for URLs without a query string;
    # request_uri handles both cases.
    req = Net::HTTP::Get.new(uri.request_uri)
    req.add_field('User-Agent', user_agent)
    Net::HTTP.start(uri.host, uri.port) do |http|
      http.read_timeout = timeout
      http.request(req)
    end
  end

  # Returns the response body transcoded to UTF-8. The charset is read
  # from the page's <meta> tag; pages without one are presumed GB18030
  # (superset of GB2312/GBK, common for Chinese engines).
  def self.get_whole_html(res,user_agent=UserAgent,timeout=20)
    encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
    encoding = encoding ? encoding[0].upcase : 'GB18030'
    'UTF-8'==encoding ? res.body : res.body.force_encoding('GB2312'==encoding || 'GBK'==encoding ? 'GB18030' : encoding).encode('UTF-8')
  end

  # Follows Location headers until a non-redirecting URL is reached.
  #
  # Bug fix: the recursive call used to pass the ORIGINAL +url+ again
  # instead of the Location target, so any redirect recursed forever.
  def self.get_final_url_from_response(url,user_agent=UserAgent,timeout=20)
    res = get_whole_response(url,user_agent,timeout)
    location = res.header['location']
    location ? get_final_url_from_response(location,user_agent,timeout) : url
  end

  # Appends a discovered +url+ to the per-engine log file.
  def self.save_link_info(url,info_type='baidu',path='/link_infos')
    save_to_file(url,"#{info_type}.txt",path)
    #into DB ... some code ...
  end

  # Appends +content+ to ./<path>/<file_name>, creating the directory
  # on first use.
  def self.save_to_file(content,file_name,path='/link_infos')
    path = ".#{path}/"
    Dir.mkdir(path) if !Dir.exist?(path)

    # Block form guarantees the handle is closed even if puts raises
    # (the original leaked it on error).
    File.open(path + file_name, 'a') do |logfile|
      logfile.puts(content)
    end
  end

end
@@ -0,0 +1,31 @@
1
+ require 'searcher/global'
2
+
3
# Queries the Google Custom Search JSON API and returns an array of
# [url, "google"] pairs, one per result link, across +page+ pages.
#
# keyword - String search term; percent-encoded before use.
# page    - number of API pages to fetch (start=1..page).
def search_from_google(keyword,page=2)
  links = []
  (1..page).each do |pn|
    # NOTE(review): API key and cx hard-coded in source — move to ENV/config.
    # Bug fix: URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape
    # is the drop-in replacement.
    url_with_keyword = 'https://www.googleapis.com/customsearch/v1?key=AIzaSyBvybq0NEaMtMkAkPUd7hhC-17AzcOc9x8&cx=013036536707430787589:_pqjad5hr1a&alt=json&fields=items(link)&q=' + URI::DEFAULT_PARSER.escape(keyword) + '&start=' + pn.to_s
    url = URI.parse(url_with_keyword)
    http = Net::HTTP.new(url.host, url.port)
    http.use_ssl = true
    # NOTE(review): VERIFY_NONE disables certificate checking and allows
    # MITM — prefer OpenSSL::SSL::VERIFY_PEER.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    request = Net::HTTP::Get.new(url.request_uri)
    response = http.request(request)
    parsed = JSON.parse(response.body) rescue nil
    # Bug fix: the original dereferenced links_strings['items']
    # unconditionally, raising NoMethodError when the body was not JSON
    # (rescue nil) or when the 'items' key was absent — the
    # fields=items(link) filter omits it entirely for empty results.
    items = parsed && parsed['items']
    next if items.nil?
    items.each do |link|
      links << [link['link'],"google"]
    end
  end
  links
end
23
+
24
+
25
# Thin alias kept for interface parity with get_list_from_bing:
# search_from_google already returns the [url, "google"] pairs.
def get_list_from_google(keyword,page=2)
  search_from_google(keyword,page)
end
30
+
31
+
data/lib/searcher.rb ADDED
@@ -0,0 +1,55 @@
1
# Facade over all supported search engines. US engines go through the
# API helpers (get_list_from_google / get_list_from_bing); Chinese
# engines are scraped via ChinaSearcher instances.
class Searcher
  UsSearchers = ['google','bing']
  ChinaSearchers = ['baidu','sogou','so360']
  AllSearchers = UsSearchers + ChinaSearchers

  class << self
    # Runs +keywords+ through every engine in +searchers+ and returns
    # the concatenated [url, engine] pairs. Prints the elapsed time.
    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
      infos = []
      start_time = Time.now
      searchers.each do |searcher|
        infos += send("get_info_from_#{searcher}", keywords, page)
      end
      p "after search #{(Time.now - start_time).round(4)}"
      infos
    end

    # Defines get_info_from_google, get_info_from_bing,
    # get_info_from_baidu, get_info_from_sogou, get_info_from_so360.
    AllSearchers.each do |search|
      define_method "get_info_from_#{search}" do |keywords,page=1|
        if UsSearchers.include?(search)
          # API-backed engines use the free-standing helper functions.
          send("get_list_from_#{search}", keywords, page)
        else
          # Scraped engines go through their ChinaSearcher instance.
          searcher = send(search)
          searcher.get_list(keywords,page)
        end
      end
    end

    # Bug fix: the accessors below used plain assignment, so every call
    # rebuilt the ChinaSearcher and the instance variable served no
    # purpose; ||= memoizes one instance per engine.
    def baidu
      @baidu ||= ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
    end

    def sogou
      @sogou ||= ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1','page')
    end

    def so360
      @so360 ||= ChinaSearcher.new('360','http://www.so.com/s?&q=')
    end

    # The Chinese engines as configured instances. Baidu results are
    # mostly redirect wrappers, so including it makes runs considerably
    # slower (each link costs an extra resolving request).
    def china_searchers
      [baidu,sogou,so360]
    end
  end

end
52
+ require 'searcher/china_searcher'
53
+ require 'searcher/bing'
54
+ require 'searcher/google'
55
+
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sofi-searcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Zhimeng Sun
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-09 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Just a simple Searcher
15
+ email: 532681765@qq.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/searcher.rb
21
+ - lib/searcher/bing.rb
22
+ - lib/searcher/china_searcher.rb
23
+ - lib/searcher/global.rb
24
+ - lib/searcher/google.rb
25
+ homepage: http://rubygems.org/gems/sofi-searcher
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.25
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: Just a searcher
49
+ test_files: []