sofi-searcher 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/searcher/bing.rb +41 -0
- data/lib/searcher/china_searcher.rb +78 -0
- data/lib/searcher/global.rb +86 -0
- data/lib/searcher/google.rb +31 -0
- data/lib/searcher.rb +55 -0
- metadata +49 -0
data/lib/searcher/bing.rb
ADDED
@@ -0,0 +1,41 @@
+
+require 'searcher/global'
+def search_from_bing(keyword,page=2)
+  account_key = 'Onm2ZtMfIJsKdKLopx6/VpyADuqrdJPhsacwUuez7Ds='
+  bing_keyword = 'https://api.datamarket.azure.com/Bing/Search/Web?Query=%27' + URI.encode(keyword) + '%27' + '&$skip=0'
+  uri = URI(bing_keyword)
+
+  req = Net::HTTP::Get.new(uri.request_uri)
+  req.basic_auth('', account_key)
+
+  res = Net::HTTP.start(uri.hostname, uri.port, :use_ssl => uri.scheme == 'https') { |http|
+    http.request(req)
+  }
+
+  res.body
+end
+
+def get_list_from_bing(keyword,page=2)
+  content = search_from_bing(keyword,page)
+
+  super_link = Array.new
+  regex = /<d:Url.*?<\/d:Url>/
+  #Global.save_to_file(content,'bing.html','/htmls')
+
+  content.scan(regex).each do |n|
+    regex_http = /http.*?</
+    real_url = n.match(regex_http)
+    real_url = real_url.to_s.delete('<')
+    super_link.push(real_url)
+    #Global.save_link_info(real_url, 'bing')
+    super_link << [real_url,"bing"]
+  end
+  super_link
+end
+
+
+
+
+
+
+
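A minimal usage sketch for the file above, assuming a Ruby of the gem's era (URI.encode was removed in Ruby 3.0) and a valid Azure Datamarket account key; the Datamarket Bing API itself has since been retired, so this illustrates the call shape rather than a live query. Note that the scan loop pushes each result twice, once as a bare URL string and once as a [url, "bing"] pair, so callers receive a mixed-shape array:

require 'searcher'

results = get_list_from_bing('ruby', 2)   # 'ruby' is an illustrative keyword
results.each do |entry|
  if entry.is_a?(Array)
    url, engine = entry                   # the [url, "bing"] pairs
    puts "#{engine}: #{url}"
  else
    puts "bare: #{entry}"                 # the bare strings pushed just before
  end
end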
data/lib/searcher/china_searcher.rb
ADDED
@@ -0,0 +1,78 @@
+class Searcher::ChinaSearcher
+  require 'searcher/global'
+  attr_accessor :name,:url,:page_no,:page_name
+  PAGE_NUM = 2
+
+  def initialize(name,url,page_no='1',page_name='pn')
+
+    @name = name
+    @url = url
+    @page_no = page_no
+    @page_name = page_name
+
+  end
+
+  def search_keywords(keyword,page=PAGE_NUM)
+    res = ''
+    keyword_urls(keyword,page).each do |url|
+      res += Net::HTTP.get_response(URI.parse(url)).body
+    end
+    res
+  end
+
+
+  def keyword_urls (keyword,page=PAGE_NUM)
+    i = 'baidu' == self.name ? 0 : 1
+    sites = []
+    loop do
+      url_with_keyword = self.url + URI.encode(keyword) + '&' + self.page_name + '=' + i.to_s
+      sites << url_with_keyword
+      i += page_no.to_i
+      break if i > (page * self.page_no.to_i)
+    end
+    sites
+  end
+
+
+  def get_list(keyword,page=PAGE_NUM)
+    content = search_keywords(keyword,page)
+    super_link = Array.new
+    regex = /<a.*?href.*?<\/a>/
+
+    #Global.save_to_file(content,self.name + '.html','/htmls')
+
+    content.scan(regex).each do |n|
+      if n.index('<em>') != nil
+        url = /"http.*?"/.match(n)
+        if url != nil
+          string_url = url.to_s.delete('"')
+          redirect_url = Global.html_get_web_url(string_url)
+          if redirect_url != nil
+            super_link << [redirect_url,self.name]
+            #Global.save_link_info(redirect_url,self.name)
+          end
+        end
+      end
+    end
+    super_link
+  end
+
+  class << self
+    def keyword_urls(names,keyword,page=PAGE_NUM)
+      urls = []
+      names.each do |name|
+        name.keyword_urls(keyword,page).each do |url|
+          urls << url
+        end
+      end
+      urls
+    end
+  end
+
+end
+
+
+
+
+
+
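keyword_urls needs no network access, so its pagination behavior is easy to check in isolation (again on a pre-3.0 Ruby, since it calls URI.encode). Because baidu starts its counter at 0 and steps by its page_no of 10, the default two pages actually yield three URLs (pn=0, 10, 20), while sogou steps by 1 and yields exactly two. A sketch with an illustrative keyword, using the same constructor arguments as Searcher.baidu and Searcher.sogou below:

require 'searcher'

baidu = Searcher::ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=', '10')
sogou = Searcher::ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1', 'page')

p baidu.keyword_urls('ruby')
# => ["http://www.baidu.com/s?wd=ruby&pn=0",
#     "http://www.baidu.com/s?wd=ruby&pn=10",
#     "http://www.baidu.com/s?wd=ruby&pn=20"]
p sogou.keyword_urls('ruby')
# => ["http://www.sogou.com/web?query=ruby&page=1",
#     "http://www.sogou.com/web?query=ruby&page=2"]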
data/lib/searcher/global.rb
ADDED
@@ -0,0 +1,86 @@
+module Global
+
+
+  require 'ap' # gem install awesome_print
+  require 'json'
+  require 'nokogiri'
+  require 'forkmanager' # gem install parallel-forkmanager
+  require 'beanstalk-client'
+  require 'net/http'
+  require 'uri'
+  Dir[ "./core/*.rb" ].each { |file| require(file) }
+
+  UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
+
+  def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
+    # get_final_url_from_response().
+
+    raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
+    begin
+      response = Net::HTTP.get_response(URI.parse(URI.decode(url)))
+
+      case response
+      when Net::HTTPSuccess then
+        url
+      when Net::HTTPRedirection then
+        location = response['location']
+        #warn "redirected to #{location}"
+        #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
+      else
+        nil
+      end
+    rescue => e
+      e.message
+    end
+  end
+
+
+  def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
+    uri = URI.parse(url)
+    req = Net::HTTP::Get.new(uri.path + '?' + uri.query.to_s)
+    req.add_field('User-Agent', user_agent)
+    res = Net::HTTP.start(uri.host, uri.port) do |http|
+      http.read_timeout = timeout
+      http.request(req)
+    end
+  end
+
+
+  def self.get_whole_html(res,user_agent=UserAgent,timeout=20)
+
+    encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
+    encoding = encoding ? encoding[0].upcase : 'GB18030'
+    html = 'UTF-8'==encoding ? res.body : res.body.force_encoding('GB2312'==encoding || 'GBK'==encoding ? 'GB18030' : encoding).encode('UTF-8')
+
+  end
+
+
+
+  def self.get_final_url_from_response(url,user_agent=UserAgent,timeout=20)
+    res = get_whole_response(url,user_agent,timeout)
+    res.header['location'] ? get_final_url_from_response(url,user_agent,timeout) : url
+  end
+
+
+
+  def self.save_link_info(url,info_type='baidu',path='/link_infos')
+    save_to_file(url,"#{info_type}.txt",path)
+    #into DB ... some code ...
+  end
+
+
+  def self.save_to_file(content,file_name,path='/link_infos')
+    path = ".#{path}/"
+    Dir.mkdir(path) if !Dir.exist?(path)
+
+    logfile = File.open(path + file_name, 'a')
+    logfile.puts(content)
+    logfile.close
+  end
+
+
+
+
+
+
+end
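Two behaviors of Global are worth noting as written: html_get_web_url returns the original url on a 2xx response, returns the raw Location header value on a redirect (the recursive follow is commented out, so redirect_limit never decrements), and returns the exception message string rather than nil when the request raises. The file helpers are simpler; a short sketch, with the file contents purely illustrative:

require 'searcher'

# Appends the URL as one line to ./link_infos/baidu.txt, creating the
# ./link_infos directory on first use.
Global.save_link_info('http://example.com', 'baidu')

# The generic writer it delegates to; '/htmls' mirrors the commented-out
# Global.save_to_file calls in the searcher files.
Global.save_to_file('<html>...</html>', 'example.html', '/htmls')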
data/lib/searcher/google.rb
ADDED
@@ -0,0 +1,31 @@
+require 'searcher/global'
+
+def search_from_google(keyword,page=2)
+  res,links = '',[]
+  (1..page).each do |pn|
+    url_with_keyword = 'https://www.googleapis.com/customsearch/v1?key=AIzaSyBvybq0NEaMtMkAkPUd7hhC-17AzcOc9x8&cx=013036536707430787589:_pqjad5hr1a&alt=json&fields=items(link)&q=' + URI.encode(keyword) + '&start=' + pn.to_s
+    url = URI.parse(url_with_keyword)
+    http = Net::HTTP.new(url.host, url.port)
+    http.use_ssl = true
+    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    request = Net::HTTP::Get.new(url.request_uri)
+    response = http.request(request)
+    res += response.body
+    links_strings = JSON.parse(response.body) rescue nil
+    links_strings['items'].each do |link|
+      links << [link['link'],"google"]
+      #Global.save_link_info(link['link'], 'google')
+    end
+    #links
+  end
+  links
+end
+
+
+def get_list_from_google(keyword,page=2)
+  #content = search_from_google(keyword,page)
+  #Global.save_to_file(content,'google.html','/htmls')
+  search_from_google(keyword,page)
+end
+
+
data/lib/searcher.rb
ADDED
@@ -0,0 +1,55 @@
+class Searcher
+  UsSearchers = ['google','bing']
+  ChinaSearchers = ['baidu','sogou','so360']
+  AllSearchers = UsSearchers + ChinaSearchers
+  class << self
+    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
+      infos = []
+      start_time = Time.now
+      searchers.each do |searcher|
+        infos += send "get_info_from_" + searcher,keywords,page
+      end
+      p "after search #{(Time.now - start_time).round(4)}"
+      infos
+    end
+
+    AllSearchers.each do |search|
+      define_method "get_info_from_#{search}" do |keywords,page=1|
+        if UsSearchers.include?(search)
+          send "get_list_from_" + search,keywords,page
+        else
+          searcher = send(search)
+          return searcher.get_list(keywords,page)
+        end
+      end
+    end
+
+    def baidu
+      @baidu = ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
+    end
+
+    def sogou
+      @sogou = ChinaSearcher.new('sogou', 'http://www.sogou.com/web?query=', '1','page')
+    end
+
+    def so360
+      @so360 = ChinaSearcher.new('360','http://www.so.com/s?&q=')
+    end
+
+    def china_searchers
+      [baidu,sogou,so360] # baidu results contain many redirect urls, so including it makes each run take longer
+      #[sogou,so360]
+    end
+
+    #Searcher.china_searchers.each do |searcher|
+    #define_method "get_info_from_#{searcher.name}" do |keywords,page=1|
+    #  searcher.get_list(keywords,page)
+    #end
+    #end
+  end
+
+end
+require 'searcher/china_searcher'
+require 'searcher/bing'
+require 'searcher/google'
+
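The class above is the gem's entry point: get_infos_from_searches dispatches to a generated get_info_from_* method per engine name. A sketch of the intended call, assuming network access, working API keys, and a pre-3.0 Ruby; the third argument defaults to AllSearchers:

require 'searcher'

# Query only the China engines; each entry comes back as [url, engine_name].
infos = Searcher.get_infos_from_searches('ruby', 1, Searcher::ChinaSearchers)
infos.each { |url, engine| puts "#{engine}: #{url}" }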
metadata
ADDED
@@ -0,0 +1,49 @@
+--- !ruby/object:Gem::Specification
+name: sofi-searcher
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+prerelease:
+platform: ruby
+authors:
+- Zhimeng Sun
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-04-09 00:00:00.000000000 Z
+dependencies: []
+description: Just a simple Searcher
+email: 532681765@qq.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/searcher.rb
+- lib/searcher/bing.rb
+- lib/searcher/china_searcher.rb
+- lib/searcher/global.rb
+- lib/searcher/google.rb
+homepage: http://rubygems.org/gems/sofi-searcher
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: Just a searcher
+test_files: []