sofi-searcher 0.1.1 → 0.1.2
- data/lib/core/nil.rb +9 -0
- data/lib/searcher/bing.rb +1 -1
- data/lib/searcher/china_searcher.rb +7 -6
- data/lib/searcher/global.rb +5 -7
- data/lib/searcher/spider.rb +110 -0
- data/lib/searcher.rb +28 -21
- metadata +4 -2
data/lib/core/nil.rb
ADDED
data/lib/searcher/bing.rb
CHANGED
data/lib/searcher/china_searcher.rb
CHANGED
@@ -22,13 +22,13 @@ class Searcher::ChinaSearcher
 
 
   def keyword_urls (keyword,page=PAGE_NUM)
-    i = 'baidu' ==
+    i = 'baidu' == @name ? 0 : 1
     sites = []
     loop do
-      url_with_keyword =
+      url_with_keyword = @url + URI.encode(keyword) + '&' + @page_name + '=' + i.to_s
      sites << url_with_keyword
      i += page_no.to_i
-      break if i > (page *
+      break if i > (page * @page_no.to_i)
     end
     sites
   end
@@ -39,7 +39,7 @@ class Searcher::ChinaSearcher
     super_link = Array.new
     regex = /<a.*?href.*?<\/a>/
 
-    #Global.save_to_file(content
+    #Global.save_to_file(content,@name + '.html','/htmls')
 
     content.scan(regex).each do |n|
       if n.index('<em>') != nil
@@ -48,8 +48,8 @@ class Searcher::ChinaSearcher
         string_url = url.to_s.delete('"')
         redirect_url = Global.html_get_web_url(string_url)
         if redirect_url != nil
-          super_link << [redirect_url
-          #Global.save_link_info(redirect_url
+          super_link << [redirect_url,@name]
+          #Global.save_link_info(redirect_url,@name)
         end
       end
     end
@@ -67,6 +67,7 @@ class Searcher::ChinaSearcher
     end
     urls
   end
+
 end
 
 end
data/lib/searcher/global.rb
CHANGED
@@ -1,19 +1,19 @@
 module Global
 
 
-  require 'ap' # gem install awesome_print
+  #require 'ap' # gem install awesome_print
   require 'json'
   require 'nokogiri'
   require 'forkmanager' # gem install parallel-forkmanager
   require 'beanstalk-client'
   require 'net/http'
   require 'uri'
-
+
 
   UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
+  Beanstalk_jobs = [['localhost:11300'],'crawler-jobs']
 
   def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
-    # get_final_url_from_response().
 
     raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
     begin
@@ -23,9 +23,7 @@ module Global
       when Net::HTTPSuccess then
         url
       when Net::HTTPRedirection then
-
-        #warn "redirected to #{location}"
-        #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
+        response['location']
       else
         nil
       end
@@ -37,7 +35,7 @@ module Global
 
   def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
     uri = URI.parse(url)
-    req = Net::HTTP::Get.new(uri.path + '?' + uri.query.to_s)
+    req = Net::HTTP::Get.new(uri.path.to_s + '?' + uri.query.to_s)
     req.add_field('User-Agent', user_agent)
     res = Net::HTTP.start(uri.host, uri.port) do |http|
       http.read_timeout = timeout
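
The html_get_web_url change above stops recursing on redirects and simply returns the Location header, while get_whole_response now guards against a nil path. A minimal standalone sketch of the combined behaviour, using plain net/http; the method name resolve_url is chosen here for illustration and is not part of the gem.

    require 'net/http'
    require 'uri'

    USER_AGENT = 'Mozilla/5.0 (compatible; example)'

    # Returns the URL itself on 2xx, the Location header on a redirect,
    # and nil on anything else or on error, mirroring the patched logic.
    def resolve_url(url, user_agent = USER_AGENT, timeout = 20)
      uri = URI.parse(url)
      req = Net::HTTP::Get.new(uri.path.to_s + '?' + uri.query.to_s)
      req.add_field('User-Agent', user_agent)
      res = Net::HTTP.start(uri.host, uri.port) do |http|
        http.read_timeout = timeout
        http.request(req)
      end
      case res
      when Net::HTTPSuccess     then url
      when Net::HTTPRedirection then res['location']
      end
    rescue StandardError
      nil
    end

    puts resolve_url('http://rubygems.org/gems/sofi-searcher')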
data/lib/searcher/spider.rb
ADDED
@@ -0,0 +1,110 @@
+#!/usr/bin/env ruby
+#encoding: UTF-8
+require 'searcher/global'
+require 'core/nil'
+
+class Searcher::MultipleCrawler
+
+  class Crawler
+
+    def initialize(user_agent=Global::UserAgent, redirect_limit=1)
+      @user_agent = user_agent
+      @redirect_limit = redirect_limit
+      @timeout = 20
+    end
+
+    attr_accessor :user_agent, :redirect_limit, :timeout
+
+    def fetch(website,selector='')
+
+      res = Global.get_whole_response(website,@user_agent,@timeout)
+      html = Global.get_whole_html(res,@user_agent,@timeout)
+      doc = Nokogiri::HTML(html)
+      #doc.css(selector) if selector != ''
+    end
+
+  end
+
+
+
+  def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
+    @websites = websites # the urls we are going to crawl
+    @beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
+    @pm_max = pm_max # max process number
+    @user_agent = user_agent # user_agent used to pose as a browser
+    @redirect_limit = redirect_limit
+
+    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe used to buffer results
+  end
+
+
+  attr_accessor :user_agent, :redirect_limit
+
+  def init_beanstalk_jobs
+    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+    # clear any leftover messages in the beanstalk queue
+    begin
+      while job = beanstalk.reserve(0.1)
+        job.delete
+      end
+    rescue Beanstalk::TimedOut
+      print "Beanstalk queues cleared!\n"
+    end
+    @websites.size.times{|i| beanstalk.put(i)} # push every job onto the queue
+    beanstalk.close
+  rescue => e
+    puts e
+    exit
+  end
+
+
+  def process_jobs # work through the queued jobs
+    start_time = Time.now
+    pm = Parallel::ForkManager.new(@pm_max)
+    @pm_max.times do |i|
+      pm.start(i) and next # call next right after forking so the parent does not wait; this is what makes the workers run in parallel
+      beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+      @ipc_reader.close # close the read end; child processes only write results back
+      loop{
+        begin
+          job = beanstalk.reserve(0.1) # 0.1s reserve timeout, since every job was queued in advance
+          index = job.body
+          job.delete
+          website = @websites[index.to_i]
+          result = Crawler.new.fetch(website)
+          @ipc_writer.puts(result)
+        rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
+          break
+        end
+      }
+      @ipc_writer.close
+      pm.finish(0)
+    end
+    @ipc_writer.close
+    begin
+      pm.wait_all_children # wait until all child processes have finished
+    rescue SystemExit, Interrupt
+      print "Interrupt wait all children!\n"
+    ensure
+      results = read_results
+      #ap results, :indent => -4 , :index=>false # print the results
+      #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+    end
+  end
+
+
+  def read_results
+    results = []
+    while result = @ipc_reader.gets
+      results << result
+    end
+    @ipc_reader.close
+    results
+  end
+
+
+  def run
+    init_beanstalk_jobs
+    process_jobs
+  end
+end
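
Taken together, the new Searcher::MultipleCrawler pushes one beanstalk job per URL, forks up to pm_max workers that each reserve jobs and fetch pages through Crawler#fetch, and streams results back to the parent over an IO.pipe. A hypothetical driver script follows; it assumes a local beanstalkd listening on 11300 (as implied by Global::Beanstalk_jobs) plus the beanstalk-client and parallel-forkmanager gems, and it is not shipped with the gem.

    require 'searcher'

    websites = [
      'http://rubygems.org/gems/sofi-searcher',
      'http://www.ruby-lang.org/en/'
    ]

    # Two worker processes; beanstalk settings come from Global::Beanstalk_jobs.
    crawler = Searcher::MultipleCrawler.new(websites, Global::Beanstalk_jobs, 2)
    crawler.run   # queue the URLs, fork the workers, wait for the children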
data/lib/searcher.rb
CHANGED
@@ -1,29 +1,39 @@
+require 'core/nil'
+
 class Searcher
-  UsSearchers =
-  ChinaSearchers =
+  UsSearchers = %w(google bing)
+  ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
-  class << self
-
+  class << self
+    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
       infos = []
       start_time = Time.now
       searchers.each do |searcher|
-        infos += send
+        infos += send 'get_info_from_' + searcher,keywords,page
       end
-
+      infos << (Time.now - start_time).round(4)
       infos
-
-
-
+    end
+
+    AllSearchers.each do |search|
       define_method "get_info_from_#{search}" do |keywords,page=1|
         if UsSearchers.include?(search)
-          send
+          send 'get_list_from_' + search,keywords,page
         else
           searcher = send(search)
-
+          searcher.get_list(keywords,page)
         end
       end
-
-
+    end
+
+    def get_infos_from(url,selector='title')
+      crawler.fetch(url,selector)
+    end
+
+    def crawler
+      @crawler = MultipleCrawler::Crawler.new
+    end
+
     def baidu
       @baidu = ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
     end
@@ -33,23 +43,20 @@ class << self
     end
 
     def so360
-      @so360 = ChinaSearcher.new('
+      @so360 = ChinaSearcher.new('so360','http://www.so.com/s?&q=')
     end
 
     def china_searchers
-      [baidu,sogou,so360]
+      [baidu,sogou,so360]
       #[sogou,so360]
     end
 
-
-
-    # searcher.get_list(keywords,page)
-    #end
-    #end
-  end
+
+  end
 
 end
 require 'searcher/china_searcher'
+require 'searcher/spider'
 require 'searcher/bing'
 require 'searcher/google'
 
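
With these changes the class-level API can be exercised roughly as in the sketch below; the keyword and URL are only examples, and the calls hit the live search engines, so results will vary.

    require 'searcher'

    # Fan out over one engine; the elapsed query time is appended as the
    # last element of the returned array.
    infos   = Searcher.get_infos_from_searches('ruby', 1, %w(baidu))
    elapsed = infos.last
    links   = infos[0..-2]

    # Fetch a single page through the new crawler helper; Crawler#fetch
    # returns the parsed Nokogiri document.
    doc = Searcher.get_infos_from('http://rubygems.org/gems/sofi-searcher')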
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sofi-searcher
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.2
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-
+date: 2013-04-12 00:00:00.000000000 Z
 dependencies: []
 description: Just a simple Searcher
 email: 532681765@qq.com
@@ -18,10 +18,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/searcher.rb
+- lib/core/nil.rb
 - lib/searcher/bing.rb
 - lib/searcher/china_searcher.rb
 - lib/searcher/global.rb
 - lib/searcher/google.rb
+- lib/searcher/spider.rb
 homepage: http://rubygems.org/gems/sofi-searcher
 licenses: []
 post_install_message: