sofi-searcher 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/searcher.rb +2 -2
- data/lib/searcher/spider.rb +33 -25
- metadata +1 -1
data/lib/searcher.rb
CHANGED
@@ -5,7 +5,7 @@ class Searcher
|
|
5
5
|
ChinaSearchers = %w(baidu sogou so360)
|
6
6
|
AllSearchers = UsSearchers + ChinaSearchers
|
7
7
|
class << self
|
8
|
-
def
|
8
|
+
def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
|
9
9
|
infos = []
|
10
10
|
start_time = Time.now
|
11
11
|
searchers.each do |searcher|
|
@@ -26,7 +26,7 @@ class Searcher
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def get_infos_from_url(url,selector='title')
|
30
30
|
crawler.fetch(url,selector)
|
31
31
|
end
|
32
32
|
|
data/lib/searcher/spider.rb
CHANGED
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
|
|
16
16
|
attr_accessor :user_agent, :redirect_limit, :timeout
|
17
17
|
|
18
18
|
def fetch(website,selector='')
|
19
|
-
|
19
|
+
p "Pid:#{Process.pid}, fetch: #{website}\n"
|
20
20
|
res = Global.get_whole_response(website,@user_agent,@timeout)
|
21
21
|
html = Global.get_whole_html(res,@user_agent,@timeout)
|
22
22
|
doc = Nokogiri::HTML(html)
|
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
|
|
31
31
|
@websites = websites # the url we ready to crawl
|
32
32
|
@beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
|
33
33
|
@pm_max = pm_max # max process number
|
34
|
-
@user_agent = user_agent
|
34
|
+
@user_agent = user_agent
|
35
35
|
@redirect_limit = redirect_limit
|
36
|
-
|
37
|
-
@ipc_reader, @ipc_writer = IO.pipe # 缓存结果的 ipc 管道
|
36
|
+
@ipc_reader, @ipc_writer = IO.pipe
|
38
37
|
end
|
39
38
|
|
40
39
|
|
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
|
|
48
47
|
job.delete
|
49
48
|
end
|
50
49
|
rescue Beanstalk::TimedOut
|
51
|
-
print "Beanstalk queues cleared"
|
50
|
+
print "Beanstalk queues cleared!\n"
|
52
51
|
end
|
53
52
|
@websites.size.times{|i| beanstalk.put(i)} # 将所有的任务压栈
|
54
53
|
beanstalk.close
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
rescue => e
|
55
|
+
puts e
|
56
|
+
exit
|
58
57
|
end
|
59
58
|
|
60
59
|
|
61
|
-
def process_jobs
|
62
|
-
|
60
|
+
def process_jobs
|
61
|
+
|
63
62
|
pm = Parallel::ForkManager.new(@pm_max)
|
63
|
+
|
64
|
+
#pm.run_on_start do |pid,ident|
|
65
|
+
# print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
|
66
|
+
#end
|
67
|
+
#
|
68
|
+
#pm.run_on_finish {
|
69
|
+
# |pid,exit_code,ident|
|
70
|
+
# print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
|
71
|
+
#}
|
72
|
+
|
64
73
|
@pm_max.times do |i|
|
65
|
-
|
74
|
+
|
75
|
+
pm.start(i) and next
|
66
76
|
beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
|
67
|
-
@ipc_reader.close
|
68
|
-
|
77
|
+
@ipc_reader.close
|
78
|
+
|
79
|
+
loop do
|
69
80
|
begin
|
70
|
-
job = beanstalk.reserve(0.1) #
|
81
|
+
job = beanstalk.reserve(0.1) # timeout 0.1s
|
71
82
|
index = job.body
|
72
83
|
job.delete
|
73
84
|
website = @websites[index.to_i]
|
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
|
|
76
87
|
rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
|
77
88
|
break
|
78
89
|
end
|
79
|
-
|
80
|
-
|
81
|
-
pm.finish(0)
|
90
|
+
end
|
91
|
+
pm.finish(i)
|
82
92
|
end
|
93
|
+
|
83
94
|
@ipc_writer.close
|
95
|
+
|
84
96
|
begin
|
85
|
-
pm.wait_all_children
|
97
|
+
pm.wait_all_children
|
86
98
|
rescue SystemExit, Interrupt
|
87
|
-
print "Interrupt wait all children"
|
88
|
-
ensure
|
89
|
-
results = read_results
|
90
|
-
#ap results, :indent => -4 , :index=>false # 打印处理结果
|
91
|
-
#print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
|
99
|
+
print "Interrupt wait all children!\n"
|
92
100
|
end
|
93
|
-
end
|
94
101
|
|
102
|
+
end
|
95
103
|
|
96
104
|
def read_results
|
97
105
|
results = []
|
98
106
|
while result = @ipc_reader.gets
|
99
107
|
results << result
|
100
108
|
end
|
101
|
-
@ipc_reader.close
|
102
109
|
results
|
103
110
|
end
|
104
111
|
|
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
|
|
106
113
|
def run
|
107
114
|
init_beanstalk_jobs
|
108
115
|
process_jobs
|
116
|
+
read_results
|
109
117
|
end
|
110
118
|
end
|