sofi-searcher 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/searcher.rb +2 -2
- data/lib/searcher/spider.rb +33 -25
- metadata +1 -1
data/lib/searcher.rb
CHANGED
@@ -5,7 +5,7 @@ class Searcher
   ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
   class << self
-    def
+    def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
       infos = []
       start_time = Time.now
       searchers.each do |searcher|
@@ -26,7 +26,7 @@ class Searcher
       end
     end
 
-    def
+    def get_infos_from_url(url,selector='title')
       crawler.fetch(url,selector)
     end
 
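Both removed `def` lines are truncated in the registry's diff view, so only the new signatures are fully visible: `get_links_from_searches` now defaults to `AllSearchers`, and `get_infos_from_url` defaults to the `title` selector. A minimal usage sketch of the 0.1.4 class-level API, assuming the gem is loaded with `require 'searcher'` and using an illustrative keyword and URL:

```ruby
require 'searcher'

# Query every engine (UsSearchers + ChinaSearchers) for page 1 of results.
links = Searcher.get_links_from_searches('ruby crawler')

# Query only the Chinese engines, fetching page 2.
links = Searcher.get_links_from_searches('ruby crawler', 2, Searcher::ChinaSearchers)

# Pull the <title> (the default selector) out of a single page.
info = Searcher.get_infos_from_url('http://example.com')
```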
data/lib/searcher/spider.rb
CHANGED
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
   attr_accessor :user_agent, :redirect_limit, :timeout
 
   def fetch(website,selector='')
-
+    p "Pid:#{Process.pid}, fetch: #{website}\n"
     res = Global.get_whole_response(website,@user_agent,@timeout)
     html = Global.get_whole_html(res,@user_agent,@timeout)
     doc = Nokogiri::HTML(html)
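`fetch` now logs its PID and target URL before downloading; the HTML still passes through `Global.get_whole_response`/`Global.get_whole_html` (gem helpers the diff does not show) and into Nokogiri. A standalone sketch of that final parsing step, using `net/http` in place of the gem's helpers:

```ruby
require 'net/http'
require 'nokogiri'

html = Net::HTTP.get(URI('http://example.com'))
doc  = Nokogiri::HTML(html)

# A CSS selector such as 'title' (fetch's default elsewhere in the gem)
# selects matching nodes from the parsed document.
doc.css('title').each { |node| puts node.text }
```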
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
     @websites = websites # the url we ready to crawl
     @beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
     @pm_max = pm_max # max process number
-    @user_agent = user_agent
+    @user_agent = user_agent
     @redirect_limit = redirect_limit
-
-    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe for buffering results
+    @ipc_reader, @ipc_writer = IO.pipe
   end
 
 
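The initializer keeps the `IO.pipe` pair (its removed comment, translated from Chinese, read "IPC pipe for buffering results") that lets forked workers stream results back to the parent process. A minimal sketch of that pattern on its own:

```ruby
reader, writer = IO.pipe

pid = fork do
  reader.close               # the child only writes
  writer.puts 'result from child'
  writer.close
end

writer.close                 # the parent only reads
Process.wait(pid)
puts reader.gets             #=> "result from child"
reader.close
```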
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
         job.delete
       end
     rescue Beanstalk::TimedOut
-      print "Beanstalk queues cleared
+      print "Beanstalk queues cleared!\n"
     end
     @websites.size.times{|i| beanstalk.put(i)} # push all the tasks onto the queue
     beanstalk.close
-
-
-
+  rescue => e
+    puts e
+    exit
   end
 
 
-  def process_jobs
-
+  def process_jobs
+
     pm = Parallel::ForkManager.new(@pm_max)
+
+    #pm.run_on_start do |pid,ident|
+    #  print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
+    #end
+    #
+    #pm.run_on_finish {
+    #  |pid,exit_code,ident|
+    #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
+    #}
+
     @pm_max.times do |i|
-
+
+      pm.start(i) and next
       beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
-      @ipc_reader.close
-
+      @ipc_reader.close
+
+      loop do
         begin
-          job = beanstalk.reserve(0.1) #
+          job = beanstalk.reserve(0.1) # timeout 0.1s
           index = job.body
           job.delete
           website = @websites[index.to_i]
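0.1.4 also wraps the reserve/crawl cycle in a `loop do ... end` (closed in the next hunk), so each worker keeps taking jobs until Beanstalk times out instead of handling a single one. The surrounding skeleton is the `Parallel::ForkManager` fork-pool idiom; a stripped-down sketch, assuming the `parallel-forkmanager` gem, which mirrors Perl's Parallel::ForkManager (`start` returns a truthy PID in the parent and falsy in the child; `finish` exits the child):

```ruby
require 'parallel/forkmanager'   # assumed require path for the parallel-forkmanager gem

pm = Parallel::ForkManager.new(4)

4.times do |i|
  pm.start(i) and next           # parent: truthy PID, move on to the next worker; child: fall through
  # ... child work goes here (reserve jobs, crawl, write results to the pipe) ...
  pm.finish(i)                   # child exits; i becomes its exit code, as in the diff
end

pm.wait_all_children             # parent blocks until every worker has exited
```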
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
         rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
           break
         end
-
-
-      pm.finish(0)
+      end
+      pm.finish(i)
     end
+
     @ipc_writer.close
+
     begin
-      pm.wait_all_children
+      pm.wait_all_children
     rescue SystemExit, Interrupt
-      print "Interrupt wait all children
-    ensure
-      results = read_results
-      #ap results, :indent => -4 , :index=>false # print the processing results
-      #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+      print "Interrupt wait all children!\n"
     end
-  end
 
+  end
 
   def read_results
     results = []
     while result = @ipc_reader.gets
       results << result
     end
-    @ipc_reader.close
     results
   end
 
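`read_results` no longer closes `@ipc_reader` itself, and the old `ensure` block that consumed the results inside `process_jobs` is gone. The `while result = @ipc_reader.gets` loop ends naturally at EOF, which only arrives once every writer end of the pipe is closed (the parent closes `@ipc_writer` after forking; each child closes `@ipc_reader`). A tiny sketch of that EOF behavior:

```ruby
reader, writer = IO.pipe
writer.puts 'a'
writer.puts 'b'
writer.close                 # without this, reader.gets would block forever

results = []
while line = reader.gets     # gets returns nil at EOF, ending the loop
  results << line.chomp
end
results                      #=> ["a", "b"]
```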
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
   def run
     init_beanstalk_jobs
     process_jobs
+    read_results
   end
 end
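With `read_results` appended to `run`, the method now returns the collected results instead of discarding them inside `process_jobs`. A hedged driving sketch: the parameter order is inferred from the assignment order in the `@@ -31,10 +31,9 @@` hunk (the initializer itself is not shown, and `user_agent`/`redirect_limit` are assumed to have defaults), and the Beanstalk address is illustrative:

```ruby
crawler = Searcher::MultipleCrawler.new(
  ['http://example.com', 'http://example.org'], # websites to crawl
  [['localhost:11300']],                        # beanstalk_jobs, splatted into Beanstalk::Pool.new
  2                                             # pm_max: number of worker processes
)
results = crawler.run   # in 0.1.4, run ends with read_results, so the results array comes back
```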