sofi-searcher 0.1.3 → 0.1.4

Files changed (3)
  1. data/lib/searcher.rb +2 -2
  2. data/lib/searcher/spider.rb +33 -25
  3. metadata +1 -1
data/lib/searcher.rb CHANGED
@@ -5,7 +5,7 @@ class Searcher
   ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
   class << self
-    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
+    def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
       infos = []
       start_time = Time.now
       searchers.each do |searcher|
@@ -26,7 +26,7 @@ class Searcher
       end
     end

-    def get_infos_from(url,selector='title')
+    def get_infos_from_url(url,selector='title')
       crawler.fetch(url,selector)
     end

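Both changes in searcher.rb are public-API renames: get_infos_from_searches becomes get_links_from_searches, and get_infos_from becomes get_infos_from_url. A minimal calling sketch for the new names (the keyword, page, and searcher values are illustrative, not taken from the gem's docs):

    require 'searcher'

    # search two engines for a keyword, first result page
    links = Searcher.get_links_from_searches('ruby', 1, %w(baidu sogou))

    # fetch one URL and extract its <title>
    title = Searcher.get_infos_from_url('http://example.com', 'title')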
data/lib/searcher/spider.rb CHANGED
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
   attr_accessor :user_agent, :redirect_limit, :timeout

   def fetch(website,selector='')
-
+    p "Pid:#{Process.pid}, fetch: #{website}\n"
     res = Global.get_whole_response(website,@user_agent,@timeout)
     html = Global.get_whole_html(res,@user_agent,@timeout)
     doc = Nokogiri::HTML(html)
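fetch hands the HTTP work to the gem's own Global helper, then parses the HTML with Nokogiri and applies the caller's selector. The same flow using only the standard library in place of Global (a sketch; the URL, User-Agent string, and selector are illustrative):

    require 'nokogiri'
    require 'open-uri'

    # fetch the page, presenting a browser-like User-Agent
    html = URI.open('http://example.com', 'User-Agent' => 'Mozilla/5.0', &:read)

    # parse and extract whatever the CSS selector matches
    doc = Nokogiri::HTML(html)
    puts doc.css('title').map(&:text)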
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
     @websites = websites # the URLs we are ready to crawl
     @beanstalk_jobs = beanstalk_jobs # beanstalk host, port, and so on
     @pm_max = pm_max # max number of worker processes
-    @user_agent = user_agent # user_agent masquerades as a browser
+    @user_agent = user_agent
     @redirect_limit = redirect_limit
-
-    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe that buffers the results
+    @ipc_reader, @ipc_writer = IO.pipe
   end


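The IO.pipe created here is the channel the forked crawler processes use to hand results back to the parent. Reduced to a standalone sketch, the pattern is:

    reader, writer = IO.pipe

    pid = fork do
      reader.close              # the child only writes
      writer.puts 'result from child'
      writer.close
    end

    writer.close                # the parent only reads
    Process.wait(pid)
    puts reader.gets            # => "result from child"
    reader.close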
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
         job.delete
       end
     rescue Beanstalk::TimedOut
-      print "Beanstalk queues cleared!(Beanstalk 队列已清除)\n"
+      print "Beanstalk queues cleared!\n"
     end
     @websites.size.times{|i| beanstalk.put(i)} # push all the jobs onto the queue
     beanstalk.close
-  rescue => e
-    puts e
-    exit
+  rescue => e
+    puts e
+    exit
   end


-  def process_jobs # process the jobs
-    start_time = Time.now
+  def process_jobs
+
     pm = Parallel::ForkManager.new(@pm_max)
+
+    #pm.run_on_start do |pid,ident|
+    #  print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
+    #end
+    #
+    #pm.run_on_finish {
+    #  |pid,exit_code,ident|
+    #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
+    #}
+
     @pm_max.times do |i|
-      pm.start(i) and next # next immediately after start; don't wait for the child, so the workers run in parallel
+
+      pm.start(i) and next
       beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
-      @ipc_reader.close # close the read end; child processes only write data back
-      loop{
+      @ipc_reader.close
+
+      loop do
         begin
-          job = beanstalk.reserve(0.1) # 0.1s timeout, since the jobs were all queued up front
+          job = beanstalk.reserve(0.1) # timeout 0.1s
          index = job.body
          job.delete
          website = @websites[index.to_i]
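init_beanstalk_jobs first drains any stale jobs, then puts one job index per URL on the queue; each forked worker reserves indexes with a short timeout until the queue runs dry. The queue round-trip in isolation (a sketch against the beanstalk-client gem, assuming a local beanstalkd on the default port 11300):

    require 'beanstalk-client'

    pool = Beanstalk::Pool.new(['localhost:11300'])
    3.times { |i| pool.put(i.to_s) }  # enqueue job indexes

    loop do
      begin
        job = pool.reserve(0.1)       # short timeout: all jobs were queued up front
        puts "got job #{job.body}"
        job.delete
      rescue Beanstalk::TimedOut
        break                         # queue drained
      end
    end
    pool.close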
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
           rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
             break
           end
-        }
-        @ipc_writer.close
-        pm.finish(0)
+      end
+      pm.finish(i)
     end
+
     @ipc_writer.close
+
     begin
-      pm.wait_all_children # wait until all child processes have finished
+      pm.wait_all_children
     rescue SystemExit, Interrupt
-      print "Interrupt wait all children! (等待所有子进程处理完毕)\n"
-    ensure
-      results = read_results
-      #ap results, :indent => -4 , :index=>false # print the results
-      #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+      print "Interrupt wait all children!\n"
     end
-  end

+  end

   def read_results
     results = []
     while result = @ipc_reader.gets
       results << result
     end
-    @ipc_reader.close
     results
   end

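process_jobs is built around Parallel::ForkManager's start/finish/wait_all_children lifecycle, with the pipe carrying results out of the children. The bare skeleton of that lifecycle (a sketch against the parallel-forkmanager gem; the require path can differ between gem versions, and the worker body is elided):

    require 'parallel/forkmanager'

    pm = Parallel::ForkManager.new(4)

    4.times do |i|
      pm.start(i) and next  # the parent continues the loop; only the child runs on
      # ... worker i: reserve jobs, crawl, write results to the pipe ...
      pm.finish(0)          # the child exits and rejoins the pool
    end

    pm.wait_all_children    # the parent blocks until every worker exits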
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
   def run
     init_beanstalk_jobs
     process_jobs
+    read_results
   end
 end
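With read_results now the final expression in run, the crawled data becomes run's return value instead of being dropped. A hedged usage sketch (the constructor's argument order is inferred from the instance variables assigned above, and the beanstalk address is illustrative):

    websites = ['http://example.com', 'http://example.org']
    crawler  = Searcher::MultipleCrawler.new(websites, [['localhost:11300']], 4)
    results  = crawler.run  # the lines the workers wrote to the IPC pipe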
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sofi-searcher
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 prerelease:
 platform: ruby
 authors: