sofi-searcher 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/searcher.rb +2 -2
  2. data/lib/searcher/spider.rb +33 -25
  3. metadata +1 -1
data/lib/searcher.rb CHANGED
@@ -5,7 +5,7 @@ class Searcher
5
5
  ChinaSearchers = %w(baidu sogou so360)
6
6
  AllSearchers = UsSearchers + ChinaSearchers
7
7
  class << self
8
- def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
8
+ def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
9
9
  infos = []
10
10
  start_time = Time.now
11
11
  searchers.each do |searcher|
@@ -26,7 +26,7 @@ class Searcher
26
26
  end
27
27
  end
28
28
 
29
- def get_infos_from(url,selector='title')
29
+ def get_infos_from_url(url,selector='title')
30
30
  crawler.fetch(url,selector)
31
31
  end
32
32
 
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
16
16
  attr_accessor :user_agent, :redirect_limit, :timeout
17
17
 
18
18
  def fetch(website,selector='')
19
-
19
+ p "Pid:#{Process.pid}, fetch: #{website}\n"
20
20
  res = Global.get_whole_response(website,@user_agent,@timeout)
21
21
  html = Global.get_whole_html(res,@user_agent,@timeout)
22
22
  doc = Nokogiri::HTML(html)
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
31
31
  @websites = websites # the url we ready to crawl
32
32
  @beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
33
33
  @pm_max = pm_max # max process number
34
- @user_agent = user_agent # user_agent 伪装成浏览器访问
34
+ @user_agent = user_agent
35
35
  @redirect_limit = redirect_limit
36
-
37
- @ipc_reader, @ipc_writer = IO.pipe # 缓存结果的 ipc 管道
36
+ @ipc_reader, @ipc_writer = IO.pipe
38
37
  end
39
38
 
40
39
 
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
48
47
  job.delete
49
48
  end
50
49
  rescue Beanstalk::TimedOut
51
- print "Beanstalk queues cleared!(Beanstalk 队列已清除)\n"
50
+ print "Beanstalk queues cleared!\n"
52
51
  end
53
52
  @websites.size.times{|i| beanstalk.put(i)} # 将所有的任务压栈
54
53
  beanstalk.close
55
- rescue => e
56
- puts e
57
- exit
54
+ rescue => e
55
+ puts e
56
+ exit
58
57
  end
59
58
 
60
59
 
61
- def process_jobs # 处理任务
62
- start_time = Time.now
60
+ def process_jobs
61
+
63
62
  pm = Parallel::ForkManager.new(@pm_max)
63
+
64
+ #pm.run_on_start do |pid,ident|
65
+ # print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
66
+ #end
67
+ #
68
+ #pm.run_on_finish {
69
+ # |pid,exit_code,ident|
70
+ # print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
71
+ #}
72
+
64
73
  @pm_max.times do |i|
65
- pm.start(i) and next # 启动后,立刻 next 不会等待进程执行完,这样才可以并行运算
74
+
75
+ pm.start(i) and next
66
76
  beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
67
- @ipc_reader.close # 关闭读取管道,子进程只返回数据
68
- loop{
77
+ @ipc_reader.close
78
+
79
+ loop do
69
80
  begin
70
- job = beanstalk.reserve(0.1) # 检测超时为0.1秒,因为任务以前提前压栈
81
+ job = beanstalk.reserve(0.1) # timeout 0.1s
71
82
  index = job.body
72
83
  job.delete
73
84
  website = @websites[index.to_i]
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
76
87
  rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
77
88
  break
78
89
  end
79
- }
80
- @ipc_writer.close
81
- pm.finish(0)
90
+ end
91
+ pm.finish(i)
82
92
  end
93
+
83
94
  @ipc_writer.close
95
+
84
96
  begin
85
- pm.wait_all_children # (wait all children finished) 等待所有子进程处理完毕
97
+ pm.wait_all_children
86
98
  rescue SystemExit, Interrupt
87
- print "Interrupt wait all children! (等待所有子进程处理完毕)\n"
88
- ensure
89
- results = read_results
90
- #ap results, :indent => -4 , :index=>false # 打印处理结果
91
- #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
99
+ print "Interrupt wait all children!\n"
92
100
  end
93
- end
94
101
 
102
+ end
95
103
 
96
104
  def read_results
97
105
  results = []
98
106
  while result = @ipc_reader.gets
99
107
  results << result
100
108
  end
101
- @ipc_reader.close
102
109
  results
103
110
  end
104
111
 
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
106
113
  def run
107
114
  init_beanstalk_jobs
108
115
  process_jobs
116
+ read_results
109
117
  end
110
118
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sofi-searcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: