sofi-searcher 0.1.1 → 0.1.2

data/lib/core/nil.rb ADDED
@@ -0,0 +1,9 @@
+ class Nil
+   def to_s
+     ''
+   end
+
+   def +(s)
+     s
+   end
+ end
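A brief illustration of what this null-object stub enables. The snippet below is a sketch for illustration only, not part of the diff; it assumes the gem's lib/ directory is on the load path and `maybe_nil` is a hypothetical placeholder value:

    require 'core/nil'

    # Nil#to_s returns '' and Nil#+ returns its argument, so a Nil
    # placeholder can be concatenated like an empty string.
    maybe_nil = Nil.new            # hypothetical placeholder value
    puts maybe_nil.to_s + 'bing'   # => "bing"
    puts maybe_nil + 'baidu'       # => "baidu"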
data/lib/searcher/bing.rb CHANGED
@@ -26,7 +26,7 @@
   regex_http = /http.*?</
   real_url = n.match(regex_http)
   real_url = real_url.to_s.delete('<')
-  super_link.push(real_url)
+  #super_link.push(real_url)
   #Global.save_link_info(real_url, 'bing')
   super_link << [real_url,"bing"]
   end
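After this change super_link holds only [url, engine] pairs; the duplicate bare-URL push is commented out. A minimal sketch of consuming that structure (the sample data below is illustrative, not from the gem):

    # super_link is an array of [url, engine] pairs after this change.
    super_link = [['http://example.com/a', 'bing'], ['http://example.com/b', 'baidu']]
    super_link.each do |url, engine|
      puts "#{engine}: #{url}"
    end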
data/lib/searcher/china_searcher.rb CHANGED
@@ -22,13 +22,13 @@ class Searcher::ChinaSearcher
 
 
   def keyword_urls (keyword,page=PAGE_NUM)
- i = 'baidu' == self.name ? 0 : 1
+ i = 'baidu' == @name ? 0 : 1
   sites = []
   loop do
- url_with_keyword = self.url + URI.encode(keyword) + '&' + self.page_name + '=' + i.to_s
+ url_with_keyword = @url + URI.encode(keyword) + '&' + @page_name + '=' + i.to_s
   sites << url_with_keyword
   i += page_no.to_i
- break if i > (page * self.page_no.to_i)
+ break if i > (page * @page_no.to_i)
   end
   sites
   end
@@ -39,7 +39,7 @@ class Searcher::ChinaSearcher
   super_link = Array.new
   regex = /<a.*?href.*?<\/a>/
 
- #Global.save_to_file(content,self.name + '.html','/htmls')
+ #Global.save_to_file(content,@name + '.html','/htmls')
 
   content.scan(regex).each do |n|
   if n.index('<em>') != nil
@@ -48,8 +48,8 @@ class Searcher::ChinaSearcher
   string_url = url.to_s.delete('"')
   redirect_url = Global.html_get_web_url(string_url)
   if redirect_url != nil
- super_link << [redirect_url,self.name]
- #Global.save_link_info(redirect_url,self.name)
+ super_link << [redirect_url,@name]
+ #Global.save_link_info(redirect_url,@name)
   end
   end
   end
@@ -67,6 +67,7 @@ class Searcher::ChinaSearcher
   end
   urls
   end
+
   end
 
   end
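The hunks above swap self.name, self.url, self.page_name, and self.page_no for the instance variables @name, @url, @page_name, and @page_no, reading the ivars directly instead of going through reader methods. A minimal sketch of the difference in plain Ruby (the class below is a stand-in, not the gem's ChinaSearcher):

    class SearcherExample            # stand-in class for illustration
      def initialize(name)
        @name = name                 # no attr_reader defined
      end

      def direct
        @name                        # reading the ivar always works
      end

      def via_self
        self.name                    # raises NoMethodError unless attr_reader :name exists
      end
    end

    s = SearcherExample.new('baidu')
    puts s.direct                    # => baidu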
data/lib/searcher/global.rb CHANGED
@@ -1,19 +1,19 @@
   module Global
 
 
- require 'ap' # gem install awesome_print
+ #require 'ap' # gem install awesome_print
   require 'json'
   require 'nokogiri'
   require 'forkmanager' # gem install parallel-forkmanager
   require 'beanstalk-client'
   require 'net/http'
   require 'uri'
- Dir[ "./core/*.rb" ].each { |file| require(file) }
+
 
   UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
+ Beanstalk_jobs = [['localhost:11300'],'crawler-jobs']
 
   def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
- # get_final_url_from_response().
 
   raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
   begin
@@ -23,9 +23,7 @@ module Global
   when Net::HTTPSuccess then
   url
   when Net::HTTPRedirection then
- location = response['location']
- #warn "redirected to #{location}"
- #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
+ response['location']
   else
   nil
   end
@@ -37,7 +35,7 @@ module Global
 
   def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
   uri = URI.parse(url)
- req = Net::HTTP::Get.new(uri.path + '?' + uri.query.to_s)
+ req = Net::HTTP::Get.new(uri.path.to_s + '?' + uri.query.to_s)
   req.add_field('User-Agent', user_agent)
   res = Net::HTTP.start(uri.host, uri.port) do |http|
   http.read_timeout = timeout
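As the hunks above show, Global.html_get_web_url returns the URL itself on a 2xx response, the redirect target (the Location header) on a 3xx, and nil otherwise. A rough usage sketch under that reading; the URL is a placeholder and lib/ is assumed to be on the load path:

    require 'searcher/global'    # assumes the gem's lib/ directory is on $LOAD_PATH

    # Returns the URL on success, the Location header on redirect, nil otherwise.
    final = Global.html_get_web_url('http://example.com/some-result')
    puts final unless final.nil?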
data/lib/searcher/spider.rb ADDED
@@ -0,0 +1,110 @@
+ #!/usr/bin/env ruby
+ #encoding: UTF-8
+ require 'searcher/global'
+ require 'core/nil'
+
+ class Searcher::MultipleCrawler
+
+   class Crawler
+
+     def initialize(user_agent=Global::UserAgent, redirect_limit=1)
+       @user_agent = user_agent
+       @redirect_limit = redirect_limit
+       @timeout = 20
+     end
+
+     attr_accessor :user_agent, :redirect_limit, :timeout
+
+     def fetch(website,selector='')
+
+       res = Global.get_whole_response(website,@user_agent,@timeout)
+       html = Global.get_whole_html(res,@user_agent,@timeout)
+       doc = Nokogiri::HTML(html)
+       #doc.css(selector) if selector != ''
+     end
+
+   end
+
+
+
+   def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
+     @websites = websites # the URLs we are about to crawl
+     @beanstalk_jobs = beanstalk_jobs # beanstalk host, port, and tube name
+     @pm_max = pm_max # maximum number of worker processes
+     @user_agent = user_agent # user_agent, so requests look like they come from a browser
+     @redirect_limit = redirect_limit
+
+     @ipc_reader, @ipc_writer = IO.pipe # IPC pipe that buffers the results
+   end
+
+
+   attr_accessor :user_agent, :redirect_limit
+
+   def init_beanstalk_jobs
+     beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+     # clear any leftover messages from the beanstalk queue
+     begin
+       while job = beanstalk.reserve(0.1)
+         job.delete
+       end
+     rescue Beanstalk::TimedOut
+       print "Beanstalk queues cleared!\n"
+     end
+     @websites.size.times{|i| beanstalk.put(i)} # enqueue every job up front
+     beanstalk.close
+   rescue => e
+     puts e
+     exit
+   end
+
+
+   def process_jobs # work through the queued jobs
+     start_time = Time.now
+     pm = Parallel::ForkManager.new(@pm_max)
+     @pm_max.times do |i|
+       pm.start(i) and next # after forking, move on immediately instead of waiting for the child, so the workers run in parallel
+       beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+       @ipc_reader.close # close the read end; child processes only write results back
+       loop{
+         begin
+           job = beanstalk.reserve(0.1) # 0.1s reserve timeout, since all jobs were enqueued beforehand
+           index = job.body
+           job.delete
+           website = @websites[index.to_i]
+           result = Crawler.new.fetch(website)
+           @ipc_writer.puts(result)
+         rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
+           break
+         end
+       }
+       @ipc_writer.close
+       pm.finish(0)
+     end
+     @ipc_writer.close
+     begin
+       pm.wait_all_children # wait for all child processes to finish
+     rescue SystemExit, Interrupt
+       print "Interrupted while waiting for all children!\n"
+     ensure
+       results = read_results
+       #ap results, :indent => -4 , :index=>false # print the results
+       #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+     end
+   end
+
+
+   def read_results
+     results = []
+     while result = @ipc_reader.gets
+       results << result
+     end
+     @ipc_reader.close
+     results
+   end
+
+
+   def run
+     init_beanstalk_jobs
+     process_jobs
+   end
+ end
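A rough usage sketch for the new crawler under its stated defaults; it assumes a beanstalkd server is listening on localhost:11300, that the beanstalk-client and parallel-forkmanager gems are installed, and that lib/ is on the load path (the URLs are placeholders):

    require 'searcher'

    urls = ['http://example.com/', 'http://example.org/']   # placeholder websites
    crawler = Searcher::MultipleCrawler.new(urls)            # uses Global::Beanstalk_jobs and 10 workers by default
    crawler.run                                              # enqueue the jobs, fork workers, collect results over the pipe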
data/lib/searcher.rb CHANGED
@@ -1,29 +1,39 @@
+ require 'core/nil'
+
   class Searcher
- UsSearchers = ['google','bing']
- ChinaSearchers = ['baidu','sogou','so360']
+ UsSearchers = %w(google bing)
+ ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
- class << self
- def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
+ class << self
+ def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
   infos = []
   start_time = Time.now
   searchers.each do |searcher|
- infos += send "get_info_from_" + searcher,keywords,page
+ infos += send 'get_info_from_' + searcher,keywords,page
   end
- p "after search #{(Time.now - start_time).round(4)}"
+ infos << (Time.now - start_time).round(4)
   infos
- end
-
- AllSearchers.each do |search|
+ end
+
+ AllSearchers.each do |search|
   define_method "get_info_from_#{search}" do |keywords,page=1|
   if UsSearchers.include?(search)
- send "get_list_from_" + search,keywords,page
+ send 'get_list_from_' + search,keywords,page
   else
   searcher = send(search)
- return searcher.get_list(keywords,page)
+ searcher.get_list(keywords,page)
   end
   end
- end
-
+ end
+
+ def get_infos_from(url,selector='title')
+ crawler.fetch(url,selector)
+ end
+
+ def crawler
+ @crawler = MultipleCrawler::Crawler.new
+ end
+
   def baidu
   @baidu = ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
   end
@@ -33,23 +43,20 @@ class << self
   end
 
   def so360
- @so360 = ChinaSearcher.new('360','http://www.so.com/s?&q=')
+ @so360 = ChinaSearcher.new('so360','http://www.so.com/s?&q=')
   end
 
   def china_searchers
- [baidu,sogou,so360] # the result from baidu contains many redirect urls, So when add it , the run time will be longer and longer
+ [baidu,sogou,so360]
   #[sogou,so360]
   end
 
- #Searcher.china_searchers.each do |searcher|
- #define_method "get_info_from_#{searcher.name}" do |keywords,page=1|
- # searcher.get_list(keywords,page)
- #end
- #end
- end
+
+ end
 
   end
   require 'searcher/china_searcher'
+ require 'searcher/spider'
   require 'searcher/bing'
   require 'searcher/google'
 
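Putting the searcher.rb changes together, a short usage sketch under the assumption that the gem is installed and its lib/ directory is on the load path (the keyword and URL are placeholders):

    require 'searcher'

    # Query every configured engine. Each hit is an [url, engine] pair, and the
    # elapsed time in seconds is now appended instead of being printed with `p`.
    infos = Searcher.get_infos_from_searches('ruby gems', 1)
    elapsed = infos.pop
    infos.each { |url, engine| puts "#{engine}: #{url}" }

    # Fetch a single page through the new Crawler-backed helper; the selector is
    # currently unused since doc.css is commented out, so this returns the parsed
    # Nokogiri document.
    Searcher.get_infos_from('http://example.com/', 'title')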
metadata CHANGED
@@ -1,7 +1,7 @@
   --- !ruby/object:Gem::Specification
   name: sofi-searcher
   version: !ruby/object:Gem::Version
- version: 0.1.1
+ version: 0.1.2
   prerelease:
   platform: ruby
   authors:
@@ -9,7 +9,7 @@ authors:
   autorequire:
   bindir: bin
   cert_chain: []
- date: 2013-04-09 00:00:00.000000000 Z
+ date: 2013-04-12 00:00:00.000000000 Z
   dependencies: []
   description: Just a simple Searcher
   email: 532681765@qq.com
@@ -18,10 +18,12 @@ extensions: []
   extra_rdoc_files: []
   files:
   - lib/searcher.rb
+ - lib/core/nil.rb
   - lib/searcher/bing.rb
   - lib/searcher/china_searcher.rb
   - lib/searcher/global.rb
   - lib/searcher/google.rb
+ - lib/searcher/spider.rb
   homepage: http://rubygems.org/gems/sofi-searcher
   licenses: []
   post_install_message: