sofi-searcher 0.1.1 → 0.1.2
- data/lib/core/nil.rb +9 -0
- data/lib/searcher/bing.rb +1 -1
- data/lib/searcher/china_searcher.rb +7 -6
- data/lib/searcher/global.rb +5 -7
- data/lib/searcher/spider.rb +110 -0
- data/lib/searcher.rb +28 -21
- metadata +4 -2
data/lib/core/nil.rb
ADDED
data/lib/searcher/bing.rb
CHANGED
data/lib/searcher/china_searcher.rb
CHANGED
@@ -22,13 +22,13 @@ class Searcher::ChinaSearcher
 
 
   def keyword_urls (keyword,page=PAGE_NUM)
-    i = 'baidu' ==
+    i = 'baidu' == @name ? 0 : 1
     sites = []
     loop do
-      url_with_keyword =
+      url_with_keyword = @url + URI.encode(keyword) + '&' + @page_name + '=' + i.to_s
      sites << url_with_keyword
      i += page_no.to_i
-      break if i > (page *
+      break if i > (page * @page_no.to_i)
     end
     sites
   end
@@ -39,7 +39,7 @@ class Searcher::ChinaSearcher
     super_link = Array.new
     regex = /<a.*?href.*?<\/a>/
 
-    #Global.save_to_file(content
+    #Global.save_to_file(content,@name + '.html','/htmls')
 
     content.scan(regex).each do |n|
       if n.index('<em>') != nil
@@ -48,8 +48,8 @@ class Searcher::ChinaSearcher
         string_url = url.to_s.delete('"')
         redirect_url = Global.html_get_web_url(string_url)
         if redirect_url != nil
-          super_link << [redirect_url
-          #Global.save_link_info(redirect_url
+          super_link << [redirect_url,@name]
+          #Global.save_link_info(redirect_url,@name)
         end
       end
     end
@@ -67,6 +67,7 @@ class Searcher::ChinaSearcher
     end
     urls
   end
+
 end
 
 end
data/lib/searcher/global.rb
CHANGED
@@ -1,19 +1,19 @@
 module Global
 
 
-  require 'ap' # gem install awesome_print
+  #require 'ap' # gem install awesome_print
   require 'json'
   require 'nokogiri'
   require 'forkmanager' # gem install parallel-forkmanager
   require 'beanstalk-client'
   require 'net/http'
   require 'uri'
-
+
 
   UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
+  Beanstalk_jobs = [['localhost:11300'],'crawler-jobs']
 
   def self.html_get_web_url(url,user_agent=UserAgent,timeout=20,redirect_limit=3)
-    # get_final_url_from_response().
 
     raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0
     begin
@@ -23,9 +23,7 @@ module Global
       when Net::HTTPSuccess then
         url
       when Net::HTTPRedirection then
-
-        #warn "redirected to #{location}"
-        #html_get_web_url(location,user_agent,timeout,redirect_limit - 1)
+        response['location']
       else
         nil
       end
@@ -37,7 +35,7 @@ module Global
 
   def self.get_whole_response(url,user_agent=UserAgent,timeout=20)
     uri = URI.parse(url)
-    req = Net::HTTP::Get.new(uri.path + '?' + uri.query.to_s)
+    req = Net::HTTP::Get.new(uri.path.to_s + '?' + uri.query.to_s)
     req.add_field('User-Agent', user_agent)
     res = Net::HTTP.start(uri.host, uri.port) do |http|
       http.read_timeout = timeout
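
The html_get_web_url change above stops recursing on redirects and simply returns the Location header, while get_whole_response now guards against a nil path. A minimal standalone sketch of the combined behaviour, using plain net/http; the method name resolve_url is chosen here for illustration and is not part of the gem.

    require 'net/http'
    require 'uri'

    USER_AGENT = 'Mozilla/5.0 (compatible; example)'

    # Returns the URL itself on 2xx, the Location header on a redirect,
    # and nil on anything else or on error, mirroring the patched logic.
    def resolve_url(url, user_agent = USER_AGENT, timeout = 20)
      uri = URI.parse(url)
      req = Net::HTTP::Get.new(uri.path.to_s + '?' + uri.query.to_s)
      req.add_field('User-Agent', user_agent)
      res = Net::HTTP.start(uri.host, uri.port) do |http|
        http.read_timeout = timeout
        http.request(req)
      end
      case res
      when Net::HTTPSuccess     then url
      when Net::HTTPRedirection then res['location']
      end
    rescue StandardError
      nil
    end

    puts resolve_url('http://rubygems.org/gems/sofi-searcher')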
data/lib/searcher/spider.rb
ADDED
@@ -0,0 +1,110 @@
+#!/usr/bin/env ruby
+#encoding: UTF-8
+require 'searcher/global'
+require 'core/nil'
+
+class Searcher::MultipleCrawler
+
+  class Crawler
+
+    def initialize(user_agent=Global::UserAgent, redirect_limit=1)
+      @user_agent = user_agent
+      @redirect_limit = redirect_limit
+      @timeout = 20
+    end
+
+    attr_accessor :user_agent, :redirect_limit, :timeout
+
+    def fetch(website,selector='')
+
+      res = Global.get_whole_response(website,@user_agent,@timeout)
+      html = Global.get_whole_html(res,@user_agent,@timeout)
+      doc = Nokogiri::HTML(html)
+      #doc.css(selector) if selector != ''
+    end
+
+  end
+
+
+
+  def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
+    @websites = websites # the urls we are going to crawl
+    @beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
+    @pm_max = pm_max # max process number
+    @user_agent = user_agent # user_agent used to pose as a browser
+    @redirect_limit = redirect_limit
+
+    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe used to buffer results
+  end
+
+
+  attr_accessor :user_agent, :redirect_limit
+
+  def init_beanstalk_jobs
+    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+    # clear any leftover messages in the beanstalk queue
+    begin
+      while job = beanstalk.reserve(0.1)
+        job.delete
+      end
+    rescue Beanstalk::TimedOut
+      print "Beanstalk queues cleared!\n"
+    end
+    @websites.size.times{|i| beanstalk.put(i)} # push every job onto the queue
+    beanstalk.close
+  rescue => e
+    puts e
+    exit
+  end
+
+
+  def process_jobs # work through the queued jobs
+    start_time = Time.now
+    pm = Parallel::ForkManager.new(@pm_max)
+    @pm_max.times do |i|
+      pm.start(i) and next # call next right after forking so the parent does not wait; this is what makes the workers run in parallel
+      beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
+      @ipc_reader.close # close the read end; child processes only write results back
+      loop{
+        begin
+          job = beanstalk.reserve(0.1) # 0.1s reserve timeout, since every job was queued in advance
+          index = job.body
+          job.delete
+          website = @websites[index.to_i]
+          result = Crawler.new.fetch(website)
+          @ipc_writer.puts(result)
+        rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
+          break
+        end
+      }
+      @ipc_writer.close
+      pm.finish(0)
+    end
+    @ipc_writer.close
+    begin
+      pm.wait_all_children # wait until all child processes have finished
+    rescue SystemExit, Interrupt
+      print "Interrupt wait all children!\n"
+    ensure
+      results = read_results
+      #ap results, :indent => -4 , :index=>false # print the results
+      #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+    end
+  end
+
+
+  def read_results
+    results = []
+    while result = @ipc_reader.gets
+      results << result
+    end
+    @ipc_reader.close
+    results
+  end
+
+
+  def run
+    init_beanstalk_jobs
+    process_jobs
+  end
+end
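
Taken together, the new Searcher::MultipleCrawler pushes one beanstalk job per URL, forks up to pm_max workers that each reserve jobs and fetch pages through Crawler#fetch, and streams results back to the parent over an IO.pipe. A hypothetical driver script follows; it assumes a local beanstalkd listening on 11300 (as implied by Global::Beanstalk_jobs) plus the beanstalk-client and parallel-forkmanager gems, and it is not shipped with the gem.

    require 'searcher'

    websites = [
      'http://rubygems.org/gems/sofi-searcher',
      'http://www.ruby-lang.org/en/'
    ]

    # Two worker processes; beanstalk settings come from Global::Beanstalk_jobs.
    crawler = Searcher::MultipleCrawler.new(websites, Global::Beanstalk_jobs, 2)
    crawler.run   # queue the URLs, fork the workers, wait for the children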
data/lib/searcher.rb
CHANGED
@@ -1,29 +1,39 @@
+require 'core/nil'
+
 class Searcher
-  UsSearchers =
-  ChinaSearchers =
+  UsSearchers = %w(google bing)
+  ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
-  class << self
-
+  class << self
+    def get_infos_from_searches(keywords,page=1,searchers=AllSearchers)
       infos = []
       start_time = Time.now
       searchers.each do |searcher|
-        infos += send
+        infos += send 'get_info_from_' + searcher,keywords,page
       end
-
+      infos << (Time.now - start_time).round(4)
       infos
-
-
-
+    end
+
+    AllSearchers.each do |search|
       define_method "get_info_from_#{search}" do |keywords,page=1|
         if UsSearchers.include?(search)
-          send
+          send 'get_list_from_' + search,keywords,page
         else
           searcher = send(search)
-
+          searcher.get_list(keywords,page)
         end
       end
-
-
+    end
+
+    def get_infos_from(url,selector='title')
+      crawler.fetch(url,selector)
+    end
+
+    def crawler
+      @crawler = MultipleCrawler::Crawler.new
+    end
+
     def baidu
       @baidu = ChinaSearcher.new('baidu', 'http://www.baidu.com/s?wd=','10')
     end
@@ -33,23 +43,20 @@ class << self
     end
 
     def so360
-      @so360 = ChinaSearcher.new('
+      @so360 = ChinaSearcher.new('so360','http://www.so.com/s?&q=')
     end
 
     def china_searchers
-      [baidu,sogou,so360]
+      [baidu,sogou,so360]
       #[sogou,so360]
     end
 
-
-
-    # searcher.get_list(keywords,page)
-    #end
-    #end
-  end
+
+  end
 
 end
 require 'searcher/china_searcher'
+require 'searcher/spider'
 require 'searcher/bing'
 require 'searcher/google'
 
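
With these changes the class-level API can be exercised roughly as in the sketch below; the keyword and URL are only examples, and the calls hit the live search engines, so results will vary.

    require 'searcher'

    # Fan out over one engine; the elapsed query time is appended as the
    # last element of the returned array.
    infos   = Searcher.get_infos_from_searches('ruby', 1, %w(baidu))
    elapsed = infos.last
    links   = infos[0..-2]

    # Fetch a single page through the new crawler helper; Crawler#fetch
    # returns the parsed Nokogiri document.
    doc = Searcher.get_infos_from('http://rubygems.org/gems/sofi-searcher')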
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sofi-searcher
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.2
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-
+date: 2013-04-12 00:00:00.000000000 Z
 dependencies: []
 description: Just a simple Searcher
 email: 532681765@qq.com
@@ -18,10 +18,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/searcher.rb
+- lib/core/nil.rb
 - lib/searcher/bing.rb
 - lib/searcher/china_searcher.rb
 - lib/searcher/global.rb
 - lib/searcher/google.rb
+- lib/searcher/spider.rb
 homepage: http://rubygems.org/gems/sofi-searcher
 licenses: []
 post_install_message: