sofi-searcher 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/searcher.rb +2 -2
- data/lib/searcher/spider.rb +33 -25
- metadata +1 -1
data/lib/searcher.rb
CHANGED
@@ -5,7 +5,7 @@ class Searcher
|
|
5
5
|
ChinaSearchers = %w(baidu sogou so360)
|
6
6
|
AllSearchers = UsSearchers + ChinaSearchers
|
7
7
|
class << self
|
8
|
-
def
|
8
|
+
def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
|
9
9
|
infos = []
|
10
10
|
start_time = Time.now
|
11
11
|
searchers.each do |searcher|
|
@@ -26,7 +26,7 @@ class Searcher
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def get_infos_from_url(url,selector='title')
|
30
30
|
crawler.fetch(url,selector)
|
31
31
|
end
|
32
32
|
|
data/lib/searcher/spider.rb
CHANGED
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
|
|
16
16
|
attr_accessor :user_agent, :redirect_limit, :timeout
|
17
17
|
|
18
18
|
def fetch(website,selector='')
|
19
|
-
|
19
|
+
p "Pid:#{Process.pid}, fetch: #{website}\n"
|
20
20
|
res = Global.get_whole_response(website,@user_agent,@timeout)
|
21
21
|
html = Global.get_whole_html(res,@user_agent,@timeout)
|
22
22
|
doc = Nokogiri::HTML(html)
|
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
|
|
31
31
|
@websites = websites # the url we ready to crawl
|
32
32
|
@beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
|
33
33
|
@pm_max = pm_max # max process number
|
34
|
-
@user_agent = user_agent
|
34
|
+
@user_agent = user_agent
|
35
35
|
@redirect_limit = redirect_limit
|
36
|
-
|
37
|
-
@ipc_reader, @ipc_writer = IO.pipe # 缓存结果的 ipc 管道
|
36
|
+
@ipc_reader, @ipc_writer = IO.pipe
|
38
37
|
end
|
39
38
|
|
40
39
|
|
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
|
|
48
47
|
job.delete
|
49
48
|
end
|
50
49
|
rescue Beanstalk::TimedOut
|
51
|
-
print "Beanstalk queues cleared"
|
50
|
+
print "Beanstalk queues cleared!\n"
|
52
51
|
end
|
53
52
|
@websites.size.times{|i| beanstalk.put(i)} # 将所有的任务压栈
|
54
53
|
beanstalk.close
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
rescue => e
|
55
|
+
puts e
|
56
|
+
exit
|
58
57
|
end
|
59
58
|
|
60
59
|
|
61
|
-
def process_jobs
|
62
|
-
|
60
|
+
def process_jobs
|
61
|
+
|
63
62
|
pm = Parallel::ForkManager.new(@pm_max)
|
63
|
+
|
64
|
+
#pm.run_on_start do |pid,ident|
|
65
|
+
# print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
|
66
|
+
#end
|
67
|
+
#
|
68
|
+
#pm.run_on_finish {
|
69
|
+
# |pid,exit_code,ident|
|
70
|
+
# print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
|
71
|
+
#}
|
72
|
+
|
64
73
|
@pm_max.times do |i|
|
65
|
-
|
74
|
+
|
75
|
+
pm.start(i) and next
|
66
76
|
beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
|
67
|
-
@ipc_reader.close
|
68
|
-
|
77
|
+
@ipc_reader.close
|
78
|
+
|
79
|
+
loop do
|
69
80
|
begin
|
70
|
-
job = beanstalk.reserve(0.1) #
|
81
|
+
job = beanstalk.reserve(0.1) # timeout 0.1s
|
71
82
|
index = job.body
|
72
83
|
job.delete
|
73
84
|
website = @websites[index.to_i]
|
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
|
|
76
87
|
rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
|
77
88
|
break
|
78
89
|
end
|
79
|
-
|
80
|
-
|
81
|
-
pm.finish(0)
|
90
|
+
end
|
91
|
+
pm.finish(i)
|
82
92
|
end
|
93
|
+
|
83
94
|
@ipc_writer.close
|
95
|
+
|
84
96
|
begin
|
85
|
-
pm.wait_all_children
|
97
|
+
pm.wait_all_children
|
86
98
|
rescue SystemExit, Interrupt
|
87
|
-
print "Interrupt wait all children"
|
88
|
-
ensure
|
89
|
-
results = read_results
|
90
|
-
#ap results, :indent => -4 , :index=>false # 打印处理结果
|
91
|
-
#print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
|
99
|
+
print "Interrupt wait all children!\n"
|
92
100
|
end
|
93
|
-
end
|
94
101
|
|
102
|
+
end
|
95
103
|
|
96
104
|
def read_results
|
97
105
|
results = []
|
98
106
|
while result = @ipc_reader.gets
|
99
107
|
results << result
|
100
108
|
end
|
101
|
-
@ipc_reader.close
|
102
109
|
results
|
103
110
|
end
|
104
111
|
|
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
|
|
106
113
|
def run
|
107
114
|
init_beanstalk_jobs
|
108
115
|
process_jobs
|
116
|
+
read_results
|
109
117
|
end
|
110
118
|
end
|