sofi-searcher 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/searcher.rb +2 -2
- data/lib/searcher/spider.rb +33 -25
- metadata +1 -1
data/lib/searcher.rb
CHANGED
@@ -5,7 +5,7 @@ class Searcher
   ChinaSearchers = %w(baidu sogou so360)
   AllSearchers = UsSearchers + ChinaSearchers
   class << self
-    def
+    def get_links_from_searches(keywords,page=1,searchers=AllSearchers)
       infos = []
       start_time = Time.now
       searchers.each do |searcher|
@@ -26,7 +26,7 @@ class Searcher
       end
     end
 
-    def
+    def get_infos_from_url(url,selector='title')
       crawler.fetch(url,selector)
     end
 
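Both removed `def` lines are truncated in the registry's diff view, so only the new signatures are fully visible: `get_links_from_searches` now defaults to `AllSearchers`, and `get_infos_from_url` defaults to the `title` selector. A minimal usage sketch of the 0.1.4 class-level API, assuming the gem is loaded with `require 'searcher'` and using an illustrative keyword and URL:

```ruby
require 'searcher'

# Query every engine (UsSearchers + ChinaSearchers) for page 1 of results.
links = Searcher.get_links_from_searches('ruby crawler')

# Query only the Chinese engines, fetching page 2.
links = Searcher.get_links_from_searches('ruby crawler', 2, Searcher::ChinaSearchers)

# Pull the <title> (the default selector) out of a single page.
info = Searcher.get_infos_from_url('http://example.com')
```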
data/lib/searcher/spider.rb
CHANGED
@@ -16,7 +16,7 @@ class Searcher::MultipleCrawler
   attr_accessor :user_agent, :redirect_limit, :timeout
 
   def fetch(website,selector='')
-
+    p "Pid:#{Process.pid}, fetch: #{website}\n"
     res = Global.get_whole_response(website,@user_agent,@timeout)
     html = Global.get_whole_html(res,@user_agent,@timeout)
     doc = Nokogiri::HTML(html)
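`fetch` now logs its PID and target URL before downloading; the HTML still passes through `Global.get_whole_response`/`Global.get_whole_html` (gem helpers the diff does not show) and into Nokogiri. A standalone sketch of that final parsing step, using `net/http` in place of the gem's helpers:

```ruby
require 'net/http'
require 'nokogiri'

html = Net::HTTP.get(URI('http://example.com'))
doc  = Nokogiri::HTML(html)

# A CSS selector such as 'title' (fetch's default elsewhere in the gem)
# selects matching nodes from the parsed document.
doc.css('title').each { |node| puts node.text }
```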
@@ -31,10 +31,9 @@ class Searcher::MultipleCrawler
     @websites = websites # the url we ready to crawl
     @beanstalk_jobs = beanstalk_jobs # beanstalk host port and so on
     @pm_max = pm_max # max process number
-    @user_agent = user_agent
+    @user_agent = user_agent
     @redirect_limit = redirect_limit
-
-    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe for buffering results
+    @ipc_reader, @ipc_writer = IO.pipe
   end
 
 
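The initializer keeps the `IO.pipe` pair (its removed comment, translated from Chinese, read "IPC pipe for buffering results") that lets forked workers stream results back to the parent process. A minimal sketch of that pattern on its own:

```ruby
reader, writer = IO.pipe

pid = fork do
  reader.close               # the child only writes
  writer.puts 'result from child'
  writer.close
end

writer.close                 # the parent only reads
Process.wait(pid)
puts reader.gets             #=> "result from child"
reader.close
```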
@@ -48,26 +47,38 @@ class Searcher::MultipleCrawler
         job.delete
       end
     rescue Beanstalk::TimedOut
-      print "Beanstalk queues cleared
+      print "Beanstalk queues cleared!\n"
     end
     @websites.size.times{|i| beanstalk.put(i)} # push all the tasks onto the queue
     beanstalk.close
-
-
-
+  rescue => e
+    puts e
+    exit
   end
 
 
-  def process_jobs
-
+  def process_jobs
+
     pm = Parallel::ForkManager.new(@pm_max)
+
+    #pm.run_on_start do |pid,ident|
+    #  print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
+    #end
+    #
+    #pm.run_on_finish {
+    #  |pid,exit_code,ident|
+    #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
+    #}
+
     @pm_max.times do |i|
-
+
+      pm.start(i) and next
       beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
-      @ipc_reader.close
-
+      @ipc_reader.close
+
+      loop do
         begin
-          job = beanstalk.reserve(0.1) #
+          job = beanstalk.reserve(0.1) # timeout 0.1s
           index = job.body
           job.delete
           website = @websites[index.to_i]
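0.1.4 also wraps the reserve/crawl cycle in a `loop do ... end` (closed in the next hunk), so each worker keeps taking jobs until Beanstalk times out instead of handling a single one. The surrounding skeleton is the `Parallel::ForkManager` fork-pool idiom; a stripped-down sketch, assuming the `parallel-forkmanager` gem, which mirrors Perl's Parallel::ForkManager (`start` returns a truthy PID in the parent and falsy in the child; `finish` exits the child):

```ruby
require 'parallel/forkmanager'   # assumed require path for the parallel-forkmanager gem

pm = Parallel::ForkManager.new(4)

4.times do |i|
  pm.start(i) and next           # parent: truthy PID, move on to the next worker; child: fall through
  # ... child work goes here (reserve jobs, crawl, write results to the pipe) ...
  pm.finish(i)                   # child exits; i becomes its exit code, as in the diff
end

pm.wait_all_children             # parent blocks until every worker has exited
```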
@@ -76,29 +87,25 @@ class Searcher::MultipleCrawler
         rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
           break
         end
-
-
-      pm.finish(0)
+      end
+      pm.finish(i)
     end
+
     @ipc_writer.close
+
     begin
-      pm.wait_all_children
+      pm.wait_all_children
     rescue SystemExit, Interrupt
-      print "Interrupt wait all children
-    ensure
-      results = read_results
-      #ap results, :indent => -4 , :index=>false # print the processing results
-      #print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
+      print "Interrupt wait all children!\n"
     end
-  end
 
+  end
 
   def read_results
     results = []
     while result = @ipc_reader.gets
       results << result
     end
-    @ipc_reader.close
     results
   end
 
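`read_results` no longer closes `@ipc_reader` itself, and the old `ensure` block that consumed the results inside `process_jobs` is gone. The `while result = @ipc_reader.gets` loop ends naturally at EOF, which only arrives once every writer end of the pipe is closed (the parent closes `@ipc_writer` after forking; each child closes `@ipc_reader`). A tiny sketch of that EOF behavior:

```ruby
reader, writer = IO.pipe
writer.puts 'a'
writer.puts 'b'
writer.close                 # without this, reader.gets would block forever

results = []
while line = reader.gets     # gets returns nil at EOF, ending the loop
  results << line.chomp
end
results                      #=> ["a", "b"]
```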
@@ -106,5 +113,6 @@ class Searcher::MultipleCrawler
   def run
     init_beanstalk_jobs
     process_jobs
+    read_results
   end
 end
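With `read_results` appended to `run`, the method now returns the collected results instead of discarding them inside `process_jobs`. A hedged driving sketch: the parameter order is inferred from the assignment order in the `@@ -31,10 +31,9 @@` hunk (the initializer itself is not shown, and `user_agent`/`redirect_limit` are assumed to have defaults), and the Beanstalk address is illustrative:

```ruby
crawler = Searcher::MultipleCrawler.new(
  ['http://example.com', 'http://example.org'], # websites to crawl
  [['localhost:11300']],                        # beanstalk_jobs, splatted into Beanstalk::Pool.new
  2                                             # pm_max: number of worker processes
)
results = crawler.run   # in 0.1.4, run ends with read_results, so the results array comes back
```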