rspider 0.8.4

@@ -0,0 +1,53 @@
+ module Rspider
+   class OptParser < Hash
+     def initialize(args)
+       super()
+       self[:conf]  = ""
+       self[:env]   = "TEST"
+       self[:debug] = "on"
+       opts = OptionParser.new do |opt|
+         opt.banner = "Usage: #$0 [options]"
+
+         opt.on("-c", "--conf [STRING]",
+                'The configuration file') do |confFile|
+           if confFile.nil?
+             puts "Configuration not specified"
+             exit
+           end
+           confFile.chomp!
+           if confFile == ""
+             puts "No configuration file given"
+             exit
+           end
+           if !File.file?(confFile)
+             puts "Configuration #{confFile} does not exist"
+             exit
+           end
+           self[:conf] = confFile
+         end
+         opt.on("-e", "--env [STRING]",
+                'The environment') do |env|
+           if env.upcase == "PRO"
+             env = "PRO"
+           else
+             env = "TEST"
+           end
+           self[:env] = env
+         end
+         opt.on("-d", "--debug [on|off]", 'show debug messages') do |d|
+           if d.upcase == "ON"
+             d = "on"
+           else
+             d = "off"
+           end
+           self[:debug] = d
+         end
+         opt.on("-h", "--help", 'display this help and exit') do
+           puts opt
+           exit
+         end
+       end
+       opts.parse!(args)
+     end
+   end
+ end
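
For orientation, here is a minimal usage sketch of the option parser above, assuming the Rspider classes are already loaded; the arguments are hypothetical, and `__FILE__` simply stands in for any existing file so the `File.file?` check passes:

    require "optparse"

    # Parse ARGV-style arguments into a Hash of settings.
    opts = Rspider::OptParser.new(["--conf=#{__FILE__}", "--env=pro", "--debug=off"])
    puts opts[:conf]    # path given with --conf
    puts opts[:env]     # => "PRO"
    puts opts[:debug]   # => "off"
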
@@ -0,0 +1,92 @@
+ # Understand robots.txt.
+
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+
+ require "uri"
+ module Rspider
+   # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+   class RobotRules
+     # user_agent is a string like 'Mozilla/IE6.0' and the like.
+     # It is sent to the site in the request header:
+     # ============================
+     # GET /robots.txt HTTP/1.1
+     # HOST www.example.com
+     # User-Agent: #{user_agent}
+     # ===========================
+     def initialize( user_agent )
+       @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
+       @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+     end
+     # parse the contents of robots.txt
+     def parse( text_uri, robots_data )
+       begin
+         uri = URI.parse(text_uri)
+       rescue Exception => e
+         puts "-" * 80
+         puts "\n" * 3
+         puts "#{e} (uri: #{text_uri})"
+         return
+       end
+       location = "#{uri.host}:#{uri.port}"
+       @rules.delete(location)
+
+       rules = robots_data.split(/[\015\012]+/).map do |rule|
+         rule.sub(/\s*#.*$/, "")
+       end
+       anon_rules = Array.new
+       my_rules = Array.new
+       current = anon_rules
+       rules.each do |rule|
+         case rule
+         when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+           break unless my_rules.empty?
+
+           current = if $1 == "*"
+             anon_rules
+           elsif $1.downcase.index(@user_agent)
+             my_rules
+           else
+             nil
+           end
+         when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+           next if current.nil?
+
+           if $1.empty?
+             current << nil
+           else
+             disallow = URI.parse($1)
+
+             next unless disallow.scheme.nil? or disallow.scheme ==
+               uri.scheme
+             next unless disallow.port.nil? or disallow.port == uri.port
+             next unless disallow.host.nil? or
+               disallow.host.downcase == uri.host.downcase
+
+             disallow = disallow.path
+             disallow = "/" if disallow.empty?
+             disallow = "/#{disallow}" unless disallow[0] == ?/
+
+             current << disallow
+           end
+         end
+       end
+
+       @rules[location] = if my_rules.empty?
+         anon_rules.compact
+       else
+         my_rules.compact
+       end
+     end
+     # decide whether we may crawl the url
+     def allowed?( text_uri )
+       uri = URI.parse(text_uri)
+       location = "#{uri.host}:#{uri.port}"
+       path = uri.path
+
+       return true unless %w{http https}.include?(uri.scheme)
+
+       not @rules[location].any? { |rule| path.index(rule) == 0 }
+     end
+   end
+ end
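
A short usage sketch of the rules parser, assuming only the class above; the robots.txt body is hand-written here rather than fetched over HTTP:

    require "uri"

    rules = Rspider::RobotRules.new("Rspider/0.8.4")
    robots_txt = <<-ROBOTS
      User-Agent: *
      Disallow: /private/
    ROBOTS
    rules.parse("http://www.example.com/robots.txt", robots_txt)
    puts rules.allowed?("http://www.example.com/index.html")      # => true
    puts rules.allowed?("http://www.example.com/private/a.html")  # => false
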
@@ -0,0 +1,45 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ module Rspider
+ =begin rdoc
+ This class tracks visits to a site so that the site is not visited too frequently.
+ =end
+
+   class SiteLocker
+     attr_accessor :site, :time, :max
+     # set up the visit counter for a site
+     #
+     def initialize(max, site="www.example.com")
+       @visits = Hash.new
+       @max    = max
+       @site   = site
+       @time   = 5
+     end
+     # record that we visited the site
+     def visitedSite()
+       t = Time.now.to_i
+       @visits[t] = @visits[t].to_i + 1
+     end
+     # may we visit the site again?
+     def canVisitSite?()
+       t = Time.now.to_i
+       @visits.delete_if { |k, v|
+         k < (t - @time)
+       }
+       values = 0
+       @visits.values.each { |v|
+         values = values + v
+       }
+       return values < @max
+     end
+     # dump the data structure to a string
+     def to_s
+       temp = @visits.collect { |k, v| "visits[#{k}]\t=>\t#{v}" }
+       temp.join("\n")
+     end
+   end
+ end
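
The locker can be exercised on its own; a sketch with a hypothetical limit of three visits per time window:

    locker = Rspider::SiteLocker.new(3, "www.example.com")
    5.times do |i|
      if locker.canVisitSite?
        locker.visitedSite
        puts "visit #{i}: allowed"
      else
        puts "visit #{i}: throttled, back off for a while"
      end
    end
    puts locker.to_s   # dump the per-second visit counts
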
@@ -0,0 +1,324 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ require "uri"
+ require "open-uri"
+ require "net/http"
+ require "net/https"
+ require "cgi"
+
+ R_links_regexps=[/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i,
+   /(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]+(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i ]
+
+
+ module Rspider
+   # This is the main class of the Rspider library.
+   # It crawls sites and stores urls and documents.
+   class Spider
+     attr_accessor :urls,:can_leave_domain,:threads,:max_depth,:buckets,:source,:same_domain_regexp,:urlStorage,:logger,:browser,:contentStorage,:siteLockers,:relationStorage
+     # Param conf must be a hash from ConfParser
+     def initialize(conf)
+       @can_leave_domain=conf["can_leave_domain"]
+       @max_depth=conf["max_depth"]
+       @max_redirects=conf["max_redirects"]
+       @save_path=conf["save_path"]
+       @buckets=conf["buckets"]
+       @source=conf["source"]
+       @threads=conf["threads"]
+       @same_domain_regexp=Regexp.new(conf["same_domain_regexp"])
+       @conf=conf
+       @accepted_formats="text/html,text/xml,text/plain"
+       @logger=nil
+       @contentStorage=nil
+       @tidy=HtmlTidy.new
+       @setup=nil
+       @teardown=nil
+       @callbacks=Hash.new
+       @siteLockers=Hash.new
+       @robotRules=RobotRules.new(@conf["agent"])
+       @visitedRules=[]
+       @badUrlSymbols=["@","<",">","(",")","$","*","[","]"]
+       # pre-compute the CGI-escaped form of each symbol we rewrite in urls
+       @badUrlSymbols.collect! { |k|
+         [k, CGI.escape(k)]
+       }
+     end
+     # main entry point of the class
+     def start_from(url)
+       @logger.log_msg "start url can't be crawled!" if ( $DEBUG and !urlCanBeCralwered?(url))
+       @urlStorage << url if urlCanBeCralwered?(url)
+     end
+     def run(max_times=16)
+       j=0
+       while(true) do
+         # $tracker.stop($stdout) if interrupted
+         return if j > max_times
+         j = j + 1
+         url=@urlStorage.pop
+         # puts "fetched url:#{url}" if $DEBUG
+         puts "thread ended: no more urls" if ( url.nil? or url=="failed") and $DEBUG
+         @logger.log_msg("error: no more urls","ERROR") if (url.nil? or url=="failed")
+         exit if url=="failed"
+         exit if url.nil?
+         begin
+           uri=URI::parse(url)
+           domain=uri.host.to_s+":"+uri.port.to_s
+           @siteLockers[domain]=SiteLocker.new(12,domain) unless @siteLockers.has_key?(domain)
+           if @siteLockers[domain].canVisitSite?
+             fetch(url,0) if urlCanBeCralwered?(url)
+             #debug $mem_profiler.add url
+             #debug $mem_profiler.report
+             @siteLockers[domain].visitedSite
+           else
+             sleep 0.5
+             next
+           end
+         rescue Exception => e
+           puts "error: Exception #{e}" if $DEBUG
+         end
+       end
+     end
+     # whether the url may be crawled, obeying the rules from robots.txt
+     def urlCanBeCralwered?(url)
+       uri=URI::parse(url)
+       robot_url="#{uri.scheme}://#{uri.host}:#{uri.port}/robots.txt"
+       if !@visitedRules.include?(robot_url)
+         begin
+           content=@browser.get(URI::parse(robot_url))
+           return true if content.code == "404"
+           @robotRules.parse(robot_url,content.body)
+           # fall through so the parsed rules are cached and consulted below
+         rescue URI::InvalidURIError => invalidUri
+           @logger.log_msg("invalid uri:#{url}")
+           return false
+         rescue Exception => e
+           puts "something went wrong fetching #{robot_url}"
+           return true
+         end
+         @visitedRules << robot_url
+       end
+       @robotRules.allowed?(url)
+     end
+     #apply filters
+     def do_callbacks(url, resp)
+       cbs = [@callbacks[:every],
+              resp.success? ? @callbacks[:success] : @callbacks[:failure],
+              @callbacks[resp.code.to_i]]
+       cbs.each do |cb|
+         cb.call(url, resp) if cb
+       end
+     end
+     def setup(p = nil, &block)
+       @setup = p ? p : block
+     end
+
+     # Run last, once for each page. Given the URL as a string.
+     def teardown(p = nil, &block)
+       @teardown = p ? p : block
+     end
+
+     def on(code, p = nil, &block)
+       f = p ? p : block
+       case code
+       when Fixnum
+         @callbacks[code] = f
+       else
+         @callbacks[code.to_sym] = f
+       end
+     end
+     # execute a fetch task using the browser
+     def fetch(url,redirects=0,depth=0)
+       puts "reached the max depth" if depth > @max_depth
+       puts "reached the max redirects: redirects:#{redirects},max_redirects:#{@max_redirects}" if redirects > @max_redirects
+       @logger.log_msg( "reached the max depth:#{url}") if depth > @max_depth
+       @logger.log_msg( "reached the max redirects:#{url}") if redirects > @max_redirects
+       return 1 if depth > @max_depth
+       return 1 if redirects > @max_redirects
+       resp=@browser.get(URI::parse(url))
+       do_callbacks(url,resp)
+       if resp.redirect?
+         new_url=gen_full_url(url,resp["Location"])
+         begin
+           @urlStorage << new_url if urlCanBeCralwered?(new_url)
+           @relationStorage.save(url,new_url) if urlCanBeCralwered?(new_url)
+         rescue
+         end
+         fetch(new_url,redirects+1,depth)
+         return
+       end
+       if !resp.success?
+         @logger.log_msg("url fetch failed:#{url}") unless resp.success?
+         @urlStorage.error(url) if @urlStorage.respond_to? :error
+         return false
+       end
+       content=resp.body
+       #content=@tidy.tidy(content)
+
+       puts "content nil:#{url}" if $DEBUG and content.nil?
+       return if content.nil?
+       @contentStorage.add(url,content)
+       @urlStorage.visited(url)
+       allUrls=GrabLinksByRegex(content,url)
+       allUrls.delete_if { |u| !isGoodUrl(u) }
+       allUrls.each { |u|
+         begin
+           @urlStorage << u if urlCanBeCralwered?(u)
+           @relationStorage.save(url,u) if urlCanBeCralwered?(u)
+         rescue
+         end
+       }
+     end
+     # whether the url looks like an HTML page worth crawling
+     def isGoodUrl(url)
+       @logger.log_msg "warning: url #{url} is too long to store." if url.length > @conf["url_max_length"]
+       return false if url.length > @conf["url_max_length"]
+       return false if (url =~ /\.gif$/)
+       return false if (url =~ /\.jpg$/)
+       return false if (url =~ /\.png$/)
+       return false if (url =~ /\.js$/)
+       return false if (url =~ /\.css$/)
+       if @can_leave_domain
+         return true
+       else
+         return true if urlInDomain(url)
+         false
+       end
+     end
+     # whether the url stays inside the allowed domain
+     def urlInDomain(url)
+       return true if (url =~ @same_domain_regexp)
+       false
+     end
+
+     # get the base of a url
+     #@param url:: url
+     def getBaseUrl(url)
+       return url if url =~ /\/$/
+       base=File.dirname(url)+"/"
+       if base == "http:/"
+         if url =~ /\/$/
+           base=url
+         else
+           base=url+"/"
+         end
+       end
+       base
+     end
+     # get the scheme-and-host part of a url
+     #@param u:: url
+     def getDomainField(u)
+       u=u.sub(/http:\/\//i,"")
+       ar=u.split("/")
+       domain=ar.shift
+       return "http://"+domain
+     end
+     # get the links from html content
+     #@param u:: url of the content
+     #@return:: Array
+     def GrabLinksByW3c(html,u)
+       base=u.split("?").shift
+       urls=[]
+       url=""
+       BeautifulStoneSoup.new(html).find_all('a').each do |tag|
+         if tag["href"] =~ /http:/i
+           url = tag['href'] if tag['href']
+         elsif tag["href"] =~ /^\//
+           url = getDomainField(u) + tag["href"]
+         else
+           url = base+tag['href'] if tag['href']
+         end
+         url.gsub!(/\&amp;/,'&')
+         urls << url
+       end
+       return urls
+     end
+     # get the links of html content by regexp
+     #@param html:: html content
+     #@param u:: original url of the html document
+     #@return:: Array
+     def GrabLinksByRegex(html,u)
+       base_url = (html.scan(/<base\s+href="(.*?)"/i).flatten)[0]
+       u=base_url unless base_url.nil?
+       base=getBaseUrl(u)
+       urls=[]
+       url=""
+       hrefs=scan_html_relative_links(html,base)
+       hrefs.each { |w|
+         next unless w
+         next if (w =~ /^#/)
+         next if w =~ /^mailto:/i
+         next if w =~ /^javascript:/i
+         w.gsub!(/([^#]+)#(.*)/,'\1') # drop everything after the '#' character
+         if w =~ /([a-zA-Z]{3,6}):\/\//i
+           url = w
+         elsif w =~ /^\//
+           url = getDomainField(base) + w
+         else
+           url = base + w
+         end
+         url.gsub!(/\&amp;/,"&")
+         url=fixUrlValidate(url)
+         urls << url
+       }
+       urls
+     end
+     # build the full url of a link relative to base_uri
+     def gen_full_url(base_uri,link)
+       base=getBaseUrl(base_uri)
+       link.gsub!("./","")
+       link.gsub!(/([^#]+)#(.*)/,'\1')
+       if !(link =~ /([a-zA-Z]{3,6}):\/\//i).nil?
+         url = link
+       elsif link =~ /^\//
+         url = getDomainField(base)+link
+       else
+         url = base + link
+       end
+       url
+     end
+     # normalize a url such as http://www.sohu.com/../i/fin/./../index.html
+     # into http://www.sohu.com/i/index.html
+     def fixUrlValidate(u)
+       a=u.split("/")
+       n=[]
+       a.each { |v|
+         # skip "." segments and resolve ".." by dropping the previous segment
+         next if v=="."
+         n.pop if v==".." and n.length>3
+         next if v==".."
+         n.push(v)
+       }
+       l=n.join("/")
+       @badUrlSymbols.each { |k|
+         l.gsub! k[0],k[1]
+       }
+       l
+     end
+     # return all of the links found in the html
+     def scan_html_relative_links(html,base_url)
+       links=[]
+       R_links_regexps.each { |r|
+         matches=html.scan(r)
+         matches.each { |m|
+           links.push m[0] if m[3].nil? and m[0] != ""
+         }
+       }
+       links
+     end
+     def scan_html_simple_links(html,base_url)
+       r=Regexp.new('href=[\'\"]([^\'^\"^\s]*)[\'\"]')
+       links=[]
+       matches=html.scan(r)
+       matches.each { |m|
+         links.push m[0]
+       }
+       links
+     end
+   end
+ end
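
Putting it together: a rough wiring sketch of the Spider, not taken from the gem itself, assuming the gem's other files (HtmlTidy, OptParser, RobotRules, SiteLocker) are already loaded. The configuration keys mirror the ones read in initialize; TinyResponse, TinyBrowser and the in-memory stand-ins for the url, content and relation storages and the logger are hypothetical placeholders for the backends the gem provides elsewhere:

    require "net/http"
    require "uri"

    # A thin response wrapper giving the predicates Spider expects.
    class TinyResponse
      def initialize(resp); @resp = resp; end
      def code;      @resp.code;  end
      def body;      @resp.body;  end
      def [](k);     @resp[k];    end
      def success?;  @resp.is_a?(Net::HTTPSuccess);     end
      def redirect?; @resp.is_a?(Net::HTTPRedirection); end
    end
    class TinyBrowser
      def get(uri); TinyResponse.new(Net::HTTP.get_response(uri)); end
    end

    url_queue = []                      # Spider pushes with << and takes with pop
    def url_queue.visited(url); end     # called after a page is fetched
    content_store  = Object.new
    def content_store.add(url, body); end
    relation_store = Object.new
    def relation_store.save(from, to); end
    logger = Object.new
    def logger.log_msg(msg, level = "INFO"); puts "#{level}: #{msg}"; end

    conf = {
      "can_leave_domain"   => false,
      "max_depth"          => 2,
      "max_redirects"      => 3,
      "save_path"          => "/tmp/rspider",
      "buckets"            => 1,
      "source"             => "seed",
      "threads"            => 1,
      "same_domain_regexp" => "example\\.com",
      "agent"              => "Rspider/0.8.4",
      "url_max_length"     => 1024
    }
    spider = Rspider::Spider.new(conf)
    spider.urlStorage      = url_queue
    spider.contentStorage  = content_store
    spider.relationStorage = relation_store
    spider.logger          = logger
    spider.browser         = TinyBrowser.new
    spider.start_from("http://www.example.com/")
    spider.run(10)   # note: run exits the process when the queue empties
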