rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
+ module Rspider
+   class OptParser < Hash
+     def initialize(args)
+       super()
+       self[:conf]=""
+       self[:env]="TEST"
+       self[:debug]="on"
+       opts=OptionParser.new do |opt|
+         opt.banner="Usage:#$0 [options]"
+
+         opt.on("-c","--conf [STRING]",
+                'The Configuration File') do |confFile|
+           if confFile.nil?
+             puts "Configuration not specified"
+             exit
+           end
+           confFile.chomp!
+           if confFile == ""
+             puts "No configuration file given"
+             exit
+           end
+           if !File.file?(confFile)
+             puts "Configuration #{confFile} does not exist"
+             exit
+           end
+           self[:conf]=confFile
+         end
+         opt.on("-e","--env [STRING]",
+                'The Environment') do |env|
+           if env.upcase == "PRO"
+             env="PRO"
+           else
+             env="TEST"
+           end
+           self[:env]=env
+         end
+         opt.on("-d","--debug [on|off]",'show debug messages') do |d|
+           if d.upcase == "ON"
+             d="on"
+           else
+             d="off"
+           end
+           self[:debug]=d
+         end
+         opt.on("-h","--help",'display this help and exit') do
+           puts opt
+           exit
+         end
+       end
+       opts.parse!(args)
+     end
+   end
+ end
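
For reference, a minimal usage sketch of Rspider::OptParser. It assumes the class above is already loaded and that Ruby's standard optparse library is required (this file itself does not require it); the option values are hypothetical:

require "optparse"
# (assumes Rspider::OptParser, defined above, is already loaded)

options = Rspider::OptParser.new(["-e", "pro", "-d", "off"])
options[:env]    #=> "PRO"
options[:debug]  #=> "off"
options[:conf]   #=> "" (default; -c must name an existing file or the parser exits)
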
@@ -0,0 +1,92 @@
+ # Understand robots.txt.
+
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+
+ require "uri"
+ module Rspider
+   # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+   class RobotRules
+     # user_agent is a string like 'Mozilla/IE6.0' or similar.
+     # It will be sent to the site in the request header:
+     # ============================
+     # GET /robots.txt HTTP/1.1
+     # HOST www.example.com
+     # User-Agent:#{user_agent}
+     # ===========================
+     def initialize( user_agent )
+       @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
+       @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+     end
+     # parse the data of a robots.txt file
+     def parse( text_uri, robots_data )
+       begin
+         uri = URI.parse(text_uri)
+       rescue Exception=>e
+         puts "-"*80
+         puts "\n"*3
+         puts e
+         puts "uri:#{text_uri}"
+       end
+       location = "#{uri.host}:#{uri.port}"
+       @rules.delete(location)
+
+       rules = robots_data.split(/[\015\012]+/).map do |rule|
+         rule.sub(/\s*#.*$/, "")
+       end
+       anon_rules = Array.new
+       my_rules = Array.new
+       current = anon_rules
+       rules.each do |rule|
+         case rule
+         when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+           break unless my_rules.empty?
+
+           current = if $1 == "*"
+             anon_rules
+           elsif $1.downcase.index(@user_agent)
+             my_rules
+           else
+             nil
+           end
+         when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+           next if current.nil?
+
+           if $1.empty?
+             current << nil
+           else
+             disallow = URI.parse($1)
+
+             next unless disallow.scheme.nil? or disallow.scheme ==
+                         uri.scheme
+             next unless disallow.port.nil? or disallow.port == uri.port
+             next unless disallow.host.nil? or
+                         disallow.host.downcase == uri.host.downcase
+
+             disallow = disallow.path
+             disallow = "/" if disallow.empty?
+             disallow = "/#{disallow}" unless disallow[0] == ?/
+
+             current << disallow
+           end
+         end
+       end
+
+       @rules[location] = if my_rules.empty?
+         anon_rules.compact
+       else
+         my_rules.compact
+       end
+     end
+     # decide whether we may crawl the url
+     def allowed?( text_uri )
+       uri = URI.parse(text_uri)
+       location = "#{uri.host}:#{uri.port}"
+       path = uri.path
+
+       return true unless %w{http https}.include?(uri.scheme)
+
+       not @rules[location].any? { |rule| path.index(rule) == 0 }
+     end
+   end
+ end
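
A small, self-contained sketch of how this RobotRules class behaves, assuming the class above is loaded; the robots.txt content below is made up for illustration:

require "uri"
# (assumes Rspider::RobotRules, defined above, is already loaded)

rules = Rspider::RobotRules.new("Rspider/0.8.4")
robots_txt = "User-Agent: *\nDisallow: /private/\n"
rules.parse("http://www.example.com/robots.txt", robots_txt)

rules.allowed?("http://www.example.com/index.html")      #=> true
rules.allowed?("http://www.example.com/private/a.html")  #=> false
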
@@ -0,0 +1,45 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ module Rspider
+ =begin rdoc
+ This class tracks visits to a site so the spider avoids hitting the same site too frequently.
+ =end
+
+   class SiteLocker
+     attr_accessor :site,:time,:max
+     # initialization
+     #
+     def initialize(max,site="www.example.com")
+       @visits=Hash.new
+       @max=max
+       @site=site
+       @time=5
+     end
+     # record that we have just visited the site
+     def visitedSite()
+       t=Time.now.to_i
+       @visits[t]=@visits[t].to_i+1
+     end
+     # may we visit the site again?
+     def canVisitSite?()
+       t=Time.now.to_i-@time
+       @visits.delete_if{|k,v|
+         k<(t-@time)
+       }
+       values=0
+       @visits.values.each{|v|
+         values = values +v
+       }
+       return values<@max
+     end
+     # dump the data structure to a string
+     def to_s
+       temp=@visits.collect{|k,v| "visits[#{k}]\t=>\t#{v}"}
+       temp.join("\n")
+     end
+   end
+ end
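
As a usage sketch (assuming the class above is loaded): a SiteLocker built with max=3 allows at most three visits inside its sliding time window, after which canVisitSite? returns false until old entries age out:

locker = Rspider::SiteLocker.new(3, "www.example.com")

5.times do |i|
  if locker.canVisitSite?
    locker.visitedSite            # record the visit
    puts "visit #{i}: allowed"
  else
    puts "visit #{i}: throttled"  # over the limit for the current window
  end
end
#=> visits 0-2 allowed, 3-4 throttled (when run within the same few seconds)
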
@@ -0,0 +1,324 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ require "uri"
+ require "open-uri"
+ require "net/http"
+ require "net/https"
+ require "cgi"
+
+ R_links_regexps=[/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i,
+   /(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]+(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i ]
+
+
+ module Rspider
+   # This class is the main class of the Rspider library.
+   # It crawls sites and stores urls and documents.
+   class Spider
+     attr_accessor :urls,:can_leave_domain,:threads,:max_depth,:buckets,:source,:same_domain_regexp,:urlStorage,:logger,:browser,:contentStorage,:siteLockers,:relationStorage
+     # Param conf must be a hash from ConfParser
+     def initialize(conf)
+       @can_leave_domain=conf["can_leave_domain"]
+       @max_depth=conf["max_depth"]
+       @max_redirects=conf["max_redirects"]
+       @save_path=conf["save_path"]
+       @buckets=conf["buckets"]
+       @source=conf["source"]
+       @threads=conf["threads"]
+       @same_domain_regexp=Regexp.new(conf["same_domain_regexp"])
+       @conf=conf
+       @accepted_formats="text/html,text/xml,text/plain"
+       @logger=nil
+       @contentStorage=nil
+       @tidy=HtmlTidy.new
+       @setup=nil
+       @teardown=nil
+       @callbacks=Hash.new
+       @siteLockers=Hash.new
+       @robotRules=RobotRules.new(@conf["agent"])
+       @visitedRules=[]
+       @badUrlSymbols=["@","<",">","(",")","$","*","[","]"]
+       @badUrlSymbols.collect!{
+         |k|
+         [k,CGI.escape(k)]
+       }
+     end
+     # main entrance of the class
+     def start_from(url)
+       @logger.log_msg "start url can't be crawled!" if ( $DEBUG and !urlCanBeCralwered?(url))
+       @urlStorage<<url if urlCanBeCralwered?(url)
+     end
+     def run(max_times=16)
+       j=0
+       while(true) do
+         # $tracker.stop($stdout) if interrupted
+         return if j > max_times
+         j = j + 1
+         url=@urlStorage.pop
+         # puts "fetched url:#{url}" if $DEBUG
+         puts "thread ended:no more urls" if ( url.nil? or url=="failed") and $DEBUG
+         @logger.log_msg("error:no more urls","ERROR") if (url.nil? or url=="failed")
+         exit if url=="failed"
+         exit if url.nil?
+         begin
+           uri=URI::parse(url)
+           domain=uri.host.to_s+":"+uri.port.to_s
+           @siteLockers[domain]=SiteLocker.new(12,domain) unless @siteLockers.has_key?(domain)
+           if @siteLockers[domain].canVisitSite?
+             fetch(url,0) if urlCanBeCralwered?(url)
+             #debug $mem_profiler.add url
+             #debug $mem_profiler.report
+             @siteLockers[domain].visitedSite
+           else
+             sleep 0.5
+             next
+           end
+         rescue Exception => e
+           puts "error:Exception #{e} " if $DEBUG
+         end
+       end
+     end
+     # whether the url can be crawled, obeying the rules from robots.txt
+     def urlCanBeCralwered?(url)
+       uri=URI::parse(url)
+       robot_url="#{uri.scheme}://#{uri.host}:#{uri.port}/robots.txt"
+       if !@visitedRules.include?(robot_url)
+         begin
+           content=@browser.get(URI::parse(robot_url))
+           return true if content.code == "404"
+           can_visit=@robotRules.parse(robot_url,content.body)
+           return can_visit
+         rescue URI::InvalidURIError => invalidUri
+           @logger.log_msg("invalid uri:#{url}")
+           return false
+         rescue Exception => e
+           puts "something went wrong!"
+           return true
+         end
+         @visitedRules << robot_url
+       end
+       @robotRules.allowed?(url)
+     end
+     # apply the registered callbacks
+     def do_callbacks(url, resp)
+       cbs = [@callbacks[:every],
+              resp.success? ? @callbacks[:success] : @callbacks[:failure],
+              @callbacks[resp.code.to_i]]
+       cbs.each do |cb|
+         cb.call(url, resp ) if cb
+       end
+     end
+     def setup(p = nil, &block)
+       @setup = p ? p : block
+     end
+
+     # Run last, once for each page. Given the URL as a string.
+     def teardown(p = nil, &block)
+       @teardown = p ? p : block
+     end
+
+     def on(code, p = nil, &block)
+       f = p ? p : block
+       case code
+       when Fixnum
+         @callbacks[code] = f
+       else
+         @callbacks[code.to_sym] = f
+       end
+     end
+     # execute the fetch task using the browser
+     def fetch(url,redirects=0,depth=0)
+       puts "reached the max depth" if depth > @max_depth
+       puts "reached the max redirects:redirects:#{redirects},max_redirects:#{@max_redirects}" if redirects > @max_redirects
+       @logger.log_msg( "reached the max depth:#{url}") if depth > @max_depth
+       @logger.log_msg( "reached the max redirects:#{url}") if redirects > @max_redirects
+       return 1 if depth > max_depth
+       return 1 if redirects > @max_redirects
+       resp=@browser.get(URI::parse(url))
+       do_callbacks(url,resp)
+       if resp.redirect?
+         new_url=gen_full_url(url,resp["Location"])
+         begin
+           @urlStorage<< new_url if urlCanBeCralwered?(new_url)
+           @relationStorage.save(url,new_url) if urlCanBeCralwered?(new_url)
+         rescue
+         end
+         fetch(new_url,redirects+1,depth)
+         return
+       end
+       if !resp.success?
+         @logger.log_msg("url fetch failed:#{url}") unless resp.success?
+         @urlStorage.error(url) if @urlStorage.respond_to? :error
+         return false
+       end
+       content=resp.body
+       #content=@tidy.tidy(content)
+
+       @contentStorage.add(url,content)
+       puts "content nil:#{url}" if $DEBUG and content.nil?
+       return if content.nil?
+       @urlStorage.visited(url)
+       allUrls=GrabLinksByRegex(content,url)
+       allUrls.delete_if { |u| !isGoodUrl(u) }
+       allUrls.each{|u|
+         begin
+           @urlStorage<< u if urlCanBeCralwered?(u)
+           @relationStorage.save(url,u) if urlCanBeCralwered?(u)
+         rescue
+         end
+       }
+     end
+     # whether the url points at an HTML page worth storing
+     def isGoodUrl(url)
+       @logger.log_msg "warning:url #{url} is too long to store." if url.length > @conf["url_max_length"]
+       return false if url.length > @conf["url_max_length"]
+       return false if (url =~ /\.gif$/)
+       return false if (url =~ /\.jpg$/)
+       return false if (url =~ /\.png$/)
+       return false if (url =~ /\.js$/)
+       return false if (url =~ /\.css$/)
+       if @can_leave_domain
+         return true
+       else
+         return true if urlInDomain(url)
+         false
+       end
+     end
+     # whether the url stays inside the domain
+     def urlInDomain(url)
+       return true if (url =~ @same_domain_regexp)
+       false
+     end
+
+     # get the base of a url
+     #@param url:: the url
+     def getBaseUrl(url)
+       return url if url =~ /\/$/
+       base=File.dirname(url)+"/"
+       if base == "http:/"
+         if url=~ /\/$/
+           base=url
+         else
+           base=url+ "/"
+         end
+       end
+       base
+     end
+     # Get the domain part of a url
+     #@param u:: url
+     def getDomainField(u)
+       u.sub!(/http:\/\//i,"")
+       ar=u.split("/")
+       domain=ar.shift
+       return "http://"+domain
+     end
+     # Get the links from html content
+     #@param u:: url of the content
+     #@return:: Array
+     def GrabLinksByW3c(html,u)
+       base=u.split("?").shift
+       urls=[]
+       url=""
+       BeautifulStoneSoup.new(html).find_all('a').each do |tag|
+         if tag["href"] =~ /http:/i
+           url = tag['href'] if tag['href']
+         elsif tag["href"] =~ /^\//
+           url = getDomainField(u) + tag["href"]
+         else
+           url = base+tag['href'] if tag['href']
+         end
+         url.gsub!(/\&amp;/,'&')
+         urls << url
+       end
+       return urls
+     end
+     # Get the links of html content by regexp
+     #@param html:: html content
+     #@param u:: original url of the html document
+     #@return:: Array
+     def GrabLinksByRegex(html,u)
+       base_url = (html.scan(/<base\s+href="(.*?)"/i).flatten)[0]
+       u=base_url unless base_url.nil?
+       base=getBaseUrl(u)
+       urls=[]
+       url=""
+       hrefs=scan_html_relative_links(html,base)
+       hrefs.each {|w|
+         next unless w
+         next if (w =~ /^#/)
+         next if w =~ /^mailto:/i
+         next if w =~ /^javascript:/i
+         w.gsub!(/([^#]+)#(.*)/,'\1') # remove the fragment after the '#'
+         if w =~ /([a-zA-Z]{3,6}):\/\//i
+           url = w
+         elsif w =~ /^\//
+           url = getDomainField(base) + w
+         else
+           url = base + w
+         end
+         url.gsub!(/\&amp;/,"&")
+         url=fixUrlValidate(url)
+         urls << url
+       }
+       urls
+     end
+     # build the full url for a link found on base_uri
+     def gen_full_url(base_uri,link)
+       base=getBaseUrl(base_uri)
+       link.gsub!("./","")
+       link.gsub!(/([^#]+)#(.*)/,'\1')
+       if !(link =~ /([a-zA-Z]{3,6}):\/\//i).nil?
+         url = link
+       elsif link =~ /^\//
+         url = getDomainField(base)+link
+       else
+         url = base + link
+       end
+       url
+     end
+     # change a url like http://www.sohu.com/../i/fin/./../index.html
+     # into http://www.sohu.com/i/index.html
+     def fixUrlValidate(u)
+       a=u.split("/")
+       n=[]
+       a.each{
+         |v|
+         next if v=="."
+         n.pop if v==".." and n.length>3
+         next if v==".."
+         n.push(v)
+       }
+       l=n.join("/")
+       @badUrlSymbols.each{|k|
+         l.gsub! k[0],k[1]
+       }
+       l
+     end
+     # return all the links
+     def scan_html_relative_links(html,base_url)
+       links=[]
+       R_links_regexps.each{ |r|
+         matches=html.scan(r)
+         matches.each{ |m|
+           links.push m[0] if m[3].nil? and m[0] != ""
+         }
+       }
+       links
+     end
+     def scan_html_simple_links(html,base_url)
+       r=Regexp.new('href=[\'\"]([^\'^\"^\s]*)[\'\"]')
+       links=[]
+       matches=html.scan(r)
+       matches.each{ |m|
+         links.push m[0]
+       }
+       links
+     end
+   end
+ end
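
fixUrlValidate is the piece that turns a link such as http://www.sohu.com/../i/fin/./../index.html into http://www.sohu.com/i/index.html and percent-escapes characters the storage layer cannot handle. Constructing a full Spider here would require the HtmlTidy, browser, and storage classes that are not part of this diff, so the following is a standalone sketch of the same path-normalization idea, not a call into the class above:

require "cgi"

# mirror of @badUrlSymbols: each raw character paired with its escaped form
BAD_URL_SYMBOLS = ["@", "<", ">", "(", ")", "$", "*", "[", "]"].map { |k| [k, CGI.escape(k)] }

def normalize_url(u)
  parts = []
  u.split("/").each do |v|
    next if v == "."                           # drop "current directory" segments
    parts.pop if v == ".." && parts.length > 3 # ".." removes the previous path segment,
    next if v == ".."                          # but never eats into "http://host"
    parts.push(v)
  end
  url = parts.join("/")
  BAD_URL_SYMBOLS.each { |raw, escaped| url.gsub!(raw, escaped) }
  url
end

normalize_url("http://www.sohu.com/../i/fin/./../index.html")
#=> "http://www.sohu.com/i/index.html"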