rspider 0.8.4
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
+++ data/lib/rspider/OptParser.rb
@@ -0,0 +1,53 @@
require "optparse"

module Rspider
  # Parses command-line arguments into a Hash of options (:conf, :env, :debug).
  class OptParser < Hash
    def initialize(args)
      super()
      self[:conf]  = ""
      self[:env]   = "TEST"
      self[:debug] = "on"
      opts = OptionParser.new do |opt|
        opt.banner = "Usage: #$0 [options]"

        opt.on("-c", "--conf [STRING]",
               'The configuration file') do |confFile|
          if confFile.nil?
            puts "Configuration not specified"
            exit
          end
          confFile.chomp!
          if confFile == ""
            puts "No configuration file given"
            exit
          end
          unless File.file?(confFile)
            puts "Configuration #{confFile} does not exist"
            exit
          end
          self[:conf] = confFile
        end
        opt.on("-e", "--env [STRING]",
               'The environment') do |env|
          env = (env.to_s.upcase == "PRO") ? "PRO" : "TEST"
          self[:env] = env
        end
        opt.on("-d", "--debug [on|off]", 'Show debug messages') do |d|
          d = (d.to_s.upcase == "ON") ? "on" : "off"
          self[:debug] = d
        end
        opt.on("-h", "--help", 'Display this help and exit') do
          puts opt
          exit
        end
      end
      opts.parse!(args)
    end
  end
end
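A minimal usage sketch for the class above (not part of the gem itself; it assumes that `require "rspider"` makes the class available, as bin/main.rb presumably does):

# Hypothetical example: parse ARGV, e.g. "ruby main.rb -c conf/local.conf -e PRO -d off"
require "rspider"

options = Rspider::OptParser.new(ARGV)
puts "conf  : #{options[:conf]}"    # path passed with -c
puts "env   : #{options[:env]}"     # "PRO" or "TEST"
puts "debug : #{options[:debug]}"   # "on" or "off"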
+++ data/lib/rspider/RobotRules.rb
@@ -0,0 +1,92 @@
# Understand robots.txt.
#
# Created by James Edward Gray II on 2006-01-31.
# Copyright 2006 Gray Productions. All rights reserved.

require "uri"

module Rspider
  # Based on Perl's WWW::RobotRules module, by Gisle Aas.
  class RobotRules
    # user_agent is a string like 'Mozilla/IE6.0', i.e. what the spider
    # sends to the site in the request header:
    # ============================
    # GET /robots.txt HTTP/1.1
    # Host: www.example.com
    # User-Agent: #{user_agent}
    # ============================
    def initialize( user_agent )
      @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
      @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
    end

    # Parse the contents of a robots.txt file fetched from text_uri.
    def parse( text_uri, robots_data )
      begin
        uri = URI.parse(text_uri)
      rescue Exception => e
        puts "-" * 80
        puts "\n" * 3
        puts e
        puts "uri:#{text_uri}"
        return    # without a parsable URI there is nothing to record
      end
      location = "#{uri.host}:#{uri.port}"
      @rules.delete(location)

      rules = robots_data.split(/[\015\012]+/).map do |rule|
        rule.sub(/\s*#.*$/, "")
      end
      anon_rules = Array.new
      my_rules   = Array.new
      current    = anon_rules
      rules.each do |rule|
        case rule
        when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
          break unless my_rules.empty?

          current = if $1 == "*"
                      anon_rules
                    elsif $1.downcase.index(@user_agent)
                      my_rules
                    else
                      nil
                    end
        when /^\s*Disallow\s*:\s*(.*?)\s*$/i
          next if current.nil?

          if $1.empty?
            current << nil
          else
            disallow = URI.parse($1)

            next unless disallow.scheme.nil? or disallow.scheme == uri.scheme
            next unless disallow.port.nil?   or disallow.port   == uri.port
            next unless disallow.host.nil?   or
                        disallow.host.downcase == uri.host.downcase

            disallow = disallow.path
            disallow = "/" if disallow.empty?
            disallow = "/#{disallow}" unless disallow[0] == ?/

            current << disallow
          end
        end
      end

      @rules[location] = if my_rules.empty?
                           anon_rules.compact
                         else
                           my_rules.compact
                         end
    end

    # Decide whether we are allowed to crawl the url.
    def allowed?( text_uri )
      uri      = URI.parse(text_uri)
      location = "#{uri.host}:#{uri.port}"
      path     = uri.path

      return true unless %w{http https}.include?(uri.scheme)

      not @rules[location].any? { |rule| path.index(rule) == 0 }
    end
  end
end
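A short, hypothetical example of how the class above is driven (the gem's Spider feeds it the fetched robots.txt body in the same way):

# Hypothetical example: register rules for a host, then query them.
rules = Rspider::RobotRules.new("Rspider/0.8.4")
robots_txt = <<-ROBOTS
User-Agent: *
Disallow: /private/
ROBOTS
rules.parse("http://www.example.com/robots.txt", robots_txt)
rules.allowed?("http://www.example.com/private/page.html")  # => false
rules.allowed?("http://www.example.com/index.html")         # => true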
+++ data/lib/rspider/SiteLocker.rb
@@ -0,0 +1,45 @@
=begin rdoc
Author:: aragorn(xurenlu@gmail.com)
URL:: http://www.162cm.com/
Version:: 1.0.0
License:: LGPL
=end
module Rspider
=begin rdoc
This class tracks visits to a site so the spider does not hit the same site too frequently.
=end
  class SiteLocker
    attr_accessor :site, :time, :max

    # max:: maximum number of visits allowed inside the time window (@time seconds)
    def initialize(max, site = "www.example.com")
      @visits = Hash.new
      @max    = max
      @site   = site
      @time   = 5
    end

    # Record one visit to the site at the current second.
    def visitedSite()
      t = Time.now.to_i
      @visits[t] = @visits[t].to_i + 1
    end

    # May we visit the site again, or is the window already full?
    def canVisitSite?()
      t = Time.now.to_i
      @visits.delete_if { |k, v| k < (t - @time) }   # drop entries outside the window
      values = 0
      @visits.values.each { |v| values = values + v }
      return values < @max
    end

    # Dump the internal data structure as a string.
    def to_s
      temp = @visits.collect { |k, v| "visits[#{k}]\t=>\t#{v}" }
      temp.join("\n")
    end
  end
end
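A hypothetical example of the rate limiting the class above provides (the Spider below creates one SiteLocker per host with max=12):

# Hypothetical example: allow at most 3 visits to a host per 5-second window.
locker = Rspider::SiteLocker.new(3, "www.example.com")
5.times do
  if locker.canVisitSite?
    locker.visitedSite
    puts "visited #{locker.site}"
  else
    puts "rate limit reached, backing off"
    sleep 1
  end
end
puts locker.to_s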
+++ data/lib/rspider/Spider.rb
@@ -0,0 +1,324 @@
=begin rdoc
Author:: aragorn(xurenlu@gmail.com)
URL:: http://www.162cm.com/
Version:: 1.0.0
License:: LGPL
=end
require "uri"
require "open-uri"
require "net/http"
require "net/https"
require "cgi"

# Regular expressions used to pull links out of fetched HTML (href attributes,
# frame src, window.location, meta refresh and window.open targets).
R_links_regexps=[/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i,
  /(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
  /(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
  /(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
  /(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]+(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i ]

module Rspider
  # This is the main class of the Rspider library.
  # It crawls sites and stores urls and documents.
  class Spider
    attr_accessor :urls, :can_leave_domain, :threads, :max_depth, :buckets, :source, :same_domain_regexp, :urlStorage, :logger, :browser, :contentStorage, :siteLockers, :relationStorage

    # Param conf must be a hash produced by ConfParser.
    def initialize(conf)
      @can_leave_domain   = conf["can_leave_domain"]
      @max_depth          = conf["max_depth"]
      @max_redirects      = conf["max_redirects"]
      @save_path          = conf["save_path"]
      @buckets            = conf["buckets"]
      @source             = conf["source"]
      @threads            = conf["threads"]
      @same_domain_regexp = Regexp.new(conf["same_domain_regexp"])
      @conf               = conf
      @accepted_formats   = "text/html,text/xml,text/plain"
      @logger             = nil
      @contentStorage     = nil
      @tidy               = HtmlTidy.new
      @setup              = nil
      @teardown           = nil
      @callbacks          = Hash.new
      @siteLockers        = Hash.new
      @robotRules         = RobotRules.new(@conf["agent"])
      @visitedRules       = []
      @badUrlSymbols      = ["@", "<", ">", "(", ")", "$", "*", "[", "]"]
      @badUrlSymbols.collect! { |k| [k, CGI.escape(k)] }
    end

    # Main entry point: seed the url queue with a start url.
    def start_from(url)
      @logger.log_msg "start url can't be crawled!" if ( $DEBUG and !urlCanBeCralwered?(url) )
      @urlStorage << url if urlCanBeCralwered?(url)
    end

    def run(max_times=16)
      j = 0
      while(true) do
        # $tracker.stop($stdout) if interrupted
        return if j > max_times
        j = j + 1
        url = @urlStorage.pop
        # puts "fetched url:#{url}" if $DEBUG
        puts "thread ended: no more urls" if ( url.nil? or url == "failed" ) and $DEBUG
        @logger.log_msg("error: no more urls", "ERROR") if ( url.nil? or url == "failed" )
        exit if url == "failed"
        exit if url.nil?
        begin
          uri = URI::parse(url)
          domain = uri.host.to_s + ":" + uri.port.to_s
          @siteLockers[domain] = SiteLocker.new(12, domain) unless @siteLockers.has_key?(domain)
          if @siteLockers[domain].canVisitSite?
            fetch(url, 0) if urlCanBeCralwered?(url)
            #debug $mem_profiler.add url
            #debug $mem_profiler.report
            @siteLockers[domain].visitedSite
          else
            sleep 0.5
            next
          end
        rescue Exception => e
          puts "error: Exception #{e}" if $DEBUG
        end
      end
    end

    # Whether the url may be crawled, obeying the rules from the site's robots.txt.
    def urlCanBeCralwered?(url)
      uri = URI::parse(url)
      robot_url = "#{uri.scheme}://#{uri.host}:#{uri.port}/robots.txt"
      if !@visitedRules.include?(robot_url)
        begin
          content = @browser.get(URI::parse(robot_url))
          @visitedRules << robot_url
          return true if content.code == "404"
          @robotRules.parse(robot_url, content.body)
        rescue URI::InvalidURIError => invalidUri
          @logger.log_msg("invalid uri:#{url}")
          return false
        rescue Exception => e
          puts "something went wrong while fetching #{robot_url}!"
          return true
        end
      end
      @robotRules.allowed?(url)
    end

    # Apply the registered callbacks to a fetched page.
    def do_callbacks(url, resp)
      cbs = [@callbacks[:every],
             resp.success? ? @callbacks[:success] : @callbacks[:failure],
             @callbacks[resp.code.to_i]]
      cbs.each do |cb|
        cb.call(url, resp) if cb
      end
    end

    def setup(p = nil, &block)
      @setup = p ? p : block
    end

    # Run last, once for each page. Given the URL as a string.
    def teardown(p = nil, &block)
      @teardown = p ? p : block
    end

    def on(code, p = nil, &block)
      f = p ? p : block
      case code
      when Fixnum
        @callbacks[code] = f
      else
        @callbacks[code.to_sym] = f
      end
    end

    # Execute the fetch task using the browser.
    def fetch(url, redirects=0, depth=0)
      puts "hit the max depth" if depth > @max_depth
      puts "hit the max redirects: redirects:#{redirects}, max_redirects:#{@max_redirects}" if redirects > @max_redirects
      @logger.log_msg("hit the max depth:#{url}") if depth > @max_depth
      @logger.log_msg("hit the max redirects:#{url}") if redirects > @max_redirects
      return 1 if depth > @max_depth
      return 1 if redirects > @max_redirects
      resp = @browser.get(URI::parse(url))
      do_callbacks(url, resp)
      if resp.redirect?
        new_url = gen_full_url(url, resp["Location"])
        begin
          @urlStorage << new_url if urlCanBeCralwered?(new_url)
          @relationStorage.save(url, new_url) if urlCanBeCralwered?(new_url)
        rescue
        end
        fetch(new_url, redirects + 1, depth)
        return
      end
      if !resp.success?
        @logger.log_msg("url fetch failed:#{url}")
        @urlStorage.error(url) if @urlStorage.respond_to? :error
        return false
      end
      content = resp.body
      #content = @tidy.tidy(content)

      @contentStorage.add(url, content)
      puts "content nil:#{url}" if $DEBUG and content.nil?
      return if content.nil?
      @urlStorage.visited(url)
      allUrls = GrabLinksByRegex(content, url)
      allUrls.delete_if { |u| !isGoodUrl(u) }
      allUrls.each { |u|
        begin
          @urlStorage << u if urlCanBeCralwered?(u)
          @relationStorage.save(url, u) if urlCanBeCralwered?(u)
        rescue
        end
      }
    end

    # Whether the url looks like an HTML page worth crawling.
    def isGoodUrl(url)
      @logger.log_msg "warning: url #{url} is too long to store." if url.length > @conf["url_max_length"]
      return false if url.length > @conf["url_max_length"]
      return false if (url =~ /\.gif$/)
      return false if (url =~ /\.jpg$/)
      return false if (url =~ /\.png$/)
      return false if (url =~ /\.js$/)
      return false if (url =~ /\.css$/)
      if @can_leave_domain
        return true
      else
        return true if urlInDomain(url)
        false
      end
    end

    # Whether the url stays inside the allowed domain.
    def urlInDomain(url)
      return true if (url =~ @same_domain_regexp)
      false
    end

    # Get the base of a url.
    #@param url:: url
    def getBaseUrl(url)
      return url if url =~ /\/$/
      base = File.dirname(url) + "/"
      if base == "http:/"
        if url =~ /\/$/
          base = url
        else
          base = url + "/"
        end
      end
      base
    end

    # Get the domain part of a url, e.g. "http://www.example.com".
    #@param u:: url
    def getDomainField(u)
      u = u.sub(/http:\/\//i, "")
      ar = u.split("/")
      domain = ar.shift
      return "http://" + domain
    end

    # Get the links from html content using an HTML parser
    # (requires the BeautifulStoneSoup parser to be loaded elsewhere).
    #@param u:: url of the content
    #@return:: Array
    def GrabLinksByW3c(html, u)
      base = u.split("?").shift
      urls = []
      url  = ""
      BeautifulStoneSoup.new(html).find_all('a').each do |tag|
        if tag["href"] =~ /http:/i
          url = tag['href'] if tag['href']
        elsif tag["href"] =~ /^\//
          url = getDomainField(u) + tag["href"]
        else
          url = base + tag['href'] if tag['href']
        end
        url.gsub!(/\&amp;/, '&')
        urls << url
      end
      return urls
    end

    # Get the links of html content by regexp.
    #@param html:: html content
    #@param u:: original url of the html document
    #@return:: Array
    def GrabLinksByRegex(html, u)
      base_url = (html.scan(/<base\s+href="(.*?)"/i).flatten)[0]
      u = base_url unless base_url.nil?
      base = getBaseUrl(u)
      urls = []
      url  = ""
      hrefs = scan_html_relative_links(html, base)
      hrefs.each { |w|
        next unless w
        next if (w =~ /^#/)
        next if w =~ /^mailto:/i
        next if w =~ /^javascript:/i
        w.gsub!(/([^#]+)#(.*)/, '\1')   # remove everything after the char '#'
        if w =~ /([a-zA-Z]{3,6}):\/\//i
          url = w
        elsif w =~ /^\//
          url = getDomainField(base) + w
        else
          url = base + w
        end
        url.gsub!(/\&amp;/, "&")
        url = fixUrlValidate(url)
        urls << url
      }
      urls
    end

    # Build the full url of a link found on base_uri.
    def gen_full_url(base_uri, link)
      base = getBaseUrl(base_uri)
      link.gsub!("./", "")
      link.gsub!(/([^#]+)#(.*)/, '\1')
      if !(link =~ /([a-zA-Z]{3,6}):\/\//i).nil?
        url = link
      elsif link =~ /^\//
        url = getDomainField(base) + link
      else
        url = base + link
      end
      url
    end

    # Normalize a url such as http://www.sohu.com/../i/fin/./../index.html
    # into http://www.sohu.com/i/index.html
    def fixUrlValidate(u)
      a = u.split("/")
      n = []
      a.each { |v|
        next if v == "."
        n.pop if v == ".." and n.length > 3
        next if v == ".."
        n.push(v)
      }
      l = n.join("/")
      @badUrlSymbols.each { |k|
        l.gsub! k[0], k[1]
      }
      l
    end

    # Return all the links found in the html.
    def scan_html_relative_links(html, base_url)
      links = []
      R_links_regexps.each { |r|
        matches = html.scan(r)
        matches.each { |m|
          links.push m[0] if m[3].nil? and m[0] != ""
        }
      }
      links
    end

    def scan_html_simple_links(html, base_url)
      r = Regexp.new('href=[\'\"]([^\'^\"^\s]*)[\'\"]')
      links = []
      matches = html.scan(r)
      matches.each { |m|
        links.push m[0]
      }
      links
    end
  end
end
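Finally, a hypothetical, self-contained wiring sketch for the Spider class above. The real bin/main.rb presumably builds its collaborators from ConfParser, MysqlUrlStorage, ContentStorage, Logger and Browser (their constructors are not part of this excerpt), so the sketch substitutes tiny in-memory stand-ins that implement only the methods Spider actually calls:

# Hypothetical example -- the Stub* classes below are stand-ins, not the gem's classes.
require "rspider"
require "net/http"
require "delegate"

class StubUrlQueue                      # plays the role of urlStorage: <<, pop, visited, error
  def initialize; @queue = []; @seen = {}; end
  def <<(url)
    return if @seen[url]
    @seen[url] = true
    @queue.push(url)
  end
  def pop;          @queue.shift || "failed"; end
  def visited(url); end
  def error(url);   end
end

class StubStore                         # plays the role of contentStorage / relationStorage
  def add(url, content); puts "stored #{content.to_s.length} bytes from #{url}"; end
  def save(from, to);    end
end

class StubLogger                        # plays the role of logger
  def log_msg(msg, level = "INFO"); warn "[#{level}] #{msg}"; end
end

class StubBrowser                       # plays the role of browser: get(uri) => response
  class Response < SimpleDelegator      # add the two predicates Spider expects
    def success?;  Net::HTTPSuccess     === __getobj__; end
    def redirect?; Net::HTTPRedirection === __getobj__; end
  end
  def get(uri)
    Response.new(Net::HTTP.get_response(uri))
  end
end

conf = {                                # keys read by Spider#initialize and isGoodUrl
  "agent"              => "Rspider/0.8.4",
  "can_leave_domain"   => false,
  "max_depth"          => 3,
  "max_redirects"      => 5,
  "save_path"          => "/tmp/rspider",
  "buckets"            => 1,
  "source"             => "http://www.example.com/",
  "threads"            => 1,
  "url_max_length"     => 1024,
  "same_domain_regexp" => "^http://www\\.example\\.com/",
}

spider = Rspider::Spider.new(conf)
spider.urlStorage      = StubUrlQueue.new
spider.contentStorage  = StubStore.new
spider.relationStorage = StubStore.new
spider.logger          = StubLogger.new
spider.browser         = StubBrowser.new
spider.start_from(conf["source"])
spider.run(20)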