rspider 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
|
8
|
+
require 'set'
|
9
|
+
require "thread"
|
10
|
+
module Rspider
|
11
|
+
=begin rdoc
|
12
|
+
This class acts as a manager that limits how many threads run at once,
|
13
|
+
which keeps the CPU load from getting too heavy.
|
14
|
+
=== Examples
|
15
|
+
* tp=ThreadPool.new(3)
|
16
|
+
*
|
17
|
+
* 1.upto(30) { |j|
|
18
|
+
* t=tp.dispatch(j) {|i|
|
19
|
+
* puts "thread ##{i} start"
|
20
|
+
* sleep rand
|
21
|
+
* puts "thread ##{i} end"
|
22
|
+
* }
|
23
|
+
* }
|
24
|
+
* tp.shutdown
|
25
|
+
=end
|
26
|
+
# Bounded thread pool: every dispatched block runs in its own thread, but at
# most +max_size+ blocks execute at the same time; the others wait their turn.
class ThreadPool
  # initialize method
  # Param: max_size: maximum number of blocks allowed to run at the same time
  def initialize(max_size)
    @pool = []          # threads currently running their block
    @pending = 0        # dispatched jobs that have not finished yet (waiting or running)
    @max_size = max_size
    @pool_mutex = Mutex.new
    @pool_cv = ConditionVariable.new
  end

  # Add a new job to the pool.
  #
  # A thread is created immediately, waits until fewer than +max_size+ jobs
  # are running, then yields +args+ to the given block. StandardErrors raised
  # by the block are passed to #exception instead of killing the thread.
  # Returns the Thread object.
  def dispatch(*args)
    # Count the job before the thread starts so that #shutdown, called right
    # after the last dispatch, cannot miss jobs that are still queued.
    @pool_mutex.synchronize { @pending += 1 }
    Thread.new do
      begin
        @pool_mutex.synchronize do
          while @pool.size >= @max_size
            print "pool full;waiting run #{args.join(',')}...\n" if $DEBUG
            @pool_cv.wait(@pool_mutex)
          end
          # Register while still holding the lock; otherwise several waiters
          # could pass the size check together and exceed max_size.
          @pool << Thread.current
        end
        begin
          yield(*args)
        rescue => e
          exception(Thread.current, e, *args)
        end
      ensure
        @pool_mutex.synchronize do
          @pool.delete(Thread.current)
          @pending -= 1
          # broadcast: both queued jobs and #shutdown wait on this condition
          # variable, so a single signal could wake the wrong waiter.
          @pool_cv.broadcast
        end
      end
    end
  end

  # Wait for all dispatched jobs (running and still queued) to finish.
  def shutdown
    @pool_mutex.synchronize do
      @pool_cv.wait(@pool_mutex) until @pending.zero?
    end
  end

  # Hook called when a job raises a StandardError.
  # thread::        the thread that was running the job
  # exception::     the error that was raised
  # original_args:: the arguments the job was dispatched with
  def exception(thread, exception, *original_args)
    puts "Exception in thread #{thread}:#{exception}"
  end
end
|
69
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
|
8
|
+
|
9
|
+
require "socket"
|
10
|
+
module Rspider
|
11
|
+
=begin rdoc
|
12
|
+
Base class of UrlDispatchers
|
13
|
+
=end
|
14
|
+
# Base class of UrlDispatchers: stores the connection settings and offers a
# one-request/one-line-reply helper (#getResponse). The URL bookkeeping
# methods are empty stubs here.
class UrlDispatcher
  #@param host:: Host ,like '127.0.0.1'
  #@param port:: Port ,like 1099
  #@param source:: value stored in @source; subclasses in this file append it to their commands
  def initialize(host, port, source)
    @host = host
    @port = port
    @source = source
    @visited = []
    @urlsToVisit = []
  end

  # Stub: no behavior in the base class.
  def AddUrl(url, site)
  end

  # Stub: no behavior in the base class.
  def FetchUnvisitedUrl(site)
  end

  # Stub: no behavior in the base class.
  def ReportWhenVisited(url, site)
  end

  # Stub: no behavior in the base class.
  def IfUrlVisited(url, site)
  end

  # Open a TCP connection to @host:@port, send +msg+ as one line and return
  # the first line of the reply (nil if the peer closes without replying).
  # The socket is always closed, even when sending or reading raises, so
  # repeated calls no longer leak file descriptors.
  def getResponse(msg)
    socket = TCPSocket.new(@host, @port)
    begin
      socket.puts(msg)
      socket.gets
    ensure
      socket.close
    end
  end
end
|
39
|
+
|
40
|
+
# Dispatcher client: turns queue-like calls into text commands that are sent
# through UrlDispatcher#getResponse. Every command ends with @source.
class UrlDispatcherClient < UrlDispatcher
  # Send an "add" command for +url+; returns the reply line.
  def <<(url)
    command = "add #{url} #{@source}"
    getResponse(command)
  end

  # Send a "fetch" command; returns the reply with surrounding whitespace
  # removed, or nil when there was no reply.
  def pop()
    reply = getResponse("fetch #{@source}")
    if reply.nil?
      nil
    else
      reply.strip
    end
  end

  # Send a "mark_visited" command for +url+; returns the reply line.
  def visited(url)
    command = "mark_visited #{url} #{@source}"
    getResponse(command)
  end

  # Not implemented: the body is empty, so this always returns nil.
  def visited?(url)
  end
end
|
56
|
+
|
57
|
+
# Placeholder subclass: adds nothing to UrlDispatcher.
class UrlDispatcherServer < UrlDispatcher; end
|
59
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
module Rspider
|
8
|
+
# Heuristic URL scoring. The total score is the sum of three partial scores
# (domain suffix, file extension, path depth).
class UrlScorer
  def initialize
  end

  # Calculate the score of a url: domain score + extension score + depth score.
  def UrlScorer.score(url)
    UrlScorer.scoreDomain(url) + UrlScorer.scoreExt(url) + UrlScorer.scoreDepth(url)
  end

  # Calculate the score of the domain.
  # Returns 5 for *.org, 4 for *.org.cn, 2 for other *.cn hosts, 3 otherwise.
  # The host is taken from the start of the url (optional http/https scheme,
  # up to the first "/", "?" or "#", port removed), so paths, queries and
  # ports can no longer be mistaken for the domain suffix.
  def UrlScorer.scoreDomain(url)
    rest = url.sub(/\Ahttps?:\/\//i, "")
    host = rest.split(/[\/?#]/, 2).first.to_s.sub(/:\d+\z/, "")
    return 5 if host =~ /\.org\z/
    return 4 if host =~ /\.org\.cn\z/
    return 2 if host =~ /\.cn\z/
    3
  end

  # Calculate the score of the extension name of the file.
  # Returns 2 for server-side script extensions, 3 for static HTML
  # (.htm/.html/.shtml), 4 for .doc/.ppt/.zip and 2 for anything else.
  def UrlScorer.scoreExt(url)
    # Drop query and fragment first so "x.php?file=a.doc" is scored as .php.
    resource = url.split(/[?#]/, 2).first.to_s
    case File.extname(resource)
    when ".jsp", ".asp", ".php", ".aspx", ".cgi", ".pl"
      2
    when ".htm", ".html", ".shtml"
      3
    when ".doc", ".ppt", ".zip"
      4
    else
      2
    end
  end

  # Calculate the score of the url's file depth.
  # depth is the number of "/"-separated parts once the scheme prefix is
  # removed; returns 1 when depth is greater than 4, else 5 - depth.
  def UrlScorer.scoreDepth(url)
    stripped = url.gsub(/https?:\/\//, "")
    depth = stripped.split("/").length
    return 1 if depth > 4
    5 - depth
  end
end
|
44
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Rspider
|
2
|
+
# In-memory URL store: keeps a list of URLs still to visit and a list of
# URLs already visited.
class UrlStorage
  attr_accessor :urls, :visitedUrls

  def initialize()
    @visitedUrls = []
    @urls = []
  end

  # Queue +u+ unless it is already queued or already visited.
  # Returns nil when the URL was ignored, otherwise the queue array.
  def <<(u)
    return nil if @visitedUrls.include?(u)
    return nil if @urls.include?(u)
    @urls << u
  end

  # True when +u+ has been marked as visited.
  def visited?(u)
    @visitedUrls.include?(u)
  end

  # Return a randomly chosen queued URL without removing it (call #visited
  # or #error to remove it); nil when the queue is empty.
  # Every queued URL, including the last one, can be chosen.
  def pop()
    return nil if @urls.empty?
    @urls[rand(@urls.length)]
  end

  # Move +u+ from the queue to the visited list; prints a notice when +u+
  # was not queued.
  def visited(u)
    @urls.delete(u) { puts "not found when delete #{u}" }
    @visitedUrls << u
  end

  # Drop +url+ from the queue without marking it as visited.
  def error(url)
    @urls.delete(url)
  end
end
|
28
|
+
# URL store backed by a memcache client: each URL is a key whose value is
# "N" (known, not visited) or "Y" (visited).
# NOTE(review): MemCache is not required anywhere in the visible source;
# presumably the memcache-client gem is loaded by the caller -- confirm.
class UrlStorageInMemcache
  def initialize()
    @cache = MemCache.new "localhost:11211", :namespace => "hel"
  end

  # Record +u+ as known but not visited. A URL that is already marked as
  # visited is left untouched (returns nil), matching UrlStorage#<<, so
  # re-discovering a link no longer resets its visited flag.
  def <<(u)
    return nil if visited?(u)
    @cache.set("#{u}", "N", 86400, true)
  end

  # Not implemented: the body is empty, so this always returns nil.
  def pop()
  end

  # Mark +u+ as visited.
  def visited(u)
    @cache.set("#{u}", "Y", 86400, true)
  end

  # True when +u+ is stored with the value "Y".
  def visited?(u)
    @cache.get("#{u}", true) == "Y"
  end
end
|
44
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#
|
2
|
+
require "net/http"
|
3
|
+
require "open-uri"
|
4
|
+
# Adds success?/redirect? predicates to the Net::HTTP response classes:
# every response answers false by default, and the success and redirection
# classes override the matching predicate to answer true.
module Net #:nodoc:
  class HTTPResponse #:nodoc:
    def success?
      false
    end

    def redirect?
      false
    end
  end

  class HTTPSuccess #:nodoc:
    def success?
      true
    end
  end

  class HTTPRedirection #:nodoc:
    def redirect?
      true
    end
  end
end
|
16
|
+
# httpal/lib/httpal/browser.rb
|
17
|
+
#
|
18
|
+
# Created by Bryce Kerley on 2006-12-23.
|
19
|
+
# Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
|
20
|
+
# This software is licensed under the GNU LGPL 2.1 .
|
21
|
+
# See COPYING in the httpal root for the full license.
|
22
|
+
#
|
23
|
+
# Subversion info:
|
24
|
+
# $Id: browser.rb 28 2007-03-23 15:41:07Z bkerley $
|
25
|
+
module HTTPal
|
26
|
+
# Small HTTP client on top of Net::HTTP that remembers cookies and the last
# requested URL (sent as Referer) between requests.
class Browser
  attr_accessor :read_timeout, :open_timeout

  # ua::      value sent in the User-Agent header
  # max_len:: number of leading bytes requested through the Range header
  def initialize(ua="Mozilla/Firefox 2.0.11", max_len=2048)
    @cookies = []
    @referer = nil
    @userAgent = ua
    @max_len = max_len - 1 # used as the end offset of "bytes=0-N"
    @open_timeout = 5
    @read_timeout = 5
  end

  # Evaluate the block with this browser as self, so get/post can be called
  # without an explicit receiver.
  def use(&block)
    instance_eval(&block)
  end

  # Send a GET request for +uri+ (String or URI::HTTP); returns the response.
  def get(uri)
    uri, path = parseuri(uri)
    req = Net::HTTP::Get.new(path)
    send_request(req, uri)
  end

  # Send a POST request with +fields+ as form data; returns the response.
  def post(uri, fields)
    uri, path = parseuri(uri)
    req = Net::HTTP::Post.new(path)
    req.set_form_data fields
    send_request(req, uri)
  end

  private

  # Returns [URI object, request path including the query string].
  # The path is built as a new string: appending to uri.path in place would
  # leave the query inside the URI's path and corrupt uri.to_s (and with it
  # the Referer recorded after the request).
  def parseuri(uri)
    uri = URI.parse(uri) unless uri.is_a? URI::HTTP
    path = uri.path.empty? ? '/' : uri.path
    path = "#{path}?#{uri.query}" if uri.query
    return [uri, path]
  end

  # Fill in the standard headers, perform the request and record the
  # cookies and referer for the next call.
  def send_request(req, uri)
    cookie_header = get_cookies_for_uri(uri)
    req['cookie'] = cookie_header if cookie_header
    referer = get_referer
    req['referer'] = referer unless referer.empty? # no empty Referer header
    req["User-Agent"] = @userAgent
    req["Range"] = "bytes=0-#{@max_len}"
    default_port = uri.scheme == "https" ? 443 : 80
    http = Net::HTTP.new(uri.host, uri.port || default_port)
    http.open_timeout = @open_timeout
    http.read_timeout = @read_timeout
    http.use_ssl = true if uri.scheme == "https"

    res = http.start { |h| h.request(req) }
    set_cookie_for_uri(uri, res['set-cookie'])
    set_referer(uri)

    return res
  end

  # The URL of the previous request, or "" when there was none.
  def get_referer
    @referer || ""
  end

  def set_referer(referer)
    @referer = referer.to_s
  end

  # Build the Cookie request header value ("name=value; name2=value2") from
  # the stored cookies whose domain equals the host or one of its parent
  # domains. Returns nil when no cookie matches.
  # NOTE(review): the candidate list also contains the bare last label
  # (e.g. "com"), as in the original code.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    domains = (1..labels.length).map { |n| labels[(-n)..-1].join('.') }

    # TODO: PATH-MATCH COOKIES
    matching = @cookies.select { |cookie| domains.include?(cookie.domain) }
    return nil if matching.empty?
    matching.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join('; ')
  end

  # Parse a Set-Cookie header value and store its cookies; a cookie equal to
  # a stored one replaces it. Cookies without a domain get the request host.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    newcookies = CookieMonster.parse_set_cookie(setcookie)
    newcookies.each do |c|
      c.domain = uri.host unless c.domain
      position = @cookies.index(c)
      if position
        @cookies[position] = c
      else
        @cookies << c
      end
    end
  end
end
|
127
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
#
|
2
|
+
# httpal/lib/httpal/cookie.rb
|
3
|
+
#
|
4
|
+
# Created by Bryce Kerley on 2006-12-23.
|
5
|
+
# Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
|
6
|
+
# This software is licensed under the GNU LGPL 2.1 .
|
7
|
+
# See COPYING in the httpal root for the full license.
|
8
|
+
#
|
9
|
+
# Subversion info:
|
10
|
+
# $Id: cookie.rb 20 2007-02-08 02:13:16Z bkerley $
|
11
|
+
module HTTPal
|
12
|
+
# Splits a (possibly combined) Set-Cookie header value into Cookie objects.
class CookieMonster
  # sc:: Set-Cookie header value; several cookies may be joined by commas.
  # Returns an array with one Cookie per cookie string.
  def self.parse_set_cookie(sc)
    # The comma separates both cookies and the weekday of an expires date
    # ("expires=Wed, 21 Oct ..."), so the date comma is removed before
    # splitting. The attribute name is matched case-insensitively because
    # servers commonly send "Expires=".
    # TODO: IMPLEMENT A LESS BAD SOLUTION
    flattened = sc.gsub(/(expires=\w\w\w),/i, '\1')

    cookies = []
    flattened.split(',').each do |str|
      str = str.strip
      next if str.empty? # stray comma
      cookies << Cookie.new(str)
    end
    return cookies
  end
end
|
28
|
+
# Stores cookies and selects the ones that apply to a URI (by domain only).
class CookieManager
  def initialize
    @cookies = []
  end

  # Returns the stored cookies whose domain equals the URI's host or one of
  # its parent domains (an array, empty when nothing matches).
  # NOTE(review): the candidate list also contains the bare last label
  # (e.g. "com"), as in the original code.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    domains = (1..labels.length).map { |n| labels[(-n)..-1].join('.') }

    # TODO: PATH-MATCH COOKIES
    @cookies.select { |cookie| domains.include?(cookie.domain) }
  end

  # Parse a Set-Cookie header value and store its cookies; a cookie equal to
  # a stored one replaces it. Cookies without a domain get the URI's host.
  # Does nothing when +setcookie+ is nil.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    newcookies = CookieMonster.parse_set_cookie(setcookie)
    newcookies.each do |c|
      c.domain = uri.host unless c.domain
      position = @cookies.index(c)
      if position
        @cookies[position] = c
      else
        @cookies << c
      end
    end
  end
end
|
65
|
+
# One cookie parsed from a single cookie string such as
# "sid=abc; path=/; domain=.example.com". Only the path, domain and expires
# attributes are kept; other attributes are ignored.
class Cookie
  @@fields = [:path, :domain, :expires]
  attr_accessor :name, :value
  attr_accessor *@@fields

  # string:: a single cookie string; the first "name=value" pair is the
  #          cookie itself, the following pairs are attributes.
  # Value-less attributes (HttpOnly, Secure) and blank components (trailing
  # or doubled ";") are skipped instead of raising NoMethodError on nil.
  def initialize(string)
    first_pair = true
    string.split(';').each do |component|
      key, val = component.split('=', 2)
      key = key.to_s.strip
      val = val.strip unless val.nil?
      next if key.empty? && val.nil? # blank component

      if first_pair
        self.name = key
        self.value = val.to_s
        first_pair = false
      else
        next if val.nil? # flag attribute without a value
        field = key.downcase
        # Compare as strings: no symbols are created from header input.
        next unless @@fields.any? { |f| f.to_s == field }
        val = val.sub(/\A\./, '') if field == 'domain' # ".example.com" -> "example.com"
        send("#{field}=", val)
      end
    end
  end

  # "name=value" followed by every attribute that is set.
  def to_s
    r = "#{name}=#{value}"
    @@fields.each do |f|
      r << "; #{f.to_s}=#{send(f)}" if send(f)
    end
    return r
  end

  def inspect
    "\#<Cookie #{name}=#{value} from #{domain}#{path} until #{expires}>"
  end

  include Comparable

  # Sort by domain, then path, then name. Missing attributes compare as ""
  # so that cookies with and without a path/domain can still be ordered.
  # Returns nil for objects that do not look like a cookie.
  def <=>(anOther)
    unless anOther.respond_to?(:domain) && anOther.respond_to?(:path) && anOther.respond_to?(:name)
      return nil
    end
    [domain.to_s, path.to_s, name.to_s] <=>
      [anOther.domain.to_s, anOther.path.to_s, anOther.name.to_s]
  end
end
|
113
|
+
end
|