rspider 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
|
8
|
+
require 'set'
|
9
|
+
require "thread"
|
10
|
+
module Rspider
|
11
|
+
=begin rdoc
|
12
|
+
This class manages a pool of threads and limits how many of them run at once,
|
13
|
+
so that the CPU is not put under heavy load.
|
14
|
+
=== Examples
|
15
|
+
* tp=ThreadPool.new(3)
|
16
|
+
*
|
17
|
+
* 1.upto(30) { |j|
|
18
|
+
* t=tp.dispatch(j) {|i|
|
19
|
+
* puts "thread ##{i} start"
|
20
|
+
* sleep rand
|
21
|
+
* puts "thread ##{i} end"
|
22
|
+
* }
|
23
|
+
* }
|
24
|
+
* tp.shutdown
|
25
|
+
=end
|
26
|
+
# Bounded pool of worker threads.
#
# Every call to #dispatch starts a new Thread, but at most +max_size+ of those
# threads execute their block at the same time; the others wait on a condition
# variable until a slot becomes free.
class ThreadPool
  # max_size:: maximum number of threads allowed to run their block at once
  def initialize(max_size)
    @pool = []
    @max_size = max_size
    # Tasks that were dispatched and have not finished yet (queued or running).
    @outstanding = 0
    @pool_mutex = Mutex.new
    @pool_cv = ConditionVariable.new
  end

  # Runs the given block in a new thread, passing +args+ to it, as soon as the
  # pool has a free slot. Returns the Thread object immediately.
  #
  # A StandardError raised by the block is passed to #exception instead of
  # propagating out of the thread.
  def dispatch(*args)
    # Count the task before the thread exists so that #shutdown cannot miss a
    # task whose thread has not reached the pool yet.
    @pool_mutex.synchronize { @outstanding += 1 }
    Thread.new do
      begin
        @pool_mutex.synchronize do
          while @pool.size >= @max_size
            print "pool full;waiting run #{args.join(',')}...\n" if $DEBUG
            @pool_cv.wait(@pool_mutex)
          end
          # Registered while still holding the lock; otherwise several waiters
          # could pass the size check together and exceed max_size.
          @pool << Thread.current
        end
        yield(*args)
      rescue => e
        exception(Thread.current, e, *args)
      ensure
        @pool_mutex.synchronize do
          @pool.delete(Thread.current)
          @outstanding -= 1
          # broadcast, not signal: both queued tasks and #shutdown wait on this
          # condition variable, and waking only one of them can strand the other.
          @pool_cv.broadcast
        end
      end
    end
  rescue ThreadError
    # The thread could not be created, so the task will never finish by itself.
    @pool_mutex.synchronize do
      @outstanding -= 1
      @pool_cv.broadcast
    end
    raise
  end

  # Blocks until every dispatched task (running or still queued) has finished.
  def shutdown
    @pool_mutex.synchronize do
      @pool_cv.wait(@pool_mutex) until @outstanding.zero?
    end
  end

  # Hook called from the worker thread when a task raises.
  # thread::        the thread that was running the task
  # exception::     the error that was rescued
  # original_args:: the arguments given to #dispatch
  def exception(thread, exception, *original_args)
    puts "Exception in thread #{thread}:#{exception}"
  end
end
|
69
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
|
8
|
+
|
9
|
+
require "socket"
|
10
|
+
module Rspider
|
11
|
+
=begin rdoc
|
12
|
+
Base class of UrlDispatchers
|
13
|
+
=end
|
14
|
+
# Base class of UrlDispatchers.
#
# Holds the address of a dispatcher service and provides #getResponse, a
# one-line request/one-line reply exchange over TCP. The URL-handling methods
# are empty placeholders here.
class UrlDispatcher
  # host::   host of the dispatcher service, like '127.0.0.1'
  # port::   port of the dispatcher service, like 1099
  # source:: identifier kept in @source for use by subclasses
  def initialize(host, port, source)
    @host = host
    @port = port
    @source = source
    @visited = []
    @urlsToVisit = []
  end

  # Placeholder: does nothing and returns nil.
  def AddUrl(url, site)
  end

  # Placeholder: does nothing and returns nil.
  def FetchUnvisitedUrl(site)
  end

  # Placeholder: does nothing and returns nil.
  def ReportWhenVisited(url, site)
  end

  # Placeholder: does nothing and returns nil.
  def IfUrlVisited(url, site)
  end

  # Opens a new TCP connection to @host:@port, writes +msg+ as one line and
  # returns the first line of the reply (nil if the peer closes without
  # answering). The socket is always closed before returning, even when the
  # exchange raises.
  def getResponse(msg)
    socket = TCPSocket.new(@host, @port)
    begin
      socket.puts(msg)
      socket.gets
    ensure
      socket.close
    end
  end
end
|
39
|
+
|
40
|
+
# Client that talks to a dispatcher service through UrlDispatcher#getResponse.
# Every message carries the @source identifier given to the constructor.
class UrlDispatcherClient < UrlDispatcher
  # Sends "add <url> <source>" and returns the raw reply line.
  def <<(url)
    request = "add #{url} #{@source}"
    getResponse(request)
  end

  # Sends "fetch <source>" and returns the reply with surrounding whitespace
  # removed, or nil when no reply line was received.
  def pop
    reply = getResponse("fetch #{@source}")
    reply.nil? ? nil : reply.strip
  end

  # Sends "mark_visited <url> <source>" and returns the raw reply line.
  def visited(url)
    request = "mark_visited #{url} #{@source}"
    getResponse(request)
  end

  # Not implemented: the body is empty, so this always returns nil.
  def visited?(url)
  end
end
|
56
|
+
|
57
|
+
# Server-side dispatcher. It currently adds nothing to UrlDispatcher.
class UrlDispatcherServer < UrlDispatcher; end
|
59
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
module Rspider
|
8
|
+
# Heuristic scoring of URLs. The total score is the sum of three partial
# scores: one for the domain, one for the file extension and one for the
# depth of the path.
class UrlScorer
  # Leading "scheme://" of a URL (http, https, ...).
  SCHEME_PREFIX = %r{\A[a-z][a-z0-9+.\-]*://}i

  # Extensions of dynamically generated pages.
  DYNAMIC_EXTENSIONS = %w[.jsp .asp .php .aspx .cgi .pl].freeze
  # Extensions of static HTML pages.
  HTML_EXTENSIONS = %w[.htm .html .shtml].freeze
  # Extensions of downloadable documents.
  DOCUMENT_EXTENSIONS = %w[.doc .ppt .zip].freeze

  def initialize
  end

  # Calculates the total score of +url+.
  def self.score(url)
    scoreDomain(url) + scoreExt(url) + scoreDepth(url)
  end

  # Calculates the score of the domain: 5 for *.org, 4 for *.org.cn,
  # 2 for other *.cn hosts and 3 for everything else.
  def self.scoreDomain(url)
    # Host = everything after the scheme up to the first "/", "?", "#" or ":",
    # so a port, path or query cannot hide the top-level domain.
    host = url.sub(SCHEME_PREFIX, "")[/\A[^\/?#:]*/].to_s.downcase
    return 5 if host.end_with?(".org")
    return 4 if host.end_with?(".org.cn")
    return 2 if host.end_with?(".cn")
    3
  end

  # Calculates the score of the file extension: 3 for HTML pages, 4 for
  # documents and 2 for dynamic pages and anything else.
  def self.scoreExt(url)
    # Drop the query string and fragment first; a dot inside them must not be
    # mistaken for the file extension.
    location = url.split(/[?#]/, 2).first.to_s
    ext = File.extname(location).downcase
    if HTML_EXTENSIONS.include?(ext)
      3
    elsif DOCUMENT_EXTENSIONS.include?(ext)
      4
    else
      # Covers DYNAMIC_EXTENSIONS as well as unknown or missing extensions.
      2
    end
  end

  # Calculates the score of the url's file depth: 5 minus the number of
  # "/"-separated parts (host included), but never less than 1.
  def self.scoreDepth(url)
    location = url.sub(SCHEME_PREFIX, "").split(/[?#]/, 2).first.to_s
    depth = location.split("/").length
    depth > 4 ? 1 : 5 - depth
  end
end
|
44
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Rspider
|
2
|
+
# In-memory URL frontier: a list of URLs still to visit plus a list of URLs
# already visited.
class UrlStorage
  attr_accessor :urls, :visitedUrls

  def initialize
    @visitedUrls = []
    @urls = []
  end

  # Queues +u+ unless it is already queued or already visited.
  # Returns nil when the URL was rejected, the queue otherwise.
  def <<(u)
    #puts "add u:#{u}"
    return nil if @visitedUrls.include?(u)
    return nil if @urls.include?(u)
    @urls << u
  end

  # True when +u+ has been marked as visited.
  def visited?(u)
    @visitedUrls.include?(u)
  end

  # Returns a randomly chosen queued URL, or nil when the queue is empty.
  # The URL stays queued until #visited or #error removes it.
  def pop
    return nil if @urls.empty?
    # rand(n) yields 0...n, so every queued URL, the last one included, can be
    # chosen.
    @urls[rand(@urls.length)]
  end

  # Moves +u+ from the queue to the visited list and returns the visited list.
  # Prints a notice when +u+ was not queued.
  def visited(u)
    @urls.delete(u) { puts "not found when delete #{u}" }
    @visitedUrls << u unless @visitedUrls.include?(u)
    @visitedUrls
  end

  # Drops +url+ from the queue without marking it as visited.
  def error(url)
    @urls.delete(url)
  end
end
|
28
|
+
# URL storage backed by a memcache server at localhost:11211 (namespace "hel").
# Each URL is a key whose value is "N" (known, not visited) or "Y" (visited).
#
# NOTE(review): the third and fourth arguments passed to the cache calls are
# presumably the expiry in seconds and the "raw" flag of the MemCache client;
# confirm against the client library in use.
class UrlStorageInMemcache
  def initialize
    @cache = MemCache.new "localhost:11211", :namespace => "hel"
  end

  # Records +u+ as known but not yet visited. A URL already marked as visited
  # is left untouched (returns nil), matching UrlStorage#<<.
  def <<(u)
    return nil if visited?(u)
    @cache.set("#{u}", "N", 86400, true)
  end

  # Not implemented: the body is empty, so this always returns nil.
  def pop
  end

  # Marks +u+ as visited.
  def visited(u)
    @cache.set("#{u}", "Y", 86400, true)
  end

  # True when +u+ is stored with the visited marker.
  def visited?(u)
    @cache.get("#{u}", true) == "Y"
  end
end
|
44
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#
|
2
|
+
require "net/http"
|
3
|
+
require "open-uri"
|
4
|
+
# Adds success?/redirect? predicates to the Net::HTTP response classes.
# Every response answers false to both by default; the success and
# redirection classes override the matching predicate to answer true.
module Net #:nodoc:
  class HTTPResponse #:nodoc:
    def success?
      false
    end

    def redirect?
      false
    end
  end

  class HTTPSuccess #:nodoc:
    def success?
      true
    end
  end

  class HTTPRedirection #:nodoc:
    def redirect?
      true
    end
  end
end
|
16
|
+
# httpal/lib/httpal/browser.rb
|
17
|
+
#
|
18
|
+
# Created by Bryce Kerley on 2006-12-23.
|
19
|
+
# Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
|
20
|
+
# This software is licensed under the GNU LGPL 2.1 .
|
21
|
+
# See COPYING in the httpal root for the full license.
|
22
|
+
#
|
23
|
+
# Subversion info:
|
24
|
+
# $Id: browser.rb 28 2007-03-23 15:41:07Z bkerley $
|
25
|
+
module HTTPal
|
26
|
+
# Small stateful HTTP client built on Net::HTTP. It remembers the cookies it
# receives and the last URI it requested, and sends them back as Cookie and
# Referer headers on later requests.
class Browser
  attr_accessor :read_timeout, :open_timeout

  # ua::      value sent in the User-Agent header
  # max_len:: number of bytes requested from the start of each response; sent
  #           as "Range: bytes=0-<max_len - 1>"
  def initialize(ua="Mozilla/Firefox 2.0.11", max_len=2048)
    @cookies = []
    @referer = nil
    @userAgent = ua
    @max_len = max_len - 1
    @open_timeout = 5
    @read_timeout = 5
  end

  # Evaluates the block in the context of this browser, so get/post can be
  # called inside it without a receiver.
  def use(&block)
    instance_eval(&block)
  end

  # Performs a GET request for +uri+ (String or URI) and returns the
  # Net::HTTPResponse.
  def get(uri)
    uri, path = parseuri(uri)
    send_request(Net::HTTP::Get.new(path), uri)
  end

  # Performs a POST request for +uri+ (String or URI) with +fields+ as form
  # data and returns the Net::HTTPResponse.
  def post(uri, fields)
    uri, path = parseuri(uri)
    req = Net::HTTP::Post.new(path)
    req.set_form_data fields
    send_request(req, uri)
  end

  private

  # Returns [uri, request_path]. +uri+ is parsed unless it already is an
  # HTTP(S) URI; request_path is the path ("/" when empty) plus "?query".
  def parseuri(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI::HTTP)
    path = uri.path.to_s.empty? ? '/' : uri.path
    # Build a new string: appending to uri.path in place would corrupt the URI
    # object (and with it the referer and any later request reusing it).
    path = "#{path}?#{uri.query}" if uri.query
    [uri, path]
  end

  # Adds the session headers to +req+, sends it to the host of +uri+, stores
  # any cookies from the response, records +uri+ as the next referer and
  # returns the response.
  def send_request(req, uri)
    cookie_header = cookie_header_for(uri)
    req['cookie'] = cookie_header unless cookie_header.empty?
    referer = get_referer
    req['referer'] = referer unless referer.empty?
    req["User-Agent"] = @userAgent
    req["Range"] = "bytes=0-#{@max_len}"

    default_port = uri.scheme == "https" ? 443 : 80
    http = Net::HTTP.new(uri.host, uri.port || default_port)
    http.open_timeout = @open_timeout
    http.read_timeout = @read_timeout
    http.use_ssl = true if uri.scheme == "https"

    res = http.start { |h| h.request(req) }
    set_cookie_for_uri(uri, res['set-cookie'])
    set_referer(uri)

    res
  end

  # The last requested URI as a string, or "" before the first request.
  def get_referer
    @referer || ""
  end

  def set_referer(referer)
    @referer = referer.to_s
  end

  # Value for the Cookie request header: "name=value" pairs joined by "; ".
  # Cookie attributes (path, domain, expires) are not part of that header.
  def cookie_header_for(uri)
    get_cookies_for_uri(uri).map { |c| "#{c.name}=#{c.value}" }.join('; ')
  end

  # Returns the stored cookies whose domain equals the host of +uri+ or one of
  # its parent domains.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    # "www.example.com" -> ["com", "example.com", "www.example.com"]
    suffixes = (1..labels.length).map { |n| labels[-n..-1].join('.') }

    # TODO: PATH-MATCH COOKIES
    @cookies.select { |cookie| suffixes.include?(cookie.domain) }
  end

  # Parses a Set-Cookie header value and stores the cookies. A cookie without
  # a domain gets the host of +uri+; a cookie equal to a stored one replaces it.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    CookieMonster.parse_set_cookie(setcookie).each do |c|
      c.domain = uri.host unless c.domain
      position = @cookies.index(c)
      if position
        @cookies[position] = c
      else
        @cookies << c
      end
    end
  end
end
|
127
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
#
|
2
|
+
# httpal/lib/httpal/cookie.rb
|
3
|
+
#
|
4
|
+
# Created by Bryce Kerley on 2006-12-23.
|
5
|
+
# Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
|
6
|
+
# This software is licensed under the GNU LGPL 2.1 .
|
7
|
+
# See COPYING in the httpal root for the full license.
|
8
|
+
#
|
9
|
+
# Subversion info:
|
10
|
+
# $Id: cookie.rb 20 2007-02-08 02:13:16Z bkerley $
|
11
|
+
module HTTPal
|
12
|
+
# Parser for Set-Cookie header values.
class CookieMonster
  # Splits a Set-Cookie header value that may hold several comma-separated
  # cookies and returns an Array with one Cookie per cookie string.
  # Returns [] for nil or empty input.
  def self.parse_set_cookie(sc)
    return [] if sc.nil?

    # The comma separates cookies but also appears inside expiry dates
    # ("expires=Wed, 21 Oct ..."). Remove the comma after the weekday first;
    # the attribute name is matched case-insensitively and the weekday may be
    # abbreviated or spelled out.
    # TODO: IMPLEMENT A LESS BAD SOLUTION
    flattened = sc.gsub(/(expires=\s*\w+),/i, '\1')

    pieces = flattened.split(',').map { |piece| piece.strip }
    pieces.reject { |piece| piece.empty? }.map { |piece| Cookie.new(piece) }
  end
end
|
28
|
+
# Keeps the cookies received so far and selects the ones that apply to a URI.
class CookieManager
  def initialize
    @cookies = []
  end

  # Returns the stored cookies whose domain equals the host of +uri+ or one of
  # its parent domains.
  #
  # NOTE(review): the candidate list includes the bare top-level label (for
  # example "com"), so a cookie stored with such a domain matches every host
  # under it; confirm whether that is intended.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    # "www.example.com" -> ["com", "example.com", "www.example.com"]
    suffixes = (1..labels.length).map { |n| labels[-n..-1].join('.') }

    # TODO: PATH-MATCH COOKIES
    @cookies.select { |cookie| suffixes.include?(cookie.domain) }
  end

  # Parses a Set-Cookie header value and stores the cookies. A cookie without
  # a domain gets the host of +uri+; a cookie equal to a stored one replaces
  # it, any other cookie is appended. Does nothing when +setcookie+ is nil.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    CookieMonster.parse_set_cookie(setcookie).each do |c|
      c.domain = uri.host unless c.domain
      position = @cookies.index(c)
      if position
        @cookies[position] = c
      else
        @cookies << c
      end
    end
  end
end
|
65
|
+
# A single HTTP cookie parsed from one cookie string of a Set-Cookie header:
# a name/value pair plus the optional path, domain and expires attributes.
# Cookies are ordered, and compared for equality, by domain, path and name.
class Cookie
  include Comparable

  # Attributes that are kept; any other attribute in the cookie string is
  # ignored.
  FIELDS = [:path, :domain, :expires].freeze

  attr_accessor :name, :value
  attr_accessor(*FIELDS)

  # string:: cookie text such as "sid=1; path=/; domain=.example.com"
  #
  # The first "key=value" component is the cookie's name and value. Later
  # components set path, domain or expires (names matched case-insensitively;
  # one leading dot is removed from the domain).
  def initialize(string)
    first = true
    string.to_s.split(';').each do |component|
      key, val = component.split('=', 2)
      key = key.to_s.strip
      # Valueless attributes ("Secure", "HttpOnly") have no "=" part, so val
      # can be nil and must not be stripped blindly.
      val = val.strip if val
      next if key.empty? && val.nil? # blank component, e.g. from ";;"

      if first
        self.name = key
        self.value = val
        first = false
        next
      end

      # Compare as strings instead of turning header text into symbols.
      field = FIELDS.find { |f| f.to_s == key.downcase }
      next unless field
      val = val.sub(/\A\./, '') if field == :domain && val
      send("#{field}=", val)
    end
  end

  # "name=value" followed by "; field=value" for every attribute that is set.
  def to_s
    parts = ["#{name}=#{value}"]
    FIELDS.each do |f|
      current = send(f)
      parts << "#{f}=#{current}" if current
    end
    parts.join('; ')
  end

  def inspect
    "\#<Cookie #{name}=#{value} from #{domain}#{path} until #{expires}>"
  end

  # Sorts by domain, then path, then name. Returns nil when +anOther+ is not
  # a Cookie or when an attribute pair cannot be compared (for example nil
  # against a String), which makes == answer false instead of raising.
  def <=>(anOther)
    return nil unless anOther.is_a?(Cookie)
    [:domain, :path, :name].each do |attribute|
      order = send(attribute) <=> anOther.send(attribute)
      return order unless order == 0
    end
    0
  end
end
|
113
|
+
end
|