rspider 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,69 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+
8
+ require 'set'
9
+ require "thread"
10
+ module Rspider
11
+ =begin rdoc
12
+ This class manages worker threads, limiting how many of them run at the same time
13
+ so that the CPU is not put under heavy load.
14
+ === Examples
15
+ * tp=ThreadPool.new(3)
16
+ *
17
+ * 1.upto(30) { |j|
18
+ * t=tp.dispatch(j) {|i|
19
+ * puts "thread ##{i} start"
20
+ * sleep rand
21
+ * puts "thread ##{i} end"
22
+ * }
23
+ * }
24
+ * tp.shutdown
25
+ =end
26
# Runs each dispatched block on its own thread while letting at most
# +max_size+ of those blocks execute at the same time.
class ThreadPool
  # max_size:: maximum number of jobs allowed to run concurrently
  def initialize(max_size)
    @pool = []
    @max_size = max_size
    # Jobs that have been dispatched but have not finished yet, whether they
    # are running or still waiting for a free slot.
    @pending = 0
    @pool_mutex = Mutex.new
    @pool_cv = ConditionVariable.new
  end

  # Starts a thread that waits for a free slot and then yields +args+ to the
  # given block. Errors raised by the block are handed to #exception.
  # Returns the Thread that was created.
  def dispatch(*args)
    # Count the job before its thread exists so that #shutdown cannot miss it.
    @pool_mutex.synchronize { @pending += 1 }
    Thread.new do
      begin
        @pool_mutex.synchronize do
          while @pool.size >= @max_size
            print "pool full;waiting run #{args.join(',')}...\n" if $DEBUG
            @pool_cv.wait(@pool_mutex)
          end
          # Register while still holding the lock; otherwise several waiters
          # could pass the size check together and exceed max_size.
          @pool << Thread.current
        end
        begin
          yield(*args)
        rescue => e
          exception(Thread.current, e, *args)
        end
      ensure
        @pool_mutex.synchronize do
          @pool.delete(Thread.current)
          @pending -= 1
          # Wake every waiter: queued jobs and #shutdown share this condition
          # variable, so a single signal could reach the wrong one.
          @pool_cv.broadcast
        end
      end
    end
  end

  # Blocks until every dispatched job, including queued ones, has finished.
  def shutdown
    @pool_mutex.synchronize do
      @pool_cv.wait(@pool_mutex) until @pending.zero?
    end
  end

  # Called when a job raises a StandardError; prints the error.
  # thread::        the thread the job was running on
  # exception::     the error that was raised
  # original_args:: the arguments the job was dispatched with
  def exception(thread, exception, *original_args)
    puts "Exception in thread #{thread}:#{exception}"
  end
end
69
+ end
@@ -0,0 +1,59 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+
8
+
9
+ require "socket"
10
+ module Rspider
11
+ =begin rdoc
12
+ Base class of UrlDispatchers
13
+ =end
14
# Base class of the URL dispatchers. Stores the connection settings and
# provides #getResponse for a one-line request/response exchange over TCP.
class UrlDispatcher
  # host::   host to connect to, like '127.0.0.1'
  # port::   port to connect to, like 1099
  # source:: stored in @source; subclasses visible in this file append it to
  #          the messages they send
  def initialize(host, port, source)
    @host = host
    @port = port
    @source = source
    @visited = []
    @urlsToVisit = []
  end

  # Placeholder: has no body in this class and returns nil.
  def AddUrl(url, site)
  end

  # Placeholder: has no body in this class and returns nil.
  def FetchUnvisitedUrl(site)
  end

  # Placeholder: has no body in this class and returns nil.
  def ReportWhenVisited(url, site)
  end

  # Placeholder: has no body in this class and returns nil.
  def IfUrlVisited(url, site)
  end

  # Opens a connection to @host:@port, sends +msg+ followed by a newline and
  # returns the first line of the reply (nil if the peer closes without
  # answering). The socket is always closed before returning.
  def getResponse(msg)
    socket = TCPSocket.new(@host, @port)
    begin
      socket.puts(msg)
      socket.gets
    ensure
      socket.close
    end
  end
end
39
+
40
# Dispatcher client: each operation is sent as a one-line text command
# through the inherited getResponse.
class UrlDispatcherClient < UrlDispatcher
  # Sends "add <url> <source>" and returns the reply line.
  def <<(url)
    getResponse("add #{url} #{@source}")
  end

  # Sends "fetch <source>" and returns the reply with surrounding whitespace
  # removed, or nil when there was no reply.
  def pop
    reply = getResponse("fetch #{@source}")
    reply.nil? ? nil : reply.strip
  end

  # Sends "mark_visited <url> <source>" and returns the reply line.
  def visited(url)
    getResponse("mark_visited #{url} #{@source}")
  end

  # Not implemented: always returns nil.
  def visited?(url)
    nil
  end
end
56
+
57
# Server-side dispatcher; adds nothing to UrlDispatcher yet.
class UrlDispatcherServer < UrlDispatcher; end
59
+ end
@@ -0,0 +1,44 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+ module Rspider
8
# Assigns a heuristic score to a URL from its domain suffix, its file
# extension and its path depth.
class UrlScorer
  # Extensions scored 2.
  DYNAMIC_EXTS = %w[.jsp .asp .php .aspx .cgi .pl].freeze
  # Extensions scored 3.
  PAGE_EXTS = %w[.html .htm .shtml].freeze
  # Extensions scored 4.
  DOCUMENT_EXTS = %w[.doc .ppt .zip].freeze

  def initialize
  end

  # Calculates the total score of +url+: the sum of the domain, extension
  # and depth scores.
  def self.score(url)
    scoreDomain(url) + scoreExt(url) + scoreDepth(url)
  end

  # Calculates the score of the domain: 5 for hosts ending in "org", 4 for
  # "org.cn", 2 for "cn" and 3 for everything else. Only the host is
  # examined; scheme, user info, port, path and query are ignored.
  def self.scoreDomain(url)
    without_scheme = url.to_s.sub(%r{\A[a-z][a-z0-9+.\-]*://}i, '')
    authority = without_scheme.split(%r{[/?#]}, 2).first.to_s
    host = authority.sub(/\A[^@]*@/, '').sub(/:\d+\z/, '').downcase
    return 5 if host =~ /org\z/
    return 4 if host =~ /org\.cn\z/
    return 2 if host =~ /cn\z/
    3
  end

  # Calculates the score of the file extension. The query string and the
  # fragment are dropped first so that dots inside them are not mistaken
  # for an extension; the comparison ignores case.
  def self.scoreExt(url)
    path = url.to_s.split(/[?#]/, 2).first.to_s
    ext = File.extname(path).downcase
    if DYNAMIC_EXTS.include?(ext)
      2
    elsif PAGE_EXTS.include?(ext)
      3
    elsif DOCUMENT_EXTS.include?(ext)
      4
    else
      2
    end
  end

  # Calculates the score of the URL's depth. The depth is the number of
  # "/"-separated segments after the scheme (the host counts as one);
  # the score is 5 - depth, and 1 for any depth above 4.
  def self.scoreDepth(url)
    without_scheme = url.to_s.sub(%r{\Ahttps?://}i, '')
    path = without_scheme.split(/[?#]/, 2).first.to_s
    depth = path.split("/").length
    return 1 if depth > 4
    5 - depth
  end
end
44
+ end
@@ -0,0 +1,44 @@
1
+ module Rspider
2
# In-memory URL store: a list of URLs still to visit and a list of URLs
# already visited.
class UrlStorage
  attr_accessor :urls, :visitedUrls

  def initialize
    @visitedUrls = []
    @urls = []
  end

  # Queues +u+ unless it is already queued or already visited.
  # Returns the queue, or nil when +u+ was rejected as a duplicate.
  def <<(u)
    return nil if @visitedUrls.include?(u) || @urls.include?(u)
    @urls << u
  end

  # True when +u+ has been marked as visited.
  def visited?(u)
    @visitedUrls.include?(u)
  end

  # Returns a randomly chosen queued URL without removing it, or nil when
  # the queue is empty. Every queued URL, including the last, can be chosen.
  def pop
    return nil if @urls.empty?
    @urls[rand(@urls.length)]
  end

  # Moves +u+ from the queue to the visited list; prints a notice when +u+
  # was not queued. Returns the visited list.
  def visited(u)
    @urls.delete(u) { puts "not found when delete #{u}" }
    @visitedUrls << u unless @visitedUrls.include?(u)
    @visitedUrls
  end

  # Drops +url+ from the queue without marking it as visited.
  def error(url)
    @urls.delete(url)
  end
end
28
# URL store that keeps one cache entry per URL: "N" for a URL that has been
# added, "Y" for one that has been visited.
class UrlStorageInMemcache
  # Connects to the cache at localhost:11211 using the namespace "hel".
  def initialize
    @cache = MemCache.new("localhost:11211", :namespace => "hel")
  end

  # Stores +u+ with the value "N".
  # NOTE(review): 86400 and true are presumably the expiry in seconds and the
  # raw-value flag of the MemCache client -- confirm against its API.
  def <<(u)
    @cache.set(u.to_s, "N", 86400, true)
  end

  # Not implemented: always returns nil.
  def pop
    nil
  end

  # Stores +u+ with the value "Y".
  def visited(u)
    @cache.set(u.to_s, "Y", 86400, true)
  end

  # True when the value cached for +u+ is "Y".
  def visited?(u)
    @cache.get(u.to_s, true) == "Y"
  end
end
44
+ end
@@ -0,0 +1,127 @@
1
+ #
2
+ require "net/http"
3
+ require "open-uri"
4
# Adds success?/redirect? predicates to the Net::HTTP response classes.
module Net #:nodoc:
  # Default answers for every response.
  class HTTPResponse #:nodoc:
    def success?
      false
    end

    def redirect?
      false
    end
  end

  # Responses of the HTTPSuccess family report success.
  class HTTPSuccess #:nodoc:
    def success?
      true
    end
  end

  # Responses of the HTTPRedirection family report a redirect.
  class HTTPRedirection #:nodoc:
    def redirect?
      true
    end
  end
end
16
+ # httpal/lib/httpal/browser.rb
17
+ #
18
+ # Created by Bryce Kerley on 2006-12-23.
19
+ # Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
20
+ # This software is licensed under the GNU LGPL 2.1 .
21
+ # See COPYING in the httpal root for the full license.
22
+ #
23
+ # Subversion info:
24
+ # $Id: browser.rb 28 2007-03-23 15:41:07Z bkerley $
25
+ module HTTPal
26
# Minimal HTTP client that remembers cookies and the last requested URL
# (sent as the Referer of the next request).
class Browser
  attr_accessor :read_timeout, :open_timeout

  # ua::      value of the User-Agent header
  # max_len:: number of leading bytes requested through the Range header
  def initialize(ua = "Mozilla/Firefox 2.0.11", max_len = 2048)
    @cookies = []
    @referer = nil
    @userAgent = ua
    @max_len = max_len - 1 # Range end offsets are inclusive
    @open_timeout = 5
    @read_timeout = 5
  end

  # Evaluates the block in the context of this browser, so get and post can
  # be called in it without a receiver.
  def use(&block)
    instance_eval(&block)
  end

  # Issues a GET for +uri+ (a String or a URI) and returns the response.
  def get(uri)
    uri, path = parseuri(uri)
    req = Net::HTTP::Get.new(path)
    send_request(req, uri)
  end

  # Issues a POST to +uri+ with +fields+ as form data and returns the response.
  def post(uri, fields)
    uri, path = parseuri(uri)
    req = Net::HTTP::Post.new(path)
    req.set_form_data fields
    send_request(req, uri)
  end

  private

  # Returns [uri, request_path]: the parsed URI and its path plus query.
  # A new string is built for the request path; appending to uri.path in
  # place would corrupt the URI that is later used for cookies and referer.
  def parseuri(uri)
    uri = URI.parse(uri) unless uri.is_a? URI::HTTP
    path = uri.path.empty? ? '/' : uri.path
    path = "#{path}?#{uri.query}" if uri.query
    [uri, path]
  end

  # Adds the cookie, referer, user-agent and range headers to +req+, sends it
  # to the host of +uri+, records the cookies of the response and remembers
  # +uri+ as the next referer. Returns the response.
  def send_request(req, uri)
    cookie_header = get_cookies_for_uri(uri)
    req['cookie'] = cookie_header unless cookie_header.empty?
    referer = get_referer
    req['referer'] = referer unless referer.empty?
    req["User-Agent"] = @userAgent
    req["Range"] = "bytes=0-#{@max_len}"
    default_port = uri.scheme == "https" ? 443 : 80
    http = Net::HTTP.new(uri.host, uri.port || default_port)
    http.open_timeout = @open_timeout
    http.read_timeout = @read_timeout
    http.use_ssl = true if uri.scheme == "https"

    res = http.start { |h| h.request(req) }
    set_cookie_for_uri(uri, res['set-cookie'])
    set_referer(uri)

    res
  end

  # The URL of the previous request, or "" before the first request.
  def get_referer
    @referer || ""
  end

  def set_referer(referer)
    @referer = referer.to_s
  end

  # Builds the value of the Cookie header for +uri+: "name=value" pairs
  # joined by "; " for every stored cookie whose domain equals the host or
  # one of its parent domains. Returns "" when no cookie matches.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    # "www.example.com" => ["com", "example.com", "www.example.com"]
    domains = (1..labels.length).map { |n| labels[(-n)..-1].join('.') }
    # TODO: PATH-MATCH COOKIES
    matching = @cookies.select { |cookie| domains.include?(cookie.domain) }
    matching.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join('; ')
  end

  # Parses a Set-Cookie header value and stores the cookies: a cookie equal
  # (==) to a stored one replaces it, any other is appended. Cookies without
  # a domain get the host of +uri+.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    CookieMonster.parse_set_cookie(setcookie).each do |cookie|
      cookie.domain = uri.host unless cookie.domain
      position = @cookies.index(cookie)
      if position
        @cookies[position] = cookie
      else
        @cookies << cookie
      end
    end
  end
end
127
+ end
@@ -0,0 +1,113 @@
1
+ #
2
+ # httpal/lib/httpal/cookie.rb
3
+ #
4
+ # Created by Bryce Kerley on 2006-12-23.
5
+ # Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
6
+ # This software is licensed under the GNU LGPL 2.1 .
7
+ # See COPYING in the httpal root for the full license.
8
+ #
9
+ # Subversion info:
10
+ # $Id: cookie.rb 20 2007-02-08 02:13:16Z bkerley $
11
+ module HTTPal
12
# Splits a Set-Cookie header value into Cookie objects.
class CookieMonster
  # sc:: a Set-Cookie header value, possibly holding several cookies
  #      separated by commas
  # Returns an Array with one Cookie per non-empty cookie string.
  def self.parse_set_cookie(sc)
    # The comma separates cookies but also follows the weekday inside an
    # expires date ("Wed, 09 Jun ..."), so that comma is removed first. The
    # attribute name is matched in any letter case and the weekday may be
    # abbreviated or spelled out.
    # TODO: IMPLEMENT A LESS BAD SOLUTION
    flattened = sc.gsub(/(expires=\w{3,9}),/i, '\1')

    pieces = flattened.split(',').map { |piece| piece.strip }
    pieces.reject { |piece| piece.empty? }.map { |piece| Cookie.new(piece) }
  end
end
28
# Keeps a list of cookies and selects the ones belonging to a URI's host.
class CookieManager
  def initialize
    @cookies = []
  end

  # Returns an Array of the stored cookies whose domain equals the host of
  # +uri+ or one of its parent domains.
  def get_cookies_for_uri(uri)
    labels = uri.host.split('.')
    # "www.example.com" => ["com", "example.com", "www.example.com"]
    domains = (1..labels.length).map { |n| labels[(-n)..-1].join('.') }
    # TODO: PATH-MATCH COOKIES
    @cookies.select { |cookie| domains.include?(cookie.domain) }
  end

  # Parses a Set-Cookie header value and stores the cookies: a cookie equal
  # (==) to a stored one replaces it, any other is appended. Cookies without
  # a domain get the host of +uri+.
  # Returns the parsed cookies, or nil when +setcookie+ is nil or false.
  def set_cookie_for_uri(uri, setcookie)
    return unless setcookie
    CookieMonster.parse_set_cookie(setcookie).each do |cookie|
      cookie.domain = uri.host unless cookie.domain
      position = @cookies.index(cookie)
      if position
        @cookies[position] = cookie
      else
        @cookies << cookie
      end
    end
  end
end
65
# A single cookie parsed from one cookie string of a Set-Cookie header.
class Cookie
  include Comparable

  # Attributes that are read from the cookie string besides name and value.
  FIELDS = [:path, :domain, :expires].freeze
  FIELD_NAMES = FIELDS.map { |field| field.to_s }.freeze

  attr_accessor :name, :value
  attr_accessor(*FIELDS)

  # string:: one cookie, e.g. "sid=abc; path=/; domain=.example.com"
  #
  # The first "key=value" component becomes name and value. Of the remaining
  # components only path, domain and expires are kept (matched in any letter
  # case); blank components and attributes without a value, such as
  # "HttpOnly" or "Secure", are ignored.
  def initialize(string)
    named = false
    string.to_s.split(';').each do |component|
      key, val = component.split('=', 2)
      key = key.to_s.strip
      val = val.strip if val
      next if key.empty? && val.nil? # blank component, e.g. from ";;"

      unless named
        self.name = key
        self.value = val.to_s
        named = true
        next
      end

      field = key.downcase
      next if val.nil? || !FIELD_NAMES.include?(field)
      val = val.sub(/\A\./, '') if field == 'domain' # ".example.com" => "example.com"
      send("#{field}=", val)
    end
  end

  # "name=value" followed by "; field=value" for every field that is set.
  def to_s
    text = "#{name}=#{value}"
    FIELDS.each do |field|
      current = send(field)
      text << "; #{field}=#{current}" if current
    end
    text
  end

  def inspect
    "\#<Cookie #{name}=#{value} from #{domain}#{path} until #{expires}>"
  end

  # Orders cookies by domain, then path, then name. Returns nil when
  # +anOther+ is not a Cookie or when a pair of attributes cannot be compared
  # (for example nil against a String).
  def <=>(anOther)
    return nil unless anOther.is_a?(Cookie)
    [domain, path, name] <=> [anOther.domain, anOther.path, anOther.name]
  end
end
113
+ end