rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+
8
+ require 'set'
9
+ require "thread"
10
+ module Rspider
11
+ =begin rdoc
12
+ This class manages a group of threads and limits how many of them run at the same time,
13
+ so that the CPU load does not become too heavy.
14
+ === Examples
15
+ * tp=ThreadPool.new(3)
16
+ *
17
+ * 1.upto(30) { |j|
18
+ * t=tp.dispatch(j) {|i|
19
+ * puts "thread ##{i} start"
20
+ * sleep rand
21
+ * puts "thread ##{i} end"
22
+ * }
23
+ * }
24
+ * tp.shutdown
25
+ =end
26
+ class ThreadPool
27
+ #initialize method
28
+ #Param: max_size: Max threads can be active at same time
29
+ def initialize(max_size)
30
+ @pool=[]
31
+ @max_size=max_size
32
+ @pool_mutex=Mutex.new
33
+ @pool_cv=ConditionVariable.new
34
+ end
35
+ #Add a new thread to the pool
36
+ #
37
+ def dispatch(*args)
38
+ Thread.new do
39
+ @pool_mutex.synchronize do
40
+ while @pool.size >=@max_size
41
+ print "pool full;waiting run #{args.join(',')}...\n" if $DEBUG
42
+ @pool_cv.wait(@pool_mutex)
43
+ end
44
+ end
45
+ @pool << Thread.current
46
+ begin
47
+ yield(* args)
48
+ rescue => e
49
+ exception(self,e,*args)
50
+ ensure
51
+ @pool_mutex.synchronize do
52
+ @pool.delete(Thread.current)
53
+ @pool_cv.signal
54
+ end
55
+ end
56
+ end
57
+ end
58
+ #wait all the threads to exit
59
+ def shutdown
60
+ @pool_mutex.synchronize {
61
+ @pool_cv.wait(@pool_mutex) until @pool.empty?
62
+ }
63
+ end
64
+ #we got an error
65
+ def exception (thread,exception,*original_args)
66
+ puts "Exception in thread #{thread}:#{exception}"
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,59 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+
8
+
9
+ require "socket"
10
+ module Rspider
11
+ =begin rdoc
12
+ Base class of UrlDispatchers
13
+ =end
14
+ class UrlDispatcher
15
+ #@param host:: Host ,like '127.0.0.1'
16
+ #@param port:: Port ,like 1099
17
+ def initialize(host,port,source)
18
+ @host=host
19
+ @port=port
20
+ @source=source
21
+ @visited=[]
22
+ @urlsToVisit=[]
23
+ end
24
+ def AddUrl(url,site)
25
+ end
26
+ def FetchUnvisitedUrl(site)
27
+ end
28
+ def ReportWhenVisited(url,site)
29
+ end
30
+ def IfUrlVisited(url,site)
31
+ end
32
+ def getResponse(msg)
33
+ socket = TCPSocket.new(@host,@port)
34
+ socket.puts(msg)
35
+ line = socket.gets
36
+ return line
37
+ end
38
+ end
39
+
40
+ class UrlDispatcherClient < UrlDispatcher
41
+
42
+ def <<(url)
43
+ getResponse("add #{url} #{@source}")
44
+ end
45
+ def pop()
46
+ u=getResponse("fetch #{@source}")
47
+ return u.strip unless u.nil?
48
+ end
49
+ def visited(url)
50
+ getResponse("mark_visited #{url} #{@source}")
51
+ end
52
+ def visited?(url)
53
+
54
+ end
55
+ end
56
+
57
+ class UrlDispatcherServer < UrlDispatcher
58
+ end
59
+ end
@@ -0,0 +1,44 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+ module Rspider
8
+ class UrlScorer
9
+ def initialize
10
+ end
11
+ #caculate the score of url
12
+ def UrlScorer.score(url)
13
+ UrlScorer.scoreDomain(url) + UrlScorer.scoreExt(url) + UrlScorer.scoreDepth(url)
14
+ end
15
+ #caculate the score of domain
16
+ def UrlScorer.scoreDomain(url)
17
+ u=url.gsub(/http:\/\/([^\/]+)\/(.*)/,'\1')
18
+ return 5 if u =~ /org$/
19
+ return 4 if u =~ /org\.cn$/
20
+ return 2 if u =~ /cn$/
21
+ return 3
22
+ end
23
+ #caculate the score of the extension name of file
24
+ def UrlScorer.scoreExt(url)
25
+ ext=File.extname(url).split("?").shift.to_s
26
+ if (ext == ".jsp" or ext == ".asp" or ext == ".php" or ext == ".aspx" or ext == ".cgi" or ext == ".pl")
27
+ return 2
28
+ elsif (ext==".html" or ext == ".html" or ext == ".shtml")
29
+ return 3
30
+ elsif (ext == ".doc" or ext == ".ppt" or ext == ".zip" )
31
+ return 4
32
+ else
33
+ return 2
34
+ end
35
+ end
36
+ #caculate the score of the url's file depth
37
+ def UrlScorer.scoreDepth(url)
38
+ u=url.gsub("http://","")
39
+ depth=u.split("/").length
40
+ return 1 if depth > 4
41
+ return 5-depth
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,44 @@
1
+ module Rspider
2
+ class UrlStorage
3
+ attr_accessor :urls,:visitedUrls
4
+ def initialize()
5
+ @visitedUrls=[]
6
+ @urls=[]
7
+ end
8
+ def <<(u)
9
+ #puts "add u:#{u}"
10
+ return nil if @visitedUrls.include?(u)
11
+ return nil if @urls.include?(u)
12
+ @urls<< u
13
+ end
14
+ def visited?(u)
15
+ @visitedUrls.include?(u)
16
+ end
17
+ def pop()
18
+ @urls[rand(@urls.length-1)]
19
+ end
20
+ def visited(u)
21
+ @urls.delete(u) { puts "not found when delete #{u}"}
22
+ @visitedUrls<<u
23
+ end
24
+ def error(url)
25
+ @urls.delete(url)
26
+ end
27
+ end
28
+ class UrlStorageInMemcache
29
+ def initialize()
30
+ @cache=MemCache.new "localhost:11211",:namespace=>"hel"
31
+ end
32
+ def <<(u)
33
+ @cache.set("#{u}","N",86400,true)
34
+ end
35
+ def pop()
36
+ end
37
+ def visited(u)
38
+ @cache.set("#{u}","Y",86400,true)
39
+ end
40
+ def visited?(u)
41
+ @cache.get("#{u}",true) == "Y"
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,127 @@
1
+ #
2
+ require "net/http"
3
+ require "open-uri"
4
+ module Net #:nodoc:
5
+ class HTTPResponse #:nodoc:
6
+ def success?; false; end
7
+ def redirect?; false; end
8
+ end
9
+ class HTTPSuccess #:nodoc:
10
+ def success?; true; end
11
+ end
12
+ class HTTPRedirection #:nodoc:
13
+ def redirect?; true; end
14
+ end
15
+ end
16
+ # httpal/lib/httpal/browser.rb
17
+ #
18
+ # Created by Bryce Kerley on 2006-12-23.
19
+ # Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
20
+ # This software is licensed under the GNU LGPL 2.1 .
21
+ # See COPYING in the httpal root for the full license.
22
+ #
23
+ # Subversion info:
24
+ # $Id: browser.rb 28 2007-03-23 15:41:07Z bkerley $
25
+ module HTTPal
26
+ class Browser
27
+ attr_accessor :read_timeout,:open_timeout
28
+ def initialize(ua="Mozilla/Firefox 2.0.11",max_len=2048)
29
+ @cookies = []
30
+ @referer = nil
31
+ @userAgent=ua
32
+ @max_len=max_len-1
33
+ @open_timeout=5
34
+ @read_timeout=5
35
+ end
36
+
37
+ def use(&block)
38
+ instance_eval(&block)
39
+ end
40
+
41
+ def get(uri)
42
+ uri,path = parseuri(uri)
43
+ req = Net::HTTP::Get.new(path)
44
+ send_request(req, uri)
45
+ end
46
+
47
+ def post(uri, fields)
48
+ uri,path = parseuri(uri)
49
+ req = Net::HTTP::Post.new(path)
50
+ req.set_form_data fields
51
+ send_request(req, uri)
52
+ end
53
+
54
+ private
55
+
56
+ def parseuri(uri)
57
+ uri = URI.parse(uri) unless uri.is_a? URI::HTTP
58
+ path = uri.path.empty? ? '/' : uri.path
59
+ path << '?' << uri.query if uri.query
60
+ return [uri, path]
61
+ end
62
+
63
+ def send_request(req, uri)
64
+ req['cookie'] = get_cookies_for_uri(uri)
65
+ req['referer'] =get_referer
66
+ req["User-Agent"]=@userAgent
67
+ req["Range"]="bytes=0-#{@max_len}"
68
+ default_port = uri.scheme == "https" ? "443" : "80"
69
+ http = Net::HTTP.new(uri.host, uri.port ? uri.port : default_port)
70
+ http.open_timeout=@open_timeout
71
+ http.read_timeout=@read_timeout
72
+ http.use_ssl = true if uri.scheme == "https"
73
+
74
+ res = http.start {
75
+ |h|
76
+ h.request(req)
77
+ }
78
+ set_cookie_for_uri(uri, res['set-cookie'])
79
+ set_referer(uri)
80
+
81
+ return res
82
+ end
83
+
84
+ def get_referer
85
+ @referer if @referer
86
+ ""
87
+ end
88
+
89
+ def set_referer(referer)
90
+ @referer = referer.to_s
91
+ end
92
+
93
+ def get_cookies_for_uri(uri)
94
+ hs = uri.host.split('.')
95
+ hm = (1..hs.length).inject([]) do |c,n|
96
+ c << hs[(-n)..-1].join('.')
97
+ end
98
+ ps = uri.path.split('/')
99
+ pm = ['/']
100
+ ps.length.times do |n|
101
+ pm << ps[0..n].join('/')
102
+ end
103
+ pm.delete ''
104
+
105
+ hostmatch = @cookies.inject([]) do |set, cur|
106
+ set << cur if hm.include? cur.domain
107
+ set
108
+ end
109
+
110
+ # TODO: PATH-MATCH COOKIES
111
+ end
112
+
113
+ def set_cookie_for_uri(uri, setcookie)
114
+ return unless setcookie
115
+ newcookies = CookieMonster.parse_set_cookie(setcookie)
116
+ @cookies.instance_eval do
117
+ newcookies.each do |c|
118
+ c.domain = uri.host unless c.domain
119
+ self[index(c)] = c if include?(c)
120
+ self << c unless include?(c)
121
+ end
122
+ end
123
+
124
+ end
125
+
126
+ end
127
+ end
@@ -0,0 +1,113 @@
1
+ #
2
+ # httpal/lib/httpal/cookie.rb
3
+ #
4
+ # Created by Bryce Kerley on 2006-12-23.
5
+ # Copyright (c) 2006-2007 Bryce Kerley. All rights reserved.
6
+ # This software is licensed under the GNU LGPL 2.1 .
7
+ # See COPYING in the httpal root for the full license.
8
+ #
9
+ # Subversion info:
10
+ # $Id: cookie.rb 20 2007-02-08 02:13:16Z bkerley $
11
+ module HTTPal
12
+ class CookieMonster
13
+ def self.parse_set_cookie(sc)
14
+ r = []
15
+ # need to figure out how to do the multi-cookie split
16
+ # since the comma separates both cookies and dates
17
+
18
+ # TODO: IMPLEMENT A LESS BAD SOLUTION
19
+ sc = sc.gsub(/expires=(\w\w\w)\,/,'expires=\1')
20
+
21
+ sc.split(',').each do |str|
22
+ str.chomp!
23
+ r << Cookie.new(str)
24
+ end
25
+ return r
26
+ end
27
+ end
28
+ class CookieManager
29
+ def initialize
30
+ @cookies=[]
31
+ end
32
+ def get_cookies_for_uri(uri)
33
+ hs = uri.host.split('.')
34
+ hm = (1..hs.length).inject([]) do |c,n|
35
+ c << hs[(-n)..-1].join('.')
36
+ end
37
+ ps = uri.path.split('/')
38
+ pm = ['/']
39
+ ps.length.times do |n|
40
+ pm << ps[0..n].join('/')
41
+ end
42
+ pm.delete ''
43
+
44
+ hostmatch = @cookies.inject([]) do |set, cur|
45
+ set << cur if hm.include? cur.domain
46
+ set
47
+ end
48
+
49
+ # TODO: PATH-MATCH COOKIES
50
+ end
51
+
52
+ def set_cookie_for_uri(uri, setcookie)
53
+ return unless setcookie
54
+ newcookies = CookieMonster.parse_set_cookie(setcookie)
55
+ @cookies.instance_eval do
56
+ newcookies.each do |c|
57
+ c.domain = uri.host unless c.domain
58
+ self[index(c)] = c if include?(c)
59
+ self << c unless include?(c)
60
+ end
61
+ end
62
+
63
+ end
64
+ end
65
+ class Cookie
66
+ @@fields = [:path, :domain, :expires]
67
+ attr_accessor :name, :value
68
+ attr_accessor *@@fields
69
+ def initialize(string)
70
+ components = string.split(';')
71
+ firstpair = nil
72
+ components.each do |p|
73
+ sp = p.split('=',2)
74
+ k = sp[0]
75
+ v = sp[1]
76
+ k.strip!
77
+ v.strip!
78
+ if firstpair == nil
79
+ send(:name=, k)
80
+ send(:value=, v)
81
+ firstpair = true
82
+ else
83
+ next unless @@fields.include? k.downcase.to_sym
84
+ v = v.gsub(/^\./,'') if k.downcase.to_sym == :domain
85
+ send((k+'=').downcase.to_sym, v)
86
+ end
87
+ end
88
+ end
89
+
90
+ def to_s
91
+ r = "#{name}=#{value}"
92
+ @@fields.each do |f|
93
+ r << "; #{f.to_s}=#{send(f)}" if send(f)
94
+ end
95
+ return r
96
+ end
97
+
98
+ def inspect
99
+ "\#<Cookie #{name}=#{value} from #{domain}#{path} until #{expires}>"
100
+ end
101
+
102
+ include Comparable
103
+ def <=> (anOther)
104
+ #sort by domain, then path, then name
105
+ d = self.domain.<=>(anOther.domain)
106
+ p = self.path.<=>(anOther.path)
107
+ n = self.name.<=>(anOther.name)
108
+ return d unless d == 0
109
+ return p unless p == 0
110
+ return n
111
+ end
112
+ end
113
+ end