rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
+ module Rspider
+   class OptParser < Hash
+     def initialize(args)
+       super()
+       self[:conf]=""
+       self[:env]="TEST"
+       self[:debug]="on"
+       opts=OptionParser.new do |opt|
+         opt.banner="Usage:#$0 [options]"
+
+         opt.on("-c","--conf [STRING]",
+                'The Configuration File') do |confFile|
+           if confFile.nil?
+             puts "Configuration not specified"
+             exit
+           end
+           confFile.chomp!
+           if confFile == ""
+             puts "No configuration file given"
+             exit
+           end
+           if !File.file?(confFile)
+             puts "Configuration #{confFile} does not exist"
+             exit
+           end
+           self[:conf]=confFile
+         end
+         opt.on("-e","--env [STRING]",
+                'The Environment') do |env|
+           if env.upcase == "PRO"
+             env="PRO"
+           else
+             env="TEST"
+           end
+           self[:env]=env
+         end
+         opt.on("-d","--debug [on|off]",'show debug messages') do |d|
+           if d.upcase == "ON"
+             d="on"
+           else
+             d="off"
+           end
+           self[:debug]=d
+         end
+         opt.on("-h","--help",'display this help and exit') do
+           puts opt
+           exit
+         end
+       end
+       opts.parse!(args)
+     end
+   end
+ end
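
For reference, a minimal usage sketch of Rspider::OptParser. It assumes the class above is already loaded and that Ruby's standard optparse library is required (this file itself does not require it); the option values are hypothetical:

require "optparse"
# (assumes Rspider::OptParser, defined above, is already loaded)

options = Rspider::OptParser.new(["-e", "pro", "-d", "off"])
options[:env]    #=> "PRO"
options[:debug]  #=> "off"
options[:conf]   #=> "" (default; -c must name an existing file or the parser exits)
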
@@ -0,0 +1,92 @@
+ # Understand robots.txt.
+
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+
+ require "uri"
+ module Rspider
+   # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+   class RobotRules
+     # user_agent is a string like 'Mozilla/IE6.0' or similar.
+     # It will be sent to the site in the request header:
+     # ============================
+     # GET /robots.txt HTTP/1.1
+     # HOST www.example.com
+     # User-Agent:#{user_agent}
+     # ===========================
+     def initialize( user_agent )
+       @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
+       @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+     end
+     # parse the data of a robots.txt file
+     def parse( text_uri, robots_data )
+       begin
+         uri = URI.parse(text_uri)
+       rescue Exception=>e
+         puts "-"*80
+         puts "\n"*3
+         puts e
+         puts "uri:#{text_uri}"
+       end
+       location = "#{uri.host}:#{uri.port}"
+       @rules.delete(location)
+
+       rules = robots_data.split(/[\015\012]+/).map do |rule|
+         rule.sub(/\s*#.*$/, "")
+       end
+       anon_rules = Array.new
+       my_rules = Array.new
+       current = anon_rules
+       rules.each do |rule|
+         case rule
+         when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+           break unless my_rules.empty?
+
+           current = if $1 == "*"
+             anon_rules
+           elsif $1.downcase.index(@user_agent)
+             my_rules
+           else
+             nil
+           end
+         when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+           next if current.nil?
+
+           if $1.empty?
+             current << nil
+           else
+             disallow = URI.parse($1)
+
+             next unless disallow.scheme.nil? or disallow.scheme ==
+                         uri.scheme
+             next unless disallow.port.nil? or disallow.port == uri.port
+             next unless disallow.host.nil? or
+                         disallow.host.downcase == uri.host.downcase
+
+             disallow = disallow.path
+             disallow = "/" if disallow.empty?
+             disallow = "/#{disallow}" unless disallow[0] == ?/
+
+             current << disallow
+           end
+         end
+       end
+
+       @rules[location] = if my_rules.empty?
+         anon_rules.compact
+       else
+         my_rules.compact
+       end
+     end
+     # decide whether we may crawl the url
+     def allowed?( text_uri )
+       uri = URI.parse(text_uri)
+       location = "#{uri.host}:#{uri.port}"
+       path = uri.path
+
+       return true unless %w{http https}.include?(uri.scheme)
+
+       not @rules[location].any? { |rule| path.index(rule) == 0 }
+     end
+   end
+ end
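
A small, self-contained sketch of how this RobotRules class behaves, assuming the class above is loaded; the robots.txt content below is made up for illustration:

require "uri"
# (assumes Rspider::RobotRules, defined above, is already loaded)

rules = Rspider::RobotRules.new("Rspider/0.8.4")
robots_txt = "User-Agent: *\nDisallow: /private/\n"
rules.parse("http://www.example.com/robots.txt", robots_txt)

rules.allowed?("http://www.example.com/index.html")      #=> true
rules.allowed?("http://www.example.com/private/a.html")  #=> false
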
@@ -0,0 +1,45 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ module Rspider
+ =begin rdoc
+ This class tracks visits to a site so the spider avoids hitting the same site too frequently.
+ =end
+
+   class SiteLocker
+     attr_accessor :site,:time,:max
+     # initialization
+     #
+     def initialize(max,site="www.example.com")
+       @visits=Hash.new
+       @max=max
+       @site=site
+       @time=5
+     end
+     # record that we have just visited the site
+     def visitedSite()
+       t=Time.now.to_i
+       @visits[t]=@visits[t].to_i+1
+     end
+     # may we visit the site again?
+     def canVisitSite?()
+       t=Time.now.to_i-@time
+       @visits.delete_if{|k,v|
+         k<(t-@time)
+       }
+       values=0
+       @visits.values.each{|v|
+         values = values +v
+       }
+       return values<@max
+     end
+     # dump the data structure to a string
+     def to_s
+       temp=@visits.collect{|k,v| "visits[#{k}]\t=>\t#{v}"}
+       temp.join("\n")
+     end
+   end
+ end
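
As a usage sketch (assuming the class above is loaded): a SiteLocker built with max=3 allows at most three visits inside its sliding time window, after which canVisitSite? returns false until old entries age out:

locker = Rspider::SiteLocker.new(3, "www.example.com")

5.times do |i|
  if locker.canVisitSite?
    locker.visitedSite            # record the visit
    puts "visit #{i}: allowed"
  else
    puts "visit #{i}: throttled"  # over the limit for the current window
  end
end
#=> visits 0-2 allowed, 3-4 throttled (when run within the same few seconds)
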
@@ -0,0 +1,324 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ require "uri"
+ require "open-uri"
+ require "net/http"
+ require "net/https"
+ require "cgi"
+
+ R_links_regexps=[/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i,
+   /(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i,
+   /(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]+(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i ]
+
+
+ module Rspider
+   # This class is the main class of the Rspider library.
+   # It crawls sites and stores urls and documents.
+   class Spider
+     attr_accessor :urls,:can_leave_domain,:threads,:max_depth,:buckets,:source,:same_domain_regexp,:urlStorage,:logger,:browser,:contentStorage,:siteLockers,:relationStorage
+     # Param conf must be a hash from ConfParser
+     def initialize(conf)
+       @can_leave_domain=conf["can_leave_domain"]
+       @max_depth=conf["max_depth"]
+       @max_redirects=conf["max_redirects"]
+       @save_path=conf["save_path"]
+       @buckets=conf["buckets"]
+       @source=conf["source"]
+       @threads=conf["threads"]
+       @same_domain_regexp=Regexp.new(conf["same_domain_regexp"])
+       @conf=conf
+       @accepted_formats="text/html,text/xml,text/plain"
+       @logger=nil
+       @contentStorage=nil
+       @tidy=HtmlTidy.new
+       @setup=nil
+       @teardown=nil
+       @callbacks=Hash.new
+       @siteLockers=Hash.new
+       @robotRules=RobotRules.new(@conf["agent"])
+       @visitedRules=[]
+       @badUrlSymbols=["@","<",">","(",")","$","*","[","]"]
+       @badUrlSymbols.collect!{
+         |k|
+         [k,CGI.escape(k)]
+       }
+     end
+     # main entrance of the class
+     def start_from(url)
+       @logger.log_msg "start url can't be crawled!" if ( $DEBUG and !urlCanBeCralwered?(url))
+       @urlStorage<<url if urlCanBeCralwered?(url)
+     end
+     def run(max_times=16)
+       j=0
+       while(true) do
+         # $tracker.stop($stdout) if interrupted
+         return if j > max_times
+         j = j + 1
+         url=@urlStorage.pop
+         # puts "fetched url:#{url}" if $DEBUG
+         puts "thread ended:no more urls" if ( url.nil? or url=="failed") and $DEBUG
+         @logger.log_msg("error:no more urls","ERROR") if (url.nil? or url=="failed")
+         exit if url=="failed"
+         exit if url.nil?
+         begin
+           uri=URI::parse(url)
+           domain=uri.host.to_s+":"+uri.port.to_s
+           @siteLockers[domain]=SiteLocker.new(12,domain) unless @siteLockers.has_key?(domain)
+           if @siteLockers[domain].canVisitSite?
+             fetch(url,0) if urlCanBeCralwered?(url)
+             #debug $mem_profiler.add url
+             #debug $mem_profiler.report
+             @siteLockers[domain].visitedSite
+           else
+             sleep 0.5
+             next
+           end
+         rescue Exception => e
+           puts "error:Exception #{e} " if $DEBUG
+         end
+       end
+     end
+     # whether the url can be crawled, obeying the rules from robots.txt
+     def urlCanBeCralwered?(url)
+       uri=URI::parse(url)
+       robot_url="#{uri.scheme}://#{uri.host}:#{uri.port}/robots.txt"
+       if !@visitedRules.include?(robot_url)
+         begin
+           content=@browser.get(URI::parse(robot_url))
+           return true if content.code == "404"
+           can_visit=@robotRules.parse(robot_url,content.body)
+           return can_visit
+         rescue URI::InvalidURIError => invalidUri
+           @logger.log_msg("invalid uri:#{url}")
+           return false
+         rescue Exception => e
+           puts "something went wrong!"
+           return true
+         end
+         @visitedRules << robot_url
+       end
+       @robotRules.allowed?(url)
+     end
+     # apply the registered callbacks
+     def do_callbacks(url, resp)
+       cbs = [@callbacks[:every],
+              resp.success? ? @callbacks[:success] : @callbacks[:failure],
+              @callbacks[resp.code.to_i]]
+       cbs.each do |cb|
+         cb.call(url, resp ) if cb
+       end
+     end
+     def setup(p = nil, &block)
+       @setup = p ? p : block
+     end
+
+     # Run last, once for each page. Given the URL as a string.
+     def teardown(p = nil, &block)
+       @teardown = p ? p : block
+     end
+
+     def on(code, p = nil, &block)
+       f = p ? p : block
+       case code
+       when Fixnum
+         @callbacks[code] = f
+       else
+         @callbacks[code.to_sym] = f
+       end
+     end
+     # execute the fetch task using the browser
+     def fetch(url,redirects=0,depth=0)
+       puts "reached the max depth" if depth > @max_depth
+       puts "reached the max redirects:redirects:#{redirects},max_redirects:#{@max_redirects}" if redirects > @max_redirects
+       @logger.log_msg( "reached the max depth:#{url}") if depth > @max_depth
+       @logger.log_msg( "reached the max redirects:#{url}") if redirects > @max_redirects
+       return 1 if depth > max_depth
+       return 1 if redirects > @max_redirects
+       resp=@browser.get(URI::parse(url))
+       do_callbacks(url,resp)
+       if resp.redirect?
+         new_url=gen_full_url(url,resp["Location"])
+         begin
+           @urlStorage<< new_url if urlCanBeCralwered?(new_url)
+           @relationStorage.save(url,new_url) if urlCanBeCralwered?(new_url)
+         rescue
+         end
+         fetch(new_url,redirects+1,depth)
+         return
+       end
+       if !resp.success?
+         @logger.log_msg("url fetch failed:#{url}") unless resp.success?
+         @urlStorage.error(url) if @urlStorage.respond_to? :error
+         return false
+       end
+       content=resp.body
+       #content=@tidy.tidy(content)
+
+       @contentStorage.add(url,content)
+       puts "content nil:#{url}" if $DEBUG and content.nil?
+       return if content.nil?
+       @urlStorage.visited(url)
+       allUrls=GrabLinksByRegex(content,url)
+       allUrls.delete_if { |u| !isGoodUrl(u) }
+       allUrls.each{|u|
+         begin
+           @urlStorage<< u if urlCanBeCralwered?(u)
+           @relationStorage.save(url,u) if urlCanBeCralwered?(u)
+         rescue
+         end
+       }
+     end
+     # whether the url points at an HTML page worth storing
+     def isGoodUrl(url)
+       @logger.log_msg "warning:url #{url} is too long to store." if url.length > @conf["url_max_length"]
+       return false if url.length > @conf["url_max_length"]
+       return false if (url =~ /\.gif$/)
+       return false if (url =~ /\.jpg$/)
+       return false if (url =~ /\.png$/)
+       return false if (url =~ /\.js$/)
+       return false if (url =~ /\.css$/)
+       if @can_leave_domain
+         return true
+       else
+         return true if urlInDomain(url)
+         false
+       end
+     end
+     # whether the url stays inside the domain
+     def urlInDomain(url)
+       return true if (url =~ @same_domain_regexp)
+       false
+     end
+
+     # get the base of a url
+     #@param url:: the url
+     def getBaseUrl(url)
+       return url if url =~ /\/$/
+       base=File.dirname(url)+"/"
+       if base == "http:/"
+         if url=~ /\/$/
+           base=url
+         else
+           base=url+ "/"
+         end
+       end
+       base
+     end
+     # Get the domain part of a url
+     #@param u:: url
+     def getDomainField(u)
+       u.sub!(/http:\/\//i,"")
+       ar=u.split("/")
+       domain=ar.shift
+       return "http://"+domain
+     end
+     # Get the links from html content
+     #@param u:: url of the content
+     #@return:: Array
+     def GrabLinksByW3c(html,u)
+       base=u.split("?").shift
+       urls=[]
+       url=""
+       BeautifulStoneSoup.new(html).find_all('a').each do |tag|
+         if tag["href"] =~ /http:/i
+           url = tag['href'] if tag['href']
+         elsif tag["href"] =~ /^\//
+           url = getDomainField(u) + tag["href"]
+         else
+           url = base+tag['href'] if tag['href']
+         end
+         url.gsub!(/\&amp;/,'&')
+         urls << url
+       end
+       return urls
+     end
+     # Get the links of html content by regexp
+     #@param html:: html content
+     #@param u:: original url of the html document
+     #@return:: Array
+     def GrabLinksByRegex(html,u)
+       base_url = (html.scan(/<base\s+href="(.*?)"/i).flatten)[0]
+       u=base_url unless base_url.nil?
+       base=getBaseUrl(u)
+       urls=[]
+       url=""
+       hrefs=scan_html_relative_links(html,base)
+       hrefs.each {|w|
+         next unless w
+         next if (w =~ /^#/)
+         next if w =~ /^mailto:/i
+         next if w =~ /^javascript:/i
+         w.gsub!(/([^#]+)#(.*)/,'\1') # remove the fragment after the '#'
+         if w =~ /([a-zA-Z]{3,6}):\/\//i
+           url = w
+         elsif w =~ /^\//
+           url = getDomainField(base) + w
+         else
+           url = base + w
+         end
+         url.gsub!(/\&amp;/,"&")
+         url=fixUrlValidate(url)
+         urls << url
+       }
+       urls
+     end
+     # build the full url for a link found on base_uri
+     def gen_full_url(base_uri,link)
+       base=getBaseUrl(base_uri)
+       link.gsub!("./","")
+       link.gsub!(/([^#]+)#(.*)/,'\1')
+       if !(link =~ /([a-zA-Z]{3,6}):\/\//i).nil?
+         url = link
+       elsif link =~ /^\//
+         url = getDomainField(base)+link
+       else
+         url = base + link
+       end
+       url
+     end
+     # change a url like http://www.sohu.com/../i/fin/./../index.html
+     # into http://www.sohu.com/i/index.html
+     def fixUrlValidate(u)
+       a=u.split("/")
+       n=[]
+       a.each{
+         |v|
+         next if v=="."
+         n.pop if v==".." and n.length>3
+         next if v==".."
+         n.push(v)
+       }
+       l=n.join("/")
+       @badUrlSymbols.each{|k|
+         l.gsub! k[0],k[1]
+       }
+       l
+     end
+     # return all the links
+     def scan_html_relative_links(html,base_url)
+       links=[]
+       R_links_regexps.each{ |r|
+         matches=html.scan(r)
+         matches.each{ |m|
+           links.push m[0] if m[3].nil? and m[0] != ""
+         }
+       }
+       links
+     end
+     def scan_html_simple_links(html,base_url)
+       r=Regexp.new('href=[\'\"]([^\'^\"^\s]*)[\'\"]')
+       links=[]
+       matches=html.scan(r)
+       matches.each{ |m|
+         links.push m[0]
+       }
+       links
+     end
+   end
+ end
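
fixUrlValidate is the piece that turns a link such as http://www.sohu.com/../i/fin/./../index.html into http://www.sohu.com/i/index.html and percent-escapes characters the storage layer cannot handle. Constructing a full Spider here would require the HtmlTidy, browser, and storage classes that are not part of this diff, so the following is a standalone sketch of the same path-normalization idea, not a call into the class above:

require "cgi"

# mirror of @badUrlSymbols: each raw character paired with its escaped form
BAD_URL_SYMBOLS = ["@", "<", ">", "(", ")", "$", "*", "[", "]"].map { |k| [k, CGI.escape(k)] }

def normalize_url(u)
  parts = []
  u.split("/").each do |v|
    next if v == "."                           # drop "current directory" segments
    parts.pop if v == ".." && parts.length > 3 # ".." removes the previous path segment,
    next if v == ".."                          # but never eats into "http://host"
    parts.push(v)
  end
  url = parts.join("/")
  BAD_URL_SYMBOLS.each { |raw, escaped| url.gsub!(raw, escaped) }
  url
end

normalize_url("http://www.sohu.com/../i/fin/./../index.html")
#=> "http://www.sohu.com/i/index.html"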