RubyGems - rspider - Versions diffs - 0.8.4 - Mend

rspider 0.8.4

Files changed (30) hide show

data/Changelog +32 -0
data/Rakefile +66 -0
data/ToDo +19 -0
data/bin/linkcheck.rb +37 -0
data/bin/main.rb +41 -0
data/conf/local.conf +23 -0
data/lib/rspider.rb +34 -0
data/lib/rspider/ConfParser.rb +149 -0
data/lib/rspider/ContentStorage.rb +130 -0
data/lib/rspider/DataWasher.rb +129 -0
data/lib/rspider/Document.rb +100 -0
data/lib/rspider/DocumentExtractor.rb +21 -0
data/lib/rspider/HtmlTidy.rb +34 -0
data/lib/rspider/Logger.rb +49 -0
data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
data/lib/rspider/MysqlUrlStorage.rb +107 -0
data/lib/rspider/OptParser.rb +53 -0
data/lib/rspider/RobotRules.rb +92 -0
data/lib/rspider/SiteLocker.rb +45 -0
data/lib/rspider/Spider.rb +324 -0
data/lib/rspider/ThreadPool.rb +69 -0
data/lib/rspider/UrlDispatcher.rb +59 -0
data/lib/rspider/UrlScorer.rb +44 -0
data/lib/rspider/UrlStorage.rb +44 -0
data/lib/rspider/browser.rb +127 -0
data/lib/rspider/cookie.rb +113 -0
data/lib/rspider/links.rb +111 -0
data/lib/rspider/mysql.rb +1131 -0
data/sql/db.sql +90 -0
metadata +73 -0

data/Changelog ADDED

@@ -0,0 +1,32 @@
+--0.8.3 Thu Sep 11 21:22:02 CST 2008
+1.add timeout field when fetching a page
+2.package published as gem
+--0.8.2 Tue Sep  9 00:18:10 CST 2008
+1.move url Grabber functions to Spider class, so a Spider object can extract links itself;
+2.Add Url-relation-Storage support
+--0.8.1
+1. Add range field for Spider
+	So we avoid downloading very large files
+2. Add Cookie support for spider
+--0.8.0 Sat Sep  6 00:35:01 CST 2008
+1.optimized MysqlUrlStorage: add a memory cache, so it does not trigger so many mysql
+duplicate key errors
+2.Optimized urlGrabber, which now generates fewer wrong urls.
+--0.7.9 Wed Sep  3 02:06:13 CST 2008
+1.add Local MysqlUrlStorage
+2.Add local Mysql Content Storage
+3.add Url score support; low-score urls will have a lower chance of being crawled.
+--0.7.8
+1.Add logger to log user interrupts, download failures...
+2.Add SiteLocker to ensure the same site is not crawled at high frequency.
+--0.7.7
+CharsetGuess added; you can now handle GBK files and store them as UTF-8
+--0.7.6
+ContentStorage added,Downloader switch to Net/HTTP
+--0.7.5
+added callback support (not completed)
+--0.7.4
+add UrlStorage callback object
+--0.7.3
+robotRules added

data/Rakefile ADDED

@@ -0,0 +1,66 @@
+# The name of your project
+require 'rake/gempackagetask'
+PROJECT = "rspider"
+# Your name, used in packaging.
+MY_NAME = "Renlu Xu"
+# Your email address, used in packaging.
+MY_EMAIL = "xurenlu@gmail.com"
+# Short summary of your project, used in packaging.
+PROJECT_SUMMARY = "Web cralwer"
+# The project's package name (as opposed to its display name). Used for
+# RubyForge connectivity and packaging.
+UNIX_NAME = "rspider" # Your RubyForge user name.  RUBYFORGE_USER = ENV["RUBYFORGE_USER"] || "iam162"
+# Output directory for the rdoc html files.
+# If you don't have a custom homepage, and want to use the RDoc
+RDOC_FILES=FileList[]
+BIN_FILES=FileList["bin/*.rb"]
+GENERAL_RDOC_OPTS=""
+# Variable settings for extension support.
+EXT_DIR = "ext"
+HAVE_EXT = File.directory?(EXT_DIR)
+EXTCONF_FILES = FileList["#{EXT_DIR}/**/extconf.rb"]
+# Eventually add other files from EXT_DIR, like "MANIFEST"
+TEST_FILES = FileList["test/**/tc_*.rb"]
+DIST_FILES = FileList["lib/*/*.rb", "lib/rspider.rb","sql/*.sql","Changelog","ToDo","conf/local.conf"]
+DIST_FILES.include("Rakefile")
+# Don't package files which are autogenerated by RDocTask
+# Include extension source files.
+# Don't package temporary files, perhaps created by tests.
+DIST_FILES.exclude("**/temp_*", "**/*.tmp")
+# Don't get into recursion…
+DIST_FILES.exclude(/^(\.\/)?pkg(\/|$)/)
+REQUIRE_PATHS = ["lib"]
+REQUIRE_PATHS << EXT_DIR if HAVE_EXT
+$LOAD_PATH.concat(REQUIRE_PATHS)
+# This library file defines the MyProject::VERSION constant.
+require "#{UNIX_NAME}"
+#PROJECT_VERSION = "#{PROJECT}::#{VERSION}" # e.g., "1.0.2"
+PROJECT_VERSION="0.8.4"
+GEM_SPEC = Gem::Specification.new do |s|
+	s.name = UNIX_NAME
+	s.version = PROJECT_VERSION
+	s.summary = PROJECT_SUMMARY
+	s.rubyforge_project = UNIX_NAME
+	#s.homepage = "http://#{UNIX_NAME}.rubyforge.org/"
+	s.homepage = "http://www.162cm.com/"
+	s.author = MY_NAME
+	s.email = MY_EMAIL
+	s.files = DIST_FILES
+	s.test_files = TEST_FILES
+	s.executables = BIN_FILES.map { |fn| File.basename(fn) }
+	s.has_rdoc = true
+	s.extra_rdoc_files = RDOC_FILES
+	s.rdoc_options = GENERAL_RDOC_OPTS.to_a.flatten
+	if HAVE_EXT
+		s.extensions = EXTCONF_FILES
+		s.require_paths >> EXT_DIR
+	end
+end
+# Now we can generate the package-related tasks.
+Rake::GemPackageTask.new(GEM_SPEC) do |pkg|
+pkg.need_zip = true
+pkg.need_tar = true
+end

data/ToDo ADDED

@@ -0,0 +1,19 @@
+Mon Sep  1 10:52:10 CST 2008:
+    0.7.9 Rdoc++
+    0.7.10  全面更换为支持Mysql的存储(替换掉HDB,提高通用性)
+fixed ---  0.8.0   加上带Cookie抓取功能
+           We can crawl sites with cookies
+    0.8.1 Add Gem,Makefile and so on
+	0.8.2 Add Url relations storage
+fixed ---设定最长Url限制
+设定最大文档限制 避免下载rm,avi等大文件
+#记录300 Redirect 的URL
+HTMLTidy部分严重地泄露内存。考虑替换方案。
+#将UrlGrabber中的功能植入到spider类中来
+将仅记录url,content改为记录:
+	Keywords,Description,Charset...Summary
+除HTML之外,增加对Doc,XML,PDF,text的解析
+核查Cookie功能,并加入Load-cookie功能,将用文件记录Cookies
+发现Blog feed的功能

data/bin/linkcheck.rb ADDED

@@ -0,0 +1,37 @@
+#! /usr/bin/ruby
+# Link checker: wires a spider to the Mysql-backed storages, registers a :failure
+# callback that prints the failing url, and starts crawling from the configured urls.
+#
+# The library is resolved relative to this script, so the script can be started
+# from any working directory (a bare "lib/rspider" only works from the project root).
+require File.join(File.dirname(File.expand_path(__FILE__)), "..", "lib", "rspider")
+require "optparse"
+#require "profile"
+#debug $mem_profiler = MemoryProfiler.new
+$OPT = Rspider::OptParser.new(ARGV)
+# Debug output is enabled only when the debug option is exactly "on".
+$DEBUG = ($OPT[:debug] == "on")
+conf = Rspider::SpiderConfParser.new($OPT[:conf])
+puts "Configuration file parsed!"
+spider = Rspider::Spider.new(conf)
+# Alternative url storages, kept for reference:
+#spider.urlStorage= Rspider::UrlDispatcherClient.new("127.0.0.1",10001,conf["source"])
+#spider.urlStorage=Rspider::UrlStorage.new
+spider.logger = Rspider::Logger.new(conf["logger"])
+spider.browser = HTTPal::Browser.new(conf["agent"], conf["max_document_length"])
+# Alternative content storages, kept for reference:
+#spider.contentStorage=Rspider::ContentStorage.new
+#spider.contentStorage=Rspider::HDBContentStorage.new(conf["save_path"])
+spider.contentStorage = Rspider::MysqlContentStorage.new(conf, conf["source"])
+spider.urlStorage = Rspider::MysqlUrlStorage.new(conf, conf["source"])
+spider.relationStorage = Rspider::MysqlUrlRelationStorage.new(conf, conf["source"])
+# Report every url handed to the :failure callback.
+spider.on(:failure) do |url, resp|
+	puts "ERROR:#{url}"
+end
+#$tracker = CallTracker.new
+#$tracker.register(String, :new)
+pool = Rspider::ThreadPool.new(2)
+pool.dispatch {
+	spider.start_from conf["urls"]
+}
+puts "threads inited!"
+pool.shutdown

data/bin/main.rb ADDED

@@ -0,0 +1,41 @@
+#! /usr/bin/ruby
+require "lib/rspider"
+require "optparse"
+require "lib/Gc"
+require 'rubygems'
+require 'bleak_house'
+#require "profile"
+$mem= MemoryProfiler.new
+$OPT=Rspider::OptParser.new(ARGV)
+if $OPT[:debug]=="on"
+	$DEBUG=true
+else
+	$DEBUG=false
+end
+puts "Configuration file parsed!"
+interrupted = false
+trap("SIGINT") { interrupted = true }
+def run(x=0)
+conf=Rspider::SpiderConfParser.new($OPT[:conf])
+	spider=Rspider::Spider.new(conf)
+	spider.logger=Rspider::Logger.new(conf["logger"])
+	spider.browser=HTTPal::Browser.new(conf["agent"],conf["max_document_length"])
+	spider.contentStorage=Rspider::MysqlContentStorage.new(conf,conf["source"])
+	spider.urlStorage=Rspider::MysqlUrlStorage.new(conf,conf["source"])
+	spider.relationStorage=Rspider::MysqlUrlRelationStorage.new(conf,conf["source"])
+	spider.start_from("http://localhost/search_doc/") if x == 2
+	spider.run(4)
+	spider.contentStorage.close
+	spider.urlStorage.close
+end
+j=1
+while(true) do
+	j=j+1
+	puts "-"*30
+	exit if interrupted
+	$mem.report
+	run(j)
+	exit if j>4
+end

data/conf/local.conf ADDED

@@ -0,0 +1,23 @@
+#Conf for spider  site :chin.bokee.com
+#urls="http://localhost/search_doc/"
+#urls="http://localhost/search_doc/soft/apache2.0/vhosts"
+#urls="http://localhost/search_doc/soft/apache2.0/sitemap.html"
+urls="http://localhost/search_doc/"
+#can_leave_domain must be "yes|no"
+can_leave_domain= "no"
+max_depth=4
+max_redirects=4
+save_path="./testdata/local.hdb"
+buckets=128
+source="localdb2"
+threads=10
+same_domain_regexp="localhost"
+logger="./testdata/local.log"
+agent="Rspider/1.0 (build 20080824,+http://www.162cm.com/)"
+url_max_length=130
+max_document_length=204800
+host="localhost"
+user="root"
+pass=""
+db="sphider2"
+timeout=3

data/lib/rspider.rb ADDED

@@ -0,0 +1,34 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL::	http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+#
+require "optparse"
+rspider_lib_files=["rspider/ConfParser",
+ "rspider/DataWasher",
+ "rspider/cookie",
+ "rspider/browser",
+ "rspider/HtmlTidy",
+ "rspider/Logger",
+ "rspider/OptParser",
+ "rspider/UrlDispatcher",
+ "rspider/Spider",
+ "rspider/SiteLocker",
+ "rspider/ThreadPool",
+ "rspider/RobotRules",
+ "rspider/UrlStorage",
+ "rspider/UrlScorer",
+ "rspider/ContentStorage",
+ "rspider/MysqlUrlStorage",
+ "rspider/mysql",
+ "rspider/MysqlUrlRelationStorage"
+]
+rspider_lib_dir=File.expand_path(File.dirname(__FILE__))+"/"
+rspider_lib_files.collect!{|f|
+	rspider_lib_dir+f
+}
+rspider_lib_files.each{|f|
+	require f
+}

data/lib/rspider/ConfParser.rb ADDED

@@ -0,0 +1,149 @@
+#!/usr/bin/env ruby
+# $Id: parseconfig.rb 37 2008-02-29 07:27:33Z wdierkes $
+#
+# Author::      BJ Dierkes <wdierkes@5dollarwhitebox.org>
+# Copyright::   Copyright (c) 2006,2007 5dollarwhitebox.org
+# License::     GPL
+# URL::         http://www.5dollarwhitebox.org
+#
+# This class was written to simplify the parsing of configuration
+# files in the format of "param = value".  Please review the
+# demo files included with this package.
+#
+# For further information please refer to the './doc' directory
+# as well as the ChangeLog and README files included.
+#
+module Rspider
+	class ConfParseError < Exception
+		attr_reader :errno,:error
+		def initialize(errno,error)
+			@errno=errno
+			@error=error
+			super error
+		end
+		def to_s
+			@error
+		end
+	end
+	class ConfParser < Hash
+	  Version = '0.4.2'
+	  # Initialize the class with the path to the 'config_file'
+	  # The class objects are dynamically generated by the
+	  # name of the 'param' in the config file.  Therefore, if
+	  # the config file is 'param = value' then the itializer
+	  # will eval "@param = value"
+	  #
+	  def initialize(config_file)
+		super()
+		@config_file = config_file
+		raise Errno::EACCES, "#{self.config_file} is not readable" unless File.readable?(self.config_file)
+		open(self.config_file).each { |line|
+		  line.chomp
+		  unless (/^\#/.match(line))
+			if(/\s*=\s*/.match(line))
+			  param, value = line.split(/\s*=\s*/, 2)
+			  var_name = "#{param}".chomp.strip
+			  value = value.chomp.strip
+			  new_value = ''
+			  if (value)
+				if value =~ /^['"](.*)['"]$/
+				  new_value = $1
+				else
+				  new_value = value
+				end
+			  else
+				new_value = ''
+			  end
+	#          self.instance_variable_set("@#{var_name}", new_value)
+				if self.has_key?(var_name)
+					self[var_name].push(new_value)
+				else
+					self[var_name]=[]
+					self[var_name].push(new_value)
+				end
+			  #self[var_name]=new_value
+			end
+		  end
+		}
+	  end
+	  # This method will provide the value held by the object "@param"
+	  # where "@param" is actually the name of the param in the config
+	  # file.
+	  def get_value(param)
+		self[param]
+	  end
+	  # This method is simple.  Should you need to override a value
+	  # dynamically, use override_value(param, value) where 'param' is
+	  # the name of the paramater in the config file.
+	  #
+	  def override_value(param, value)
+		self[param]=value
+	  end
+	  # This method will set the value of '@param' to nil (not in the config
+	  # file, only in the app).
+	  def nil_value(param)
+		self[param]=nil
+	  end
+	  def config_file=(config_file)
+		@config_file = config_file
+	  end
+	  def config_file()
+		@config_file
+	  end
+	  def to_s()
+			self.each{|k,v|
+				puts "#{k}:=> #{v}\n"
+			}
+	  end
+	end
+	# ConfParser for spider configuration files. After parsing, every known
+	# setting is collapsed from its list of values to the last value given, and
+	# converted to the type listed below.
+	#
+	# Example configuration:
+	#   urls="http://www.coolcode.cn/"
+	#   can_leave_domain= "yes"
+	#   max_depth=4
+	#   save_path="/tmp/coolcode/"
+	#   buckets=128
+	#   source="coolcode"
+	#   threads=10
+	#   same_domain_regexp="\.coolcode\.cn"
+	class SpiderConfParser < ConfParser
+		# Settings converted to integers with to_i.
+		INTEGER_SETTINGS = %w[max_depth max_redirects buckets threads url_max_length max_document_length timeout]
+		# Settings kept as strings; host, db, user and pass are the Mysql settings.
+		STRING_SETTINGS = %w[save_path source same_domain_regexp agent urls logger host db user pass]
+		# Parses +config_file+ and normalises the settings named above.
+		# "can_leave_domain" becomes true when its value is "yes" (any letter case)
+		# and false otherwise.
+		#
+		# Raises RuntimeError, naming the setting, when one of them is missing.
+		def initialize(config_file)
+			super(config_file)
+			self["can_leave_domain"] = (last_value("can_leave_domain").upcase == "YES")
+			INTEGER_SETTINGS.each { |key| self[key] = last_value(key).to_i }
+			STRING_SETTINGS.each { |key| self[key] = last_value(key) }
+		end
+		private
+		# Returns the last value parsed for +key+, raising when the key is absent.
+		def last_value(key)
+			values = self[key]
+			unless values.respond_to?(:pop)
+				raise "Some thing error while conf pop: required setting '#{key}' is missing"
+			end
+			values.pop
+		end
+	end
+end

data/lib/rspider/ContentStorage.rb ADDED

@@ -0,0 +1,130 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL::	http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+module Rspider
+#when you need the tokyocabinet ,remove the comment
+#	require 'tokyocabinet'
+	require "digest/md5"
+	#this class store the content of documents in Hash (memory)
+	#So program uses lots of memory and can play fast
+	#
+	class ContentStorage < Hash
+		def initialize()
+		end
+		#store an url and content of the url
+		def add(url,content)
+			self[url]=content
+		end
+		#list the urls
+		def urls
+			self.keys
+		end
+		#close the db
+		def close
+		end
+		#get the content of url #{url}
+		def get(url)
+			return self[url]
+		end
+	end
+	#this class store the content in tokyocabinet database
+	#so she can get perfect performance and uses little memory
+	class HDBContentStorage
+		#the file path to hold the HDB file
+		def initialize(path)
+			@hdb = TokyoCabinet::HDB::new
+			if(!@hdb.open(path, TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT))
+				ecode = @hdb.ecode
+				STDERR.printf("open error: %s\n", @hdb.errmsg(ecode))
+			end
+		end
+		#store an url and content of the url
+		def add(url,content)
+			@hdb.put(url,content)
+		end
+		#close the db
+		def close
+			@hdb.close
+		end
+		#list all the urls
+		def urls
+			@hdb.iterinit
+			keys=[]
+			while(key = @hdb.iternext)
+				keys<< key
+			end
+			keys
+		end
+		#fetch the content of specificed url
+		def get(url)
+			return @hdb.get(url)
+		end
+	end
+	class MysqlException < Exception
+		def to_s
+			return "Can't connect to mysql "
+		end
+	end
+	#class MysqlContentStorage 	store the content of urls in an msyql_db
+	class MysqlContentStorage
+		#get md5 hash of string
+		def md5(string)
+			t=Digest::MD5.new
+			t << string
+			t.to_s
+		end
+		#initialize the object
+		#hash must be an hash includes mysql connection information such as host,user,pass,database and so on
+		#source specific the task name
+		def initialize(hash,source="default")
+			@my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+			raise MysqlException if @my.nil?
+			@source=source
+		end
+		#store an url and content of the url
+		def add(url,content)
+			sql="INSERT INTO `htmls` (`source`,`url`,`url_crc32`,`html`,`html_crc32`,`created`,`ukey`)
+VALUES ('"+@my.quote(@source)+"','"+@my.quote(url)+"','0','"+@my.quote(content)+"','0','"+Time.now.to_i.to_s+"','"+@my.quote(md5(url)+@source)+"')"
+			begin
+				@my.query(sql)
+			rescue Mysql::Error =>e
+				return nil
+			end
+			return true
+		end
+		#get the content of url #{url}
+		def get(url)
+			sql="select html from htmls where ukey='"+@my.quote(md5(url)+@source)+"'"
+			begin
+				rs=@my.query(sql)
+				rs.each do |r|
+					return	r[0]
+				end
+			rescue Mysql::Error => e
+				return nil
+			end
+		end
+		#list the urls
+		#@return Array
+		def urls()
+			sql="select url from htmls where source='"+@my.quote(@source)+"'"
+			begin
+				rs=@my.query(sql)
+				keys=[]
+				rs.each do |r|
+					keys.push(r[0])
+				end
+				return keys
+			rescue Mysql::Error => e
+				return []
+			end
+		end
+		#close the database connection
+		def close()
+			@my.close
+		end
+	end
+end