RubyGems - rspider - Versions diffs - 0.8.4 - Mend

rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

data/Changelog +32 -0
data/Rakefile +66 -0
data/ToDo +19 -0
data/bin/linkcheck.rb +37 -0
data/bin/main.rb +41 -0
data/conf/local.conf +23 -0
data/lib/rspider.rb +34 -0
data/lib/rspider/ConfParser.rb +149 -0
data/lib/rspider/ContentStorage.rb +130 -0
data/lib/rspider/DataWasher.rb +129 -0
data/lib/rspider/Document.rb +100 -0
data/lib/rspider/DocumentExtractor.rb +21 -0
data/lib/rspider/HtmlTidy.rb +34 -0
data/lib/rspider/Logger.rb +49 -0
data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
data/lib/rspider/MysqlUrlStorage.rb +107 -0
data/lib/rspider/OptParser.rb +53 -0
data/lib/rspider/RobotRules.rb +92 -0
data/lib/rspider/SiteLocker.rb +45 -0
data/lib/rspider/Spider.rb +324 -0
data/lib/rspider/ThreadPool.rb +69 -0
data/lib/rspider/UrlDispatcher.rb +59 -0
data/lib/rspider/UrlScorer.rb +44 -0
data/lib/rspider/UrlStorage.rb +44 -0
data/lib/rspider/browser.rb +127 -0
data/lib/rspider/cookie.rb +113 -0
data/lib/rspider/links.rb +111 -0
data/lib/rspider/mysql.rb +1131 -0
data/sql/db.sql +90 -0
metadata +73 -0

data/Changelog ADDED

@@ -0,0 +1,32 @@
+--0.8.3 Thu Sep 11 21:22:02 CST 2008
+1.add timeout field when fetching a page
+2.package published as gem
+--0.8.2 Tue Sep  9 00:18:10 CST 2008
+1.move url Grabber functions to Spider class, so a Spider object can extract links itself;
+2.Add Url-relation-Storage support
+--0.8.1
+1. Add range field for Spider
+	So we avoid downloading very large files
+2. Add Cookie support for spider
+--0.8.0 Sat Sep  6 00:35:01 CST 2008
+1.optimized MysqlUrlStorage: add a memory cache, so it does not trigger so many mysql
+duplicate key errors
+2.Optimized urlGrabber, will generate fewer wrong urls.
+--0.7.9 Wed Sep  3 02:06:13 CST 2008
+1.add Local MysqlUrlStorage
+2.Add local Mysql Content Storage
+3.add Url score support, low score urls will have a low chance of being crawled.
+--0.7.8
+1.Add logger to log the user interrupts,download failings...
+2.Add SiteLocker to ensure the same site is not crawled at high frequency.
+--0.7.7
+CharsetGuess added, you can handle GBK files and store them as UTF-8 now
+--0.7.6
+ContentStorage added,Downloader switch to Net/HTTP
+--0.7.5
+added callback support, not completed
+--0.7.4
+add UrlStorage callback object
+--0.7.3
+robotRules added

data/Rakefile ADDED

@@ -0,0 +1,66 @@
+# The name of your project
+require 'rake/gempackagetask'
+PROJECT = "rspider"
+# Your name, used in packaging.
+MY_NAME = "Renlu Xu"
+# Your email address, used in packaging.
+MY_EMAIL = "xurenlu@gmail.com"
+# Short summary of your project, used in packaging.
+PROJECT_SUMMARY = "Web cralwer"
+# The project's package name (as opposed to its display name). Used for
+# RubyForge connectivity and packaging.
+UNIX_NAME = "rspider" # Your RubyForge user name.  RUBYFORGE_USER = ENV["RUBYFORGE_USER"] || "iam162"
+# Output directory for the rdoc html files.
+# If you don't have a custom homepage, and want to use the RDoc
+RDOC_FILES=FileList[]
+BIN_FILES=FileList["bin/*.rb"]
+GENERAL_RDOC_OPTS=""
+# Variable settings for extension support.
+EXT_DIR = "ext"
+HAVE_EXT = File.directory?(EXT_DIR)
+EXTCONF_FILES = FileList["#{EXT_DIR}/**/extconf.rb"]
+# Eventually add other files from EXT_DIR, like "MANIFEST"
+TEST_FILES = FileList["test/**/tc_*.rb"]
+DIST_FILES = FileList["lib/*/*.rb", "lib/rspider.rb","sql/*.sql","Changelog","ToDo","conf/local.conf"]
+DIST_FILES.include("Rakefile")
+# Don't package files which are autogenerated by RDocTask
+# Include extension source files.
+# Don't package temporary files, perhaps created by tests.
+DIST_FILES.exclude("**/temp_*", "**/*.tmp")
+# Don't get into recursion…
+DIST_FILES.exclude(/^(\.\/)?pkg(\/|$)/)
+REQUIRE_PATHS = ["lib"]
+REQUIRE_PATHS << EXT_DIR if HAVE_EXT
+$LOAD_PATH.concat(REQUIRE_PATHS)
+# This library file defines the MyProject::VERSION constant.
+require "#{UNIX_NAME}"
+#PROJECT_VERSION = "#{PROJECT}::#{VERSION}" # e.g., "1.0.2"
+PROJECT_VERSION="0.8.4"
+GEM_SPEC = Gem::Specification.new do |s|
+	s.name = UNIX_NAME
+	s.version = PROJECT_VERSION
+	s.summary = PROJECT_SUMMARY
+	s.rubyforge_project = UNIX_NAME
+	#s.homepage = "http://#{UNIX_NAME}.rubyforge.org/"
+	s.homepage = "http://www.162cm.com/"
+	s.author = MY_NAME
+	s.email = MY_EMAIL
+	s.files = DIST_FILES
+	s.test_files = TEST_FILES
+	s.executables = BIN_FILES.map { |fn| File.basename(fn) }
+	s.has_rdoc = true
+	s.extra_rdoc_files = RDOC_FILES
+	s.rdoc_options = GENERAL_RDOC_OPTS.to_a.flatten
+	if HAVE_EXT
+		s.extensions = EXTCONF_FILES
+		s.require_paths >> EXT_DIR
+	end
+end
+# Now we can generate the package-related tasks.
+Rake::GemPackageTask.new(GEM_SPEC) do |pkg|
+pkg.need_zip = true
+pkg.need_tar = true
+end

data/ToDo ADDED

@@ -0,0 +1,19 @@
+Mon Sep  1 10:52:10 CST 2008:
+    0.7.9 Rdoc++
+    0.7.10  全面更换为支持Mysql的存储(替换掉HDB,提高通用性)
+fixed ---  0.8.0   加上带Cookie抓取功能
+           We can crawl sites with cookies
+    0.8.1 Add Gem,Makefile and so on
+	0.8.2 Add Url relations storage
+fixed ---设定最长Url限制
+设定最大文档限制 避免下载rm,avi等大文件
+#记录300 Redirect 的URL
+HTMLTidy部分严重地泄露内存。考虑替换方案。
+#将UrlGrabber中的功能植入到spider类中来
+将仅记录url,content改为记录:
+	Keywords,Description,Charset...Summary
+除HTML之外,增加对Doc,XML,PDF,text的解析
+核查Cookie功能,并加入Load-cookie功能,将用文件记录Cookies
+发现Blog feed的功能

data/bin/linkcheck.rb ADDED

@@ -0,0 +1,37 @@
+#! /usr/bin/ruby
+require "lib/rspider"
+require "optparse"
+#require "profile"
+#debug $mem_profiler = MemoryProfiler.new
+$OPT=Rspider::OptParser.new(ARGV)
+if $OPT[:debug]=="on"
+	$DEBUG=true
+else
+	$DEBUG=false
+end
+conf=Rspider::SpiderConfParser.new($OPT[:conf])
+puts "Configuration file parsed!"
+spider=Rspider::Spider.new(conf)
+#spider.urlStorage= Rspider::UrlDispatcherClient.new("127.0.0.1",10001,conf["source"])
+#spider.urlStorage=Rspider::UrlStorage.new
+spider.logger=Rspider::Logger.new(conf["logger"])
+spider.browser=HTTPal::Browser.new(conf["agent"],conf["max_document_length"])
+#spider.contentStorage=Rspider::ContentStorage.new
+#spider.contentStorage=Rspider::HDBContentStorage.new(conf["save_path"])
+spider.contentStorage=Rspider::MysqlContentStorage.new(conf,conf["source"])
+spider.urlStorage=Rspider::MysqlUrlStorage.new(conf,conf["source"])
+spider.relationStorage=Rspider::MysqlUrlRelationStorage.new(conf,conf["source"])
+spider.on :failure do |url,resp|
+	puts "ERROR:#{url}"
+end
+#$tracker = CallTracker.new
+#$tracker.register(String, :new)
+pool=Rspider::ThreadPool.new(2)
+pool.dispatch() {
+	spider.start_from conf["urls"]
+}
+puts "threads inited!"
+pool.shutdown

data/bin/main.rb ADDED

@@ -0,0 +1,41 @@
+#! /usr/bin/ruby
+require "lib/rspider"
+require "optparse"
+require "lib/Gc"
+require 'rubygems'
+require 'bleak_house'
+#require "profile"
+$mem= MemoryProfiler.new
+$OPT=Rspider::OptParser.new(ARGV)
+if $OPT[:debug]=="on"
+	$DEBUG=true
+else
+	$DEBUG=false
+end
+puts "Configuration file parsed!"
+interrupted = false
+trap("SIGINT") { interrupted = true }
+def run(x=0)
+conf=Rspider::SpiderConfParser.new($OPT[:conf])
+	spider=Rspider::Spider.new(conf)
+	spider.logger=Rspider::Logger.new(conf["logger"])
+	spider.browser=HTTPal::Browser.new(conf["agent"],conf["max_document_length"])
+	spider.contentStorage=Rspider::MysqlContentStorage.new(conf,conf["source"])
+	spider.urlStorage=Rspider::MysqlUrlStorage.new(conf,conf["source"])
+	spider.relationStorage=Rspider::MysqlUrlRelationStorage.new(conf,conf["source"])
+	spider.start_from("http://localhost/search_doc/") if x == 2
+	spider.run(4)
+	spider.contentStorage.close
+	spider.urlStorage.close
+end
+j=1
+while(true) do
+	j=j+1
+	puts "-"*30
+	exit if interrupted
+	$mem.report
+	run(j)
+	exit if j>4
+end

data/conf/local.conf ADDED

@@ -0,0 +1,23 @@
+#Conf for spider  site :chin.bokee.com
+#urls="http://localhost/search_doc/"
+#urls="http://localhost/search_doc/soft/apache2.0/vhosts"
+#urls="http://localhost/search_doc/soft/apache2.0/sitemap.html"
+urls="http://localhost/search_doc/"
+#can_leave_domain must be "yes|no"
+can_leave_domain= "no"
+max_depth=4
+max_redirects=4
+save_path="./testdata/local.hdb"
+buckets=128
+source="localdb2"
+threads=10
+same_domain_regexp="localhost"
+logger="./testdata/local.log"
+agent="Rspider/1.0 (build 20080824,+http://www.162cm.com/)"
+url_max_length=130
+max_document_length=204800
+host="localhost"
+user="root"
+pass=""
+db="sphider2"
+timeout=3

data/lib/rspider.rb ADDED

@@ -0,0 +1,34 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL::	http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+#
+require "optparse"
+rspider_lib_files=["rspider/ConfParser",
+ "rspider/DataWasher",
+ "rspider/cookie",
+ "rspider/browser",
+ "rspider/HtmlTidy",
+ "rspider/Logger",
+ "rspider/OptParser",
+ "rspider/UrlDispatcher",
+ "rspider/Spider",
+ "rspider/SiteLocker",
+ "rspider/ThreadPool",
+ "rspider/RobotRules",
+ "rspider/UrlStorage",
+ "rspider/UrlScorer",
+ "rspider/ContentStorage",
+ "rspider/MysqlUrlStorage",
+ "rspider/mysql",
+ "rspider/MysqlUrlRelationStorage"
+]
+rspider_lib_dir=File.expand_path(File.dirname(__FILE__))+"/"
+rspider_lib_files.collect!{|f|
+	rspider_lib_dir+f
+}
+rspider_lib_files.each{|f|
+	require f
+}

data/lib/rspider/ConfParser.rb ADDED

@@ -0,0 +1,149 @@
+#!/usr/bin/env ruby
+# $Id: parseconfig.rb 37 2008-02-29 07:27:33Z wdierkes $
+#
+# Author::      BJ Dierkes <wdierkes@5dollarwhitebox.org>
+# Copyright::   Copyright (c) 2006,2007 5dollarwhitebox.org
+# License::     GPL
+# URL::         http://www.5dollarwhitebox.org
+#
+# This class was written to simplify the parsing of configuration
+# files in the format of "param = value".  Please review the
+# demo files included with this package.
+#
+# For further information please refer to the './doc' directory
+# as well as the ChangeLog and README files included.
+#
+module Rspider
+	class ConfParseError < Exception
+		attr_reader :errno,:error
+		def initialize(errno,error)
+			@errno=errno
+			@error=error
+			super error
+		end
+		def to_s
+			@error
+		end
+	end
+	class ConfParser < Hash
+	  Version = '0.4.2'
+	  # Initialize the class with the path to the 'config_file'
+	  # The class objects are dynamically generated by the
+	  # name of the 'param' in the config file.  Therefore, if
+	  # the config file is 'param = value' then the itializer
+	  # will eval "@param = value"
+	  #
+	  def initialize(config_file)
+		super()
+		@config_file = config_file
+		raise Errno::EACCES, "#{self.config_file} is not readable" unless File.readable?(self.config_file)
+		open(self.config_file).each { |line|
+		  line.chomp
+		  unless (/^\#/.match(line))
+			if(/\s*=\s*/.match(line))
+			  param, value = line.split(/\s*=\s*/, 2)
+			  var_name = "#{param}".chomp.strip
+			  value = value.chomp.strip
+			  new_value = ''
+			  if (value)
+				if value =~ /^['"](.*)['"]$/
+				  new_value = $1
+				else
+				  new_value = value
+				end
+			  else
+				new_value = ''
+			  end
+	#          self.instance_variable_set("@#{var_name}", new_value)
+				if self.has_key?(var_name)
+					self[var_name].push(new_value)
+				else
+					self[var_name]=[]
+					self[var_name].push(new_value)
+				end
+			  #self[var_name]=new_value
+			end
+		  end
+		}
+	  end
+	  # This method will provide the value held by the object "@param"
+	  # where "@param" is actually the name of the param in the config
+	  # file.
+	  def get_value(param)
+		self[param]
+	  end
+	  # This method is simple.  Should you need to override a value
+	  # dynamically, use override_value(param, value) where 'param' is
+	  # the name of the paramater in the config file.
+	  #
+	  def override_value(param, value)
+		self[param]=value
+	  end
+	  # This method will set the value of '@param' to nil (not in the config
+	  # file, only in the app).
+	  def nil_value(param)
+		self[param]=nil
+	  end
+	  def config_file=(config_file)
+		@config_file = config_file
+	  end
+	  def config_file()
+		@config_file
+	  end
+	  def to_s()
+			self.each{|k,v|
+				puts "#{k}:=> #{v}\n"
+			}
+	  end
+	end
+	# ConfParser for spider configuration files. After parsing, the settings
+	# listed below are replaced by a single value each (the last one given in
+	# the file) instead of the Array kept by ConfParser.
+	#
+	# Example configuration:
+	#   urls="http://www.coolcode.cn/"
+	#   can_leave_domain= "yes"
+	#   max_depth=4
+	#   save_path="/tmp/coolcode/"
+	#   buckets=128
+	#   source="coolcode"
+	#   threads=10
+	#   same_domain_regexp="\.coolcode\.cn"
+	class SpiderConfParser < ConfParser
+		# Settings converted with to_i.
+		INTEGER_KEYS = %w[max_depth max_redirects buckets threads
+			url_max_length max_document_length timeout].freeze
+		# Settings kept as strings; host, db, user and pass are the Mysql settings.
+		STRING_KEYS = %w[save_path source same_domain_regexp agent urls logger
+			host db user pass].freeze
+		# Parses +config_file+ and converts the known settings.
+		# Raises RuntimeError when a setting cannot be read, e.g. because it is
+		# missing from the file.
+		def initialize(config_file)
+			super(config_file)
+			begin
+				# Any capitalisation of "yes" enables leaving the domain.
+				self["can_leave_domain"] = (self["can_leave_domain"].pop.upcase=="YES")
+				INTEGER_KEYS.each { |key| self[key] = self[key].pop.to_i }
+				STRING_KEYS.each { |key| self[key] = self[key].pop }
+			rescue NoMethodError
+				# A missing setting is nil here, so the calls above raise NoMethodError.
+				raise "Some thing error while conf pop"
+			end
+		end
+	end
+end

data/lib/rspider/ContentStorage.rb ADDED

@@ -0,0 +1,130 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL::	http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+module Rspider
+#when you need the tokyocabinet ,remove the comment
+#	require 'tokyocabinet'
+	require "digest/md5"
+	#this class store the content of documents in Hash (memory)
+	#So program uses lots of memory and can play fast
+	#
+	class ContentStorage < Hash
+		def initialize()
+		end
+		#store an url and content of the url
+		def add(url,content)
+			self[url]=content
+		end
+		#list the urls
+		def urls
+			self.keys
+		end
+		#close the db
+		def close
+		end
+		#get the content of url #{url}
+		def get(url)
+			return self[url]
+		end
+	end
+	#this class store the content in tokyocabinet database
+	#so she can get perfect performance and uses little memory
+	class HDBContentStorage
+		#the file path to hold the HDB file
+		def initialize(path)
+			@hdb = TokyoCabinet::HDB::new
+			if(!@hdb.open(path, TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT))
+				ecode = @hdb.ecode
+				STDERR.printf("open error: %s\n", @hdb.errmsg(ecode))
+			end
+		end
+		#store an url and content of the url
+		def add(url,content)
+			@hdb.put(url,content)
+		end
+		#close the db
+		def close
+			@hdb.close
+		end
+		#list all the urls
+		def urls
+			@hdb.iterinit
+			keys=[]
+			while(key = @hdb.iternext)
+				keys<< key
+			end
+			keys
+		end
+		#fetch the content of specificed url
+		def get(url)
+			return @hdb.get(url)
+		end
+	end
+	class MysqlException < Exception
+		def to_s
+			return "Can't connect to mysql "
+		end
+	end
+	#class MysqlContentStorage 	store the content of urls in an msyql_db
+	class MysqlContentStorage
+		#get md5 hash of string
+		def md5(string)
+			t=Digest::MD5.new
+			t << string
+			t.to_s
+		end
+		#initialize the object
+		#hash must be an hash includes mysql connection information such as host,user,pass,database and so on
+		#source specific the task name
+		def initialize(hash,source="default")
+			@my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+			raise MysqlException if @my.nil?
+			@source=source
+		end
+		#store an url and content of the url
+		def add(url,content)
+			sql="INSERT INTO `htmls` (`source`,`url`,`url_crc32`,`html`,`html_crc32`,`created`,`ukey`)
+VALUES ('"+@my.quote(@source)+"','"+@my.quote(url)+"','0','"+@my.quote(content)+"','0','"+Time.now.to_i.to_s+"','"+@my.quote(md5(url)+@source)+"')"
+			begin
+				@my.query(sql)
+			rescue Mysql::Error =>e
+				return nil
+			end
+			return true
+		end
+		#get the content of url #{url}
+		def get(url)
+			sql="select html from htmls where ukey='"+@my.quote(md5(url)+@source)+"'"
+			begin
+				rs=@my.query(sql)
+				rs.each do |r|
+					return	r[0]
+				end
+			rescue Mysql::Error => e
+				return nil
+			end
+		end
+		#list the urls
+		#@return Array
+		def urls()
+			sql="select url from htmls where source='"+@my.quote(@source)+"'"
+			begin
+				rs=@my.query(sql)
+				keys=[]
+				rs.each do |r|
+					keys.push(r[0])
+				end
+				return keys
+			rescue Mysql::Error => e
+				return []
+			end
+		end
+		#close the database connection
+		def close()
+			@my.close
+		end
+	end
+end