rspider 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,32 @@
1
+
2
+ --0.8.3 Thu Sep 11 21:22:02 CST 2008
3
+ 1.add timeout field when fetching a page
4
+ 2.package published as gem
5
+ --0.8.2 Tue Sep 9 00:18:10 CST 2008
6
+ 1. Move URL Grabber functions to the Spider class, so a Spider object can extract links itself;
7
+ 2.Add Url-relation-Storage support
8
+ --0.8.1
9
+ 1. Add range field for Spider
10
+ So we avoid downloading very large files
11
+ 2. Add Cookie support for spider
12
+ --0.8.0 Sat Sep 6 00:35:01 CST 2008
13
+ 1. Optimized MysqlUrlStorage: added a memory cache, so it no longer triggers so many MySQL
14
+ duplicate key errors
15
+ 2. Optimized UrlGrabber; it will generate fewer wrong URLs.
16
+ --0.7.9 Wed Sep 3 02:06:13 CST 2008
17
+ 1.add Local MysqlUrlStorage
18
+ 2.Add local Mysql Content Storage
19
+ 3. Add URL score support; low-score URLs will have a lower chance of being crawled.
20
+ --0.7.8
21
+ 1.Add logger to log the user interrupts,download failings...
22
+ 2. Add SiteLocker to ensure the same site is not crawled at high frequency.
23
+ --0.7.7
24
+ CharsetGuess added; you can now handle GBK files and store them as UTF-8
25
+ --0.7.6
26
+ ContentStorage added,Downloader switch to Net/HTTP
27
+ --0.7.5
28
+ added callback support (not completed)
29
+ --0.7.4
30
+ add UrlStorage callback object
31
+ --0.7.3
32
+ robotRules added
@@ -0,0 +1,66 @@
1
# Rakefile for the rspider gem: declares the packaging metadata, collects the
# files to distribute and defines the gem/zip/tar packaging tasks.
#
# NOTE(review): newer Rake releases appear to ship this task as
# Gem::PackageTask (rubygems/package_task) -- confirm against the Rake
# version this project is built with before changing the require.
require 'rake/gempackagetask'

# The name of your project.
PROJECT = "rspider"
# Your name, used in packaging.
MY_NAME = "Renlu Xu"
# Your email address, used in packaging.
MY_EMAIL = "xurenlu@gmail.com"
# Short summary of your project, used in packaging.
PROJECT_SUMMARY = "Web cralwer"
# The project's package name (as opposed to its display name). Used for
# RubyForge connectivity and packaging.
UNIX_NAME = "rspider"
# Your RubyForge user name. (This assignment used to be fused into the
# trailing comment of the UNIX_NAME line and was therefore never executed.)
RUBYFORGE_USER = ENV["RUBYFORGE_USER"] || "iam162"

# Extra files handed to RDoc (currently none).
RDOC_FILES = FileList[]
# Scripts shipped as gem executables.
BIN_FILES = FileList["bin/*.rb"]
# Extra RDoc command-line options (currently none).
GENERAL_RDOC_OPTS = ""

# Variable settings for extension support.
EXT_DIR = "ext"
HAVE_EXT = File.directory?(EXT_DIR)
EXTCONF_FILES = FileList["#{EXT_DIR}/**/extconf.rb"]
# Eventually add other files from EXT_DIR, like "MANIFEST"
TEST_FILES = FileList["test/**/tc_*.rb"]

DIST_FILES = FileList["lib/*/*.rb", "lib/rspider.rb", "sql/*.sql", "Changelog", "ToDo", "conf/local.conf"]
DIST_FILES.include("Rakefile")
# Don't package temporary files, perhaps created by tests.
DIST_FILES.exclude("**/temp_*", "**/*.tmp")
# Don't get into recursion: never package the package output directory.
DIST_FILES.exclude(/^(\.\/)?pkg(\/|$)/)

REQUIRE_PATHS = ["lib"]
REQUIRE_PATHS << EXT_DIR if HAVE_EXT
$LOAD_PATH.concat(REQUIRE_PATHS)
# Load the library itself so the Rakefile fails early if it cannot be required.
require UNIX_NAME
# The version is hard-coded here rather than read from the library.
#PROJECT_VERSION = "#{PROJECT}::#{VERSION}" # e.g., "1.0.2"
PROJECT_VERSION = "0.8.4"

GEM_SPEC = Gem::Specification.new do |s|
  s.name = UNIX_NAME
  s.version = PROJECT_VERSION
  s.summary = PROJECT_SUMMARY
  s.rubyforge_project = UNIX_NAME
  #s.homepage = "http://#{UNIX_NAME}.rubyforge.org/"
  s.homepage = "http://www.162cm.com/"
  s.author = MY_NAME
  s.email = MY_EMAIL
  s.files = DIST_FILES
  s.test_files = TEST_FILES
  s.executables = BIN_FILES.map { |fn| File.basename(fn) }
  s.has_rdoc = true
  s.extra_rdoc_files = RDOC_FILES
  # Accept a String, Array or Hash of options. String#to_a is gone on
  # Ruby 1.9+, so go through Array() and drop empty entries instead.
  s.rdoc_options = Array(GENERAL_RDOC_OPTS).flatten.reject { |opt| opt.to_s.empty? }
  if HAVE_EXT
    s.extensions = EXTCONF_FILES
    # Append the extension directory. The previous `>>` is not an Array
    # method and raised NoMethodError whenever ext/ existed.
    s.require_paths << EXT_DIR
  end
end

# Now we can generate the package-related tasks.
Rake::GemPackageTask.new(GEM_SPEC) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end
data/ToDo ADDED
@@ -0,0 +1,19 @@
1
+ Mon Sep 1 10:52:10 CST 2008:
2
+ 0.7.9 Rdoc++
3
+ 0.7.10 全面更换为支持Mysql的存储(替换掉HDB,提高通用性)
4
+ fixed --- 0.8.0 加上带Cookie抓取功能
5
+ We can crawl sites with cookies
6
+
7
+ 0.8.1 Add Gem,Makefile and so on
8
+ 0.8.2 Add Url relations storage
9
+ fixed ---设定最长Url限制
10
+ 设定最大文档限制 避免下载rm,avi等大文件
11
+ #记录300 Redirect 的URL
12
+ HTMLTidy部分严重地泄露内存。考虑替换方案。
13
+ #将UrlGrabber中的功能植入到spider类中来
14
+
15
+ 将仅记录url,content改为记录:
16
+ Keywords,Description,Charset...Summary
17
+ 除HTML之外,增加对Doc,XML,PDF,text的解析
18
+ 核查Cookie功能,并加入Load-cookie功能,将用文件记录Cookies
19
+ 发现Blog feed的功能
@@ -0,0 +1,37 @@
1
#! /usr/bin/ruby
# Crawl driver: parses the command line, loads the spider configuration it
# names, wires a Spider to its logger, browser and storage objects, and
# starts crawling from the configured URLs inside a small thread pool.
require "lib/rspider"
require "optparse"

#require "profile"
#debug $mem_profiler = MemoryProfiler.new

$OPT = Rspider::OptParser.new(ARGV)
# Debug output is enabled only by an explicit "on".
$DEBUG = ($OPT[:debug] == "on")

conf = Rspider::SpiderConfParser.new($OPT[:conf])
puts "Configuration file parsed!"

source = conf["source"]
spider = Rspider::Spider.new(conf)
spider.logger = Rspider::Logger.new(conf["logger"])
spider.browser = HTTPal::Browser.new(conf["agent"], conf["max_document_length"])

# Alternative back ends that were tried earlier, kept for reference:
#   spider.urlStorage = Rspider::UrlDispatcherClient.new("127.0.0.1",10001,conf["source"])
#   spider.urlStorage = Rspider::UrlStorage.new
#   spider.contentStorage = Rspider::ContentStorage.new
#   spider.contentStorage = Rspider::HDBContentStorage.new(conf["save_path"])
spider.contentStorage = Rspider::MysqlContentStorage.new(conf, source)
spider.urlStorage = Rspider::MysqlUrlStorage.new(conf, source)
spider.relationStorage = Rspider::MysqlUrlRelationStorage.new(conf, source)

# Report every URL whose download failed.
spider.on(:failure) { |url, resp| puts "ERROR:#{url}" }

#$tracker = CallTracker.new
#$tracker.register(String, :new)

pool = Rspider::ThreadPool.new(2)
pool.dispatch do
  spider.start_from conf["urls"]
end
puts "threads inited!"
pool.shutdown
@@ -0,0 +1,41 @@
1
#! /usr/bin/ruby
# Memory-profiling harness: builds a fresh Spider several times in a row and
# prints a MemoryProfiler report before each pass.
require "lib/rspider"
require "optparse"
require "lib/Gc"

require 'rubygems'
require 'bleak_house'

#require "profile"
$mem = MemoryProfiler.new
$OPT = Rspider::OptParser.new(ARGV)
$DEBUG = ($OPT[:debug] == "on")
puts "Configuration file parsed!"

# Set from the SIGINT handler; checked at the start of each pass.
interrupted = false
trap("SIGINT") { interrupted = true }

# Builds a fully wired spider from the configuration file, runs it, then
# closes the content and URL storages.
#
# x:: pass number; the crawl is seeded with the start URL only when x is 2.
def run(x=0)
  conf = Rspider::SpiderConfParser.new($OPT[:conf])
  source = conf["source"]

  spider = Rspider::Spider.new(conf)
  spider.logger = Rspider::Logger.new(conf["logger"])
  spider.browser = HTTPal::Browser.new(conf["agent"], conf["max_document_length"])
  spider.contentStorage = Rspider::MysqlContentStorage.new(conf, source)
  spider.urlStorage = Rspider::MysqlUrlStorage.new(conf, source)
  spider.relationStorage = Rspider::MysqlUrlRelationStorage.new(conf, source)

  spider.start_from("http://localhost/search_doc/") if x == 2
  spider.run(4)

  spider.contentStorage.close
  spider.urlStorage.close
end

# Passes are numbered 2 through 5; the script exits after the last one, or
# at the start of a pass once an interrupt has been seen.
2.upto(5) do |pass|
  puts "-" * 30
  exit if interrupted
  $mem.report
  run(pass)
end
exit
@@ -0,0 +1,23 @@
1
+ #Conf for spider site :chin.bokee.com
2
+ #urls="http://localhost/search_doc/"
3
+ #urls="http://localhost/search_doc/soft/apache2.0/vhosts"
4
+ #urls="http://localhost/search_doc/soft/apache2.0/sitemap.html"
5
+ urls="http://localhost/search_doc/"
6
+ #can_leave_domain must be "yes" or "no"
7
+ can_leave_domain= "no"
8
+ max_depth=4
9
+ max_redirects=4
10
+ save_path="./testdata/local.hdb"
11
+ buckets=128
12
+ source="localdb2"
13
+ threads=10
14
+ same_domain_regexp="localhost"
15
+ logger="./testdata/local.log"
16
+ agent="Rspider/1.0 (build 20080824,+http://www.162cm.com/)"
17
+ url_max_length=130
18
+ max_document_length=204800
19
+ host="localhost"
20
+ user="root"
21
+ pass=""
22
+ db="sphider2"
23
+ timeout=3
@@ -0,0 +1,34 @@
1
=begin rdoc
Author:: aragorn(xurenlu@gmail.com)
URL:: http://www.162cm.com/
Version:: 1.0.0
License:: LGPL
=end
#
# Library entry point: requires every rspider component by absolute path,
# in the order listed below, so loading does not depend on $LOAD_PATH.
require "optparse"

rspider_lib_dir = File.expand_path(File.dirname(__FILE__)) + "/"

rspider_lib_files = %w[
  rspider/ConfParser
  rspider/DataWasher
  rspider/cookie
  rspider/browser
  rspider/HtmlTidy
  rspider/Logger
  rspider/OptParser
  rspider/UrlDispatcher
  rspider/Spider
  rspider/SiteLocker
  rspider/ThreadPool
  rspider/RobotRules
  rspider/UrlStorage
  rspider/UrlScorer
  rspider/ContentStorage
  rspider/MysqlUrlStorage
  rspider/mysql
  rspider/MysqlUrlRelationStorage
].map { |name| rspider_lib_dir + name }

rspider_lib_files.each { |path| require path }
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env ruby
2
+ # $Id: parseconfig.rb 37 2008-02-29 07:27:33Z wdierkes $
3
+ #
4
+ # Author:: BJ Dierkes <wdierkes@5dollarwhitebox.org>
5
+ # Copyright:: Copyright (c) 2006,2007 5dollarwhitebox.org
6
+ # License:: GPL
7
+ # URL:: http://www.5dollarwhitebox.org
8
+ #
9
+
10
+ # This class was written to simplify the parsing of configuration
11
+ # files in the format of "param = value". Please review the
12
+ # demo files included with this package.
13
+ #
14
+ # For further information please refer to the './doc' directory
15
+ # as well as the ChangeLog and README files included.
16
+ #
17
module Rspider
  # Error describing a problem with a configuration file. Carries a numeric
  # code (+errno+) and a message (+error+).
  #
  # Derives from StandardError (not Exception) so that an ordinary `rescue`
  # clause can handle it.
  class ConfParseError < StandardError
    attr_reader :errno, :error

    # errno:: numeric error code
    # error:: message text, also used as the exception message
    def initialize(errno, error)
      @errno = errno
      @error = error
      super(error)
    end

    # Returns the message text as a String.
    def to_s
      @error.to_s
    end
  end

  # Hash populated from a "param = value" configuration file.
  #
  # Each key maps to an Array holding every value given for that parameter,
  # in file order, so a parameter may be repeated. Lines whose first
  # non-blank character is '#' are comments; lines without '=' are ignored.
  # One pair of surrounding single or double quotes is removed from a value.
  class ConfParser < Hash
    Version = '0.4.2'

    # Path of the configuration file this object was built from.
    attr_accessor :config_file

    # Reads and parses +config_file+.
    #
    # Raises Errno::EACCES when the file is not readable.
    def initialize(config_file)
      super()
      @config_file = config_file
      raise Errno::EACCES, "#{config_file} is not readable" unless File.readable?(config_file)

      # File.foreach closes the file when done and, unlike Kernel#open,
      # never treats the path as a command to run.
      File.foreach(config_file) do |line|
        next if line =~ /\A\s*#/
        next unless line.include?("=")

        param, value = line.split(/\s*=\s*/, 2)
        var_name = param.strip
        value = value.strip
        value = Regexp.last_match(1) if value =~ /^['"](.*)['"]$/

        (self[var_name] ||= []).push(value)
      end
    end

    # Returns whatever is stored under +param+ (nil when absent).
    def get_value(param)
      self[param]
    end

    # Replaces the stored value of +param+ with +value+ (in memory only; the
    # configuration file is not touched).
    def override_value(param, value)
      self[param] = value
    end

    # Sets the stored value of +param+ to nil (in memory only).
    def nil_value(param)
      self[param] = nil
    end

    # Returns one "key:=> value" line per entry as a single String.
    # Earlier versions printed these lines to stdout and returned the Hash
    # itself, which broke string interpolation and `puts conf`.
    def to_s
      map { |key, value| "#{key}:=> #{value}\n" }.join
    end
  end

  # ConfParser specialised for spider configuration files: after parsing,
  # every required setting is collapsed from its Array of values to the last
  # value given, converted to the type the spider expects.
  class SpiderConfParser < ConfParser
    # Settings converted with String#to_i.
    INTEGER_KEYS = %w[
      max_depth max_redirects buckets threads
      url_max_length max_document_length timeout
    ].freeze

    # Settings kept as Strings (crawl settings followed by MySQL settings).
    STRING_KEYS = %w[
      save_path source same_domain_regexp agent urls logger
      host db user pass
    ].freeze

    # Parses +config_file+ and normalises the required settings.
    # "can_leave_domain" becomes true when its value is "yes" (any case) and
    # false otherwise.
    #
    # Raises RuntimeError naming the first required setting that is missing.
    def initialize(config_file)
      super(config_file)
      self["can_leave_domain"] = (required_value("can_leave_domain").upcase == "YES")
      INTEGER_KEYS.each { |key| self[key] = required_value(key).to_i }
      STRING_KEYS.each { |key| self[key] = required_value(key) }
      #urls="http://www.coolcode.cn/"
      #can_leave_domain= "yes"
      #max_depth=4
      #save_path="/tmp/coolcode/"
      #buckets=128
      #source="coolcode"
      #threads=10
      #same_domain_regexp="\.coolcode\.cn"
    end

    private

    # Returns the last value parsed for +key+, or raises when the
    # configuration file did not define it.
    def required_value(key)
      values = self[key]
      if values.nil? || values.empty?
        raise "Some thing error while conf pop: missing required key '#{key}'"
      end
      values.last
    end
  end
end
@@ -0,0 +1,130 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
module Rspider
  # When you need the tokyocabinet back end, uncomment the next line.
  # require 'tokyocabinet'
  require "digest/md5"

  # Keeps fetched documents in memory as a Hash of url => content.
  class ContentStorage < Hash
    def initialize
      super()
    end

    # Stores +content+ under +url+.
    def add(url, content)
      self[url] = content
    end

    # Returns every stored url.
    def urls
      keys
    end

    # Nothing to release for the in-memory store; present so all storages
    # expose the same methods.
    def close
    end

    # Returns the content stored for +url+, or nil when there is none.
    def get(url)
      self[url]
    end
  end

  # Keeps fetched documents in a TokyoCabinet hash database file.
  class HDBContentStorage
    # path:: file that holds the HDB database; created when missing.
    #
    # A failure to open is only reported on STDERR; no exception is raised.
    def initialize(path)
      @hdb = TokyoCabinet::HDB.new
      unless @hdb.open(path, TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT)
        ecode = @hdb.ecode
        STDERR.printf("open error: %s\n", @hdb.errmsg(ecode))
      end
    end

    # Stores +content+ under +url+.
    def add(url, content)
      @hdb.put(url, content)
    end

    # Closes the database.
    def close
      @hdb.close
    end

    # Returns every stored url by walking the database iterator.
    def urls
      @hdb.iterinit
      keys = []
      while (key = @hdb.iternext)
        keys << key
      end
      keys
    end

    # Returns the content stored for +url+.
    def get(url)
      @hdb.get(url)
    end
  end

  # Raised when no MySQL connection object could be obtained.
  #
  # Derives from StandardError (not Exception) so that an ordinary `rescue`
  # clause can handle it.
  class MysqlException < StandardError
    def to_s
      "Can't connect to mysql "
    end
  end

  # Keeps fetched documents in the `htmls` table of a MySQL database. Rows
  # are keyed by `ukey`: the MD5 hex digest of the url followed by the
  # source (task) name.
  class MysqlContentStorage
    # Returns the MD5 hex digest of +string+.
    def md5(string)
      Digest::MD5.hexdigest(string)
    end

    # hash::   connection settings, read from the keys "host", "user",
    #          "pass" and "db"
    # source:: task name stored with, and used to select, the rows
    #
    # Raises MysqlException when no connection object is returned.
    def initialize(hash, source="default")
      @my = Mysql.new(hash["host"], hash["user"], hash["pass"], hash["db"])
      raise MysqlException if @my.nil?
      @source = source
    end

    # Inserts +content+ for +url+.
    # Returns true on success and nil when the query raises Mysql::Error.
    def add(url, content)
      sql = "INSERT INTO `htmls` (`source`,`url`,`url_crc32`,`html`,`html_crc32`,`created`,`ukey`) " \
            "VALUES ('#{@my.quote(@source)}','#{@my.quote(url)}','0','#{@my.quote(content)}','0'," \
            "'#{Time.now.to_i}','#{@my.quote(ukey_for(url))}')"
      begin
        @my.query(sql)
      rescue Mysql::Error
        return nil
      end
      true
    end

    # Returns the stored html of +url+, or nil when the url is unknown or
    # the query raises Mysql::Error.
    def get(url)
      sql = "select html from htmls where ukey='#{@my.quote(ukey_for(url))}'"
      begin
        rs = @my.query(sql)
        rs.each { |row| return row[0] }
      rescue Mysql::Error
        return nil
      end
      # No matching row. Without this explicit nil the method used to fall
      # through to the value of `rs.each`, i.e. the result object itself.
      nil
    end

    # Returns an Array with every url stored for this source, or an empty
    # Array when the query raises Mysql::Error.
    def urls()
      sql = "select url from htmls where source='#{@my.quote(@source)}'"
      begin
        found = []
        @my.query(sql).each { |row| found.push(row[0]) }
        found
      rescue Mysql::Error
        []
      end
    end

    # Closes the database connection.
    def close()
      @my.close
    end

    private

    # Row key for +url+: its MD5 hex digest followed by the source name.
    def ukey_for(url)
      md5(url) + @source
    end
  end
end
+ end