rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+
2
+ --0.8.3 Thu Sep 11 21:22:02 CST 2008
3
+ 1.add timeout field when fetching a page
4
+ 2.package published as gem
5
+ --0.8.2 Tue Sep 9 00:18:10 CST 2008
6
+ 1.move url Grabber functions to Spider class, so a Spider object can extract links itself;
7
+ 2.Add Url-relation-Storage support
8
+ --0.8.1
9
+ 1. Add range field for Spider
10
+ So we avoid downloading very large files
11
+ 2. Add Cookie support for spider
12
+ --0.8.0 Sat Sep 6 00:35:01 CST 2008
13
+ 1.optimized MysqlUrlStorage: added a memory cache, so it does not trigger so many mysql
14
+ duplicate key errors
15
+ 2.Optimized urlGrabber; it will generate fewer wrong urls.
16
+ --0.7.9 Wed Sep 3 02:06:13 CST 2008
17
+ 1.add Local MysqlUrlStorage
18
+ 2.Add local Mysql Content Storage
19
+ 3.add Url score support; low score urls will have a lower chance of being crawled.
20
+ --0.7.8
21
+ 1.Add logger to log user interrupts, download failures...
22
+ 2.Add SiteLocker to ensure the same site is not crawled with high frequency.
23
+ --0.7.7
24
+ CharsetGuess added; you can now handle GBK files and store them as UTF-8
25
+ --0.7.6
26
+ ContentStorage added,Downloader switch to Net/HTTP
27
+ --0.7.5
28
+ added callback support, not completed
29
+ --0.7.4
30
+ add UrlStorage callback object
31
+ --0.7.3
32
+ robotRules added
@@ -0,0 +1,66 @@
1
# Rakefile: collects the project metadata and file lists, builds the gem
# specification and registers the gem/zip/tar packaging tasks.
require 'rake/gempackagetask'

# The name of your project
PROJECT = "rspider"
# Your name, used in packaging.
MY_NAME = "Renlu Xu"
# Your email address, used in packaging.
MY_EMAIL = "xurenlu@gmail.com"
# Short summary of your project, used in packaging.
PROJECT_SUMMARY = "Web cralwer"
# The project's package name (as opposed to its display name). Used for
# RubyForge connectivity and packaging.
UNIX_NAME = "rspider"
# Your RubyForge user name (this assignment used to be swallowed by the
# trailing comment of the UNIX_NAME line, so the constant was never defined).
RUBYFORGE_USER = ENV["RUBYFORGE_USER"] || "iam162"
# Extra files handed to RDoc (currently none) and the executables to ship.
RDOC_FILES=FileList[]
BIN_FILES=FileList["bin/*.rb"]
GENERAL_RDOC_OPTS=""
# Variable settings for extension support.
EXT_DIR = "ext"
HAVE_EXT = File.directory?(EXT_DIR)
EXTCONF_FILES = FileList["#{EXT_DIR}/**/extconf.rb"]
# Eventually add other files from EXT_DIR, like "MANIFEST"
TEST_FILES = FileList["test/**/tc_*.rb"]

# Files that go into the package.
DIST_FILES = FileList["lib/*/*.rb", "lib/rspider.rb","sql/*.sql","Changelog","ToDo","conf/local.conf"]
DIST_FILES.include("Rakefile")
# Don't package temporary files, perhaps created by tests.
DIST_FILES.exclude("**/temp_*", "**/*.tmp")
# Don't get into recursion…
DIST_FILES.exclude(/^(\.\/)?pkg(\/|$)/)

REQUIRE_PATHS = ["lib"]
REQUIRE_PATHS << EXT_DIR if HAVE_EXT
$LOAD_PATH.concat(REQUIRE_PATHS)
# Load the library itself; it is found through the $LOAD_PATH entries above.
require "#{UNIX_NAME}"
#PROJECT_VERSION = "#{PROJECT}::#{VERSION}" # e.g., "1.0.2"
PROJECT_VERSION="0.8.4"

GEM_SPEC = Gem::Specification.new do |s|
  s.name = UNIX_NAME
  s.version = PROJECT_VERSION
  s.summary = PROJECT_SUMMARY
  s.rubyforge_project = UNIX_NAME
  #s.homepage = "http://#{UNIX_NAME}.rubyforge.org/"
  s.homepage = "http://www.162cm.com/"
  s.author = MY_NAME
  s.email = MY_EMAIL
  s.files = DIST_FILES
  s.test_files = TEST_FILES
  s.executables = BIN_FILES.map { |fn| File.basename(fn) }
  s.has_rdoc = true
  s.extra_rdoc_files = RDOC_FILES
  s.rdoc_options = GENERAL_RDOC_OPTS.to_a.flatten
  if HAVE_EXT
    s.extensions = EXTCONF_FILES
    # Was `s.require_paths >> EXT_DIR`: Array has no `>>`, so this raised as
    # soon as an ext/ directory existed. REQUIRE_PATHS already holds
    # "lib" plus EXT_DIR at this point.
    s.require_paths = REQUIRE_PATHS
  end
end
# Now we can generate the package-related tasks.
Rake::GemPackageTask.new(GEM_SPEC) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end
data/ToDo ADDED
@@ -0,0 +1,19 @@
1
+ Mon Sep 1 10:52:10 CST 2008:
2
+ 0.7.9 Rdoc++
3
+ 0.7.10 全面更换为支持Mysql的存储(替换掉HDB,提高通用性)
4
+ fixed --- 0.8.0 加上带Cookie抓取功能
5
+ We can crawl sites with cookies
6
+
7
+ 0.8.1 Add Gem,Makefile and so on
8
+ 0.8.2 Add Url relations storage
9
+ fixed ---设定最长Url限制
10
+ 设定最大文档限制 避免下载rm,avi等大文件
11
+ #记录300 Redirect 的URL
12
+ HTMLTidy部分严重地泄露内存。考虑替换方案。
13
+ #将UrlGrabber中的功能植入到spider类中来
14
+
15
+ 将仅记录url,content改为记录:
16
+ Keywords,Description,Charset...Summary
17
+ 除HTML之外,增加对Doc,XML,PDF,text的解析
18
+ 核查Cookie功能,并加入Load-cookie功能,将用文件记录Cookies
19
+ 发现Blog feed的功能
@@ -0,0 +1,37 @@
1
#! /usr/bin/ruby
# Crawl driver: parses the command line and the configuration file, wires a
# Spider to its logger, browser and MySQL-backed storages, then starts the
# crawl from the configured url inside a two-slot thread pool.
require "lib/rspider"
require "optparse"

#require "profile"
#debug $mem_profiler = MemoryProfiler.new
$OPT = Rspider::OptParser.new(ARGV)
# Debug mode is enabled only by the literal option value "on".
$DEBUG = ($OPT[:debug] == "on")
settings = Rspider::SpiderConfParser.new($OPT[:conf])

puts "Configuration file parsed!"
crawler = Rspider::Spider.new(settings)
# Alternative url storages, kept for reference:
#spider.urlStorage= Rspider::UrlDispatcherClient.new("127.0.0.1",10001,conf["source"])
#spider.urlStorage=Rspider::UrlStorage.new
crawler.logger = Rspider::Logger.new(settings["logger"])
crawler.browser = HTTPal::Browser.new(settings["agent"], settings["max_document_length"])
# Alternative content storages, kept for reference:
#spider.contentStorage=Rspider::ContentStorage.new
#spider.contentStorage=Rspider::HDBContentStorage.new(conf["save_path"])
task_name = settings["source"]
crawler.contentStorage = Rspider::MysqlContentStorage.new(settings, task_name)
crawler.urlStorage = Rspider::MysqlUrlStorage.new(settings, task_name)
crawler.relationStorage = Rspider::MysqlUrlRelationStorage.new(settings, task_name)
# Report every url whose download failed.
crawler.on(:failure) { |url, resp| puts "ERROR:#{url}" }
#$tracker = CallTracker.new
#$tracker.register(String, :new)

workers = Rspider::ThreadPool.new(2)
workers.dispatch do
  crawler.start_from settings["urls"]
end
puts "threads inited!"
workers.shutdown
@@ -0,0 +1,41 @@
1
#! /usr/bin/ruby
# Memory-profiling driver: repeatedly builds a fully wired Spider, lets it
# run, and prints a memory report before each round.
require "lib/rspider"
require "optparse"
require "lib/Gc"

require 'rubygems'
require 'bleak_house'

#require "profile"
# NOTE(review): MemoryProfiler is presumably provided by lib/Gc - confirm.
$mem= MemoryProfiler.new
$OPT=Rspider::OptParser.new(ARGV)
# Debug mode is enabled only by the literal option value "on".
$DEBUG = ($OPT[:debug]=="on")
puts "Configuration file parsed!"
# Set by the SIGINT handler; checked at the top of every round.
interrupted = false
trap("SIGINT") { interrupted = true }

# Build a spider from the configuration file named on the command line and
# run it once.
#
# x:: round number; the crawl is seeded with the start url only when x == 2
#     (the first round, see the loop below).
#
# The content and url storages are closed in an +ensure+ clause so their
# connections are released even when the crawl raises.
def run(x=0)
  conf=Rspider::SpiderConfParser.new($OPT[:conf])
  spider=Rspider::Spider.new(conf)
  spider.logger=Rspider::Logger.new(conf["logger"])
  spider.browser=HTTPal::Browser.new(conf["agent"],conf["max_document_length"])
  spider.contentStorage=Rspider::MysqlContentStorage.new(conf,conf["source"])
  spider.urlStorage=Rspider::MysqlUrlStorage.new(conf,conf["source"])
  spider.relationStorage=Rspider::MysqlUrlRelationStorage.new(conf,conf["source"])
  begin
    spider.start_from("http://localhost/search_doc/") if x == 2
    spider.run(4)
  ensure
    spider.contentStorage.close
    spider.urlStorage.close
  end
end

# j is incremented before each round, so the rounds are numbered 2..5.
j=1
while(true) do
  j=j+1
  puts "-"*30
  exit if interrupted
  $mem.report
  run(j)
  exit if j>4
end
@@ -0,0 +1,23 @@
1
+ #Conf for spider site :chin.bokee.com
2
+ #urls="http://localhost/search_doc/"
3
+ #urls="http://localhost/search_doc/soft/apache2.0/vhosts"
4
+ #urls="http://localhost/search_doc/soft/apache2.0/sitemap.html"
5
+ urls="http://localhost/search_doc/"
6
+ #can_leave_domain must be "yes|no"
7
+ can_leave_domain= "no"
8
+ max_depth=4
9
+ max_redirects=4
10
+ save_path="./testdata/local.hdb"
11
+ buckets=128
12
+ source="localdb2"
13
+ threads=10
14
+ same_domain_regexp="localhost"
15
+ logger="./testdata/local.log"
16
+ agent="Rspider/1.0 (build 20080824,+http://www.162cm.com/)"
17
+ url_max_length=130
18
+ max_document_length=204800
19
+ host="localhost"
20
+ user="root"
21
+ pass=""
22
+ db="sphider2"
23
+ timeout=3
@@ -0,0 +1,34 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+ #
8
+ require "optparse"
9
# Library components, listed in the order in which they are loaded.
rspider_lib_files = %w[
  rspider/ConfParser
  rspider/DataWasher
  rspider/cookie
  rspider/browser
  rspider/HtmlTidy
  rspider/Logger
  rspider/OptParser
  rspider/UrlDispatcher
  rspider/Spider
  rspider/SiteLocker
  rspider/ThreadPool
  rspider/RobotRules
  rspider/UrlStorage
  rspider/UrlScorer
  rspider/ContentStorage
  rspider/MysqlUrlStorage
  rspider/mysql
  rspider/MysqlUrlRelationStorage
]
# Absolute directory of this file, with a trailing slash.
rspider_lib_dir = File.expand_path(File.dirname(__FILE__)) + "/"
# Turn every entry into an absolute path (in place), then load them in order.
rspider_lib_files.map! { |name| rspider_lib_dir + name }
rspider_lib_files.each { |path| require path }
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env ruby
2
+ # $Id: parseconfig.rb 37 2008-02-29 07:27:33Z wdierkes $
3
+ #
4
+ # Author:: BJ Dierkes <wdierkes@5dollarwhitebox.org>
5
+ # Copyright:: Copyright (c) 2006,2007 5dollarwhitebox.org
6
+ # License:: GPL
7
+ # URL:: http://www.5dollarwhitebox.org
8
+ #
9
+
10
+ # This class was written to simplify the parsing of configuration
11
+ # files in the format of "param = value". Please review the
12
+ # demo files included with this package.
13
+ #
14
+ # For further information please refer to the './doc' directory
15
+ # as well as the ChangeLog and README files included.
16
+ #
17
+ module Rspider
18
class ConfParseError < StandardError
  # Error carrying a numeric code (+errno+) and a message (+error+).
  #
  # Derives from StandardError (it used to derive from Exception) so that a
  # plain `rescue` / `rescue => e` catches it; `rescue ConfParseError` and
  # `rescue Exception` keep working as before.
  attr_reader :errno, :error

  # errno:: caller-defined error code
  # error:: human-readable message, also used as the exception message
  def initialize(errno, error)
    @errno = errno
    @error = error
    super(error)
  end

  # The message given at construction, always as a String.
  def to_s
    @error.to_s
  end
end
29
class ConfParser < Hash

  Version = '0.4.2'

  # Read +config_file+ and store every "param = value" line in this hash.
  #
  # Each key maps to an Array holding all values seen for that param, in file
  # order, so a param that appears several times keeps every value. Lines
  # starting with "#" and lines without "=" are skipped. Surrounding
  # whitespace and one pair of surrounding quotes are removed from values.
  #
  # Raises Errno::EACCES when the file is not readable.
  def initialize(config_file)
    super()
    @config_file = config_file
    raise Errno::EACCES, "#{self.config_file} is not readable" unless File.readable?(self.config_file)
    # The block form closes the file handle once parsing is done (the former
    # `open(...).each` left it open until garbage collection).
    File.open(self.config_file) { |file|
      file.each_line { |line| parse_line(line) }
    }
  end

  # Return whatever is stored under +param+ (an Array of values right after
  # parsing, or whatever override_value stored later); nil when unknown.
  def get_value(param)
    self[param]
  end

  # Replace the stored value of +param+ with +value+ (in memory only, the
  # config file is not touched).
  def override_value(param, value)
    self[param]=value
  end

  # Set the stored value of +param+ to nil (in memory only, the config file
  # is not touched). The key itself stays present.
  def nil_value(param)
    self[param]=nil
  end

  # Change the remembered config file path. Nothing is re-read.
  def config_file=(config_file)
    @config_file = config_file
  end

  # Path of the config file this object was built from.
  def config_file()
    @config_file
  end

  # Render all settings as "key:=> value" lines and return that String.
  #
  # The text is also written to standard output, because the previous
  # implementation printed the listing (and returned the Hash itself, which
  # is not a valid to_s result); callers relying on the printout keep it.
  def to_s()
    listing = self.map { |k, v| "#{k}:=> #{v}\n" }.join
    $stdout.write(listing)
    listing
  end

  private

  # Parse one raw line of the config file and record its value, if any.
  def parse_line(line)
    line = line.chomp
    return if line =~ /^\#/
    return unless line =~ /\s*=\s*/
    param, value = line.split(/\s*=\s*/, 2)
    var_name = param.strip
    value = value.strip
    # Drop one pair of enclosing single or double quotes.
    value = $1 if value =~ /^['"](.*)['"]$/
    self[var_name] = [] unless self.has_key?(var_name)
    self[var_name].push(value)
  end
end
107
+
108
class SpiderConfParser < ConfParser
  # Settings converted to Integer with to_i.
  INTEGER_SETTINGS = [
    "max_depth", "max_redirects", "buckets", "threads",
    "url_max_length", "max_document_length", "timeout"
  ]
  # Settings kept as Strings. host/db/user/pass are the Mysql settings.
  STRING_SETTINGS = [
    "save_path", "source", "same_domain_regexp", "agent", "urls", "logger",
    "host", "db", "user", "pass"
  ]

  # Parse +config_file+ like ConfParser, then replace the value list of every
  # spider setting by a single scalar: the last value given in the file.
  # "can_leave_domain" becomes true when that value is "yes" in any letter
  # case and false otherwise.
  #
  # Raises RuntimeError naming the setting when a required one is missing
  # from the file. (The former code raised the same error class without
  # saying which key was absent, followed by an unreachable `exit`.)
  #
  # Example file:
  #   urls="http://www.coolcode.cn/"
  #   can_leave_domain= "yes"
  #   max_depth=4
  #   save_path="/tmp/coolcode/"
  #   buckets=128
  #   source="coolcode"
  #   threads=10
  #   same_domain_regexp="\.coolcode\.cn"
  def initialize(config_file)
    super(config_file)
    self["can_leave_domain"] = (last_value_of("can_leave_domain").upcase == "YES")
    INTEGER_SETTINGS.each { |key| self[key] = last_value_of(key).to_i }
    STRING_SETTINGS.each { |key| self[key] = last_value_of(key) }
  end

  private

  # Last value recorded for +key+; raises when the file never set it.
  def last_value_of(key)
    values = self[key]
    if values.nil?
      raise "Some thing error while conf pop: missing setting '#{key}' in #{self.config_file}"
    end
    values.last
  end
end
149
+ end
@@ -0,0 +1,130 @@
1
+ =begin rdoc
2
+ Author:: aragorn(xurenlu@gmail.com)
3
+ URL:: http://www.162cm.com/
4
+ Version:: 1.0.0
5
+ License:: LGPL
6
+ =end
7
+ module Rspider
8
+ #when you need the tokyocabinet ,remove the comment
9
+ # require 'tokyocabinet'
10
+ require "digest/md5"
11
+ #this class store the content of documents in Hash (memory)
12
+ #So program uses lots of memory and can play fast
13
+ #
14
class ContentStorage < Hash
  # Takes no arguments; the storage starts out empty.
  def initialize; end

  # Remember +content+ as the document body of +url+. Returns +content+.
  def add(url, content)
    store(url, content)
  end

  # All urls stored so far.
  def urls
    keys
  end

  # Nothing to release for the in-memory storage; present so that all
  # storage classes offer the same methods.
  def close; end

  # The content stored for +url+, or nil when the url is unknown.
  def get(url)
    self[url]
  end
end
33
+ #this class store the content in tokyocabinet database
34
+ #so she can get perfect performance and uses little memory
35
class HDBContentStorage
  # Open (creating it when needed) the hash database file at +path+.
  # A failed open is only reported on STDERR; no exception is raised.
  def initialize(path)
    @hdb = TokyoCabinet::HDB::new
    mode = TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT
    unless @hdb.open(path, mode)
      STDERR.printf("open error: %s\n", @hdb.errmsg(@hdb.ecode))
    end
  end

  # Store +content+ under +url+.
  def add(url, content)
    @hdb.put(url, content)
  end

  # Close the database.
  def close
    @hdb.close
  end

  # Walk the database iterator from the start and collect every key.
  def urls
    @hdb.iterinit
    found = []
    loop do
      key = @hdb.iternext
      break unless key
      found << key
    end
    found
  end

  # Fetch the content stored for +url+.
  def get(url)
    @hdb.get(url)
  end
end
66
class MysqlException < StandardError
  # Derives from StandardError (formerly Exception) so that a plain `rescue`
  # catches it; `rescue MysqlException` and `rescue Exception` still work.

  # Fixed message, independent of any message passed to raise.
  def to_s
    "Can't connect to mysql "
  end
end
71
+ #class MysqlContentStorage store the content of urls in an msyql_db
72
class MysqlContentStorage
  # Hex MD5 digest of +string+.
  def md5(string)
    Digest::MD5.hexdigest(string)
  end

  # Connect to the database.
  #
  # hash::   connection settings, read through the keys "host", "user",
  #          "pass" and "db"
  # source:: task name; it is stored with every row and is part of the
  #          row key
  #
  # Raises MysqlException when no connection object is returned.
  def initialize(hash,source="default")
    @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
    raise MysqlException if @my.nil?
    @source=source
  end

  # Store +content+ as the document of +url+ in table `htmls`.
  # Every interpolated value goes through the connection's quote method.
  # Returns true on success and nil when the query raises Mysql::Error.
  def add(url,content)
    sql="INSERT INTO `htmls` (`source`,`url`,`url_crc32`,`html`,`html_crc32`,`created`,`ukey`)\n" +
      "VALUES ('"+@my.quote(@source)+"','"+@my.quote(url)+"','0','"+@my.quote(content)+"','0','"+Time.now.to_i.to_s+"','"+@my.quote(md5(url)+@source)+"')"
    begin
      @my.query(sql)
    rescue Mysql::Error
      return nil
    end
    true
  end

  # Content stored for +url+ under this source.
  # Returns nil when no row matches or when the query raises Mysql::Error.
  # (Previously a miss fell through the row loop and returned the result-set
  # object itself.)
  def get(url)
    sql="select html from htmls where ukey='"+@my.quote(md5(url)+@source)+"'"
    begin
      rs=@my.query(sql)
      rs.each do |r|
        return r[0]
      end
      nil
    rescue Mysql::Error
      nil
    end
  end

  # All urls stored for this source, as an Array; an empty Array when the
  # query raises Mysql::Error.
  def urls()
    sql="select url from htmls where source='"+@my.quote(@source)+"'"
    begin
      keys=[]
      @my.query(sql).each do |r|
        keys.push(r[0])
      end
      keys
    rescue Mysql::Error
      []
    end
  end

  # Close the database connection.
  def close()
    @my.close
  end
end
130
+ end