rspider 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
@@ -0,0 +1,129 @@
|
|
1
|
+
module Rspider
  # DataWasher extracts an article's title and body from crawled HTML pages
  # by diff-ing each page against a sibling page from the same site: markup
  # shared by both pages (navigation, ads, boilerplate) cancels out of the
  # diff, leaving only the page-specific content.
  class DataWasher
    # Run the external `diff` tool on the two files and group the
    # ">"-prefixed lines (content present only in dataFile) into rows;
    # every non-diff-content line flushes the accumulated text as one row.
    # Returns an Array of String rows.
    # NOTE(review): file names are interpolated straight into a shell
    # command — callers must not pass untrusted paths.
    def getDiffRows(exampleFile,dataFile)
      diff_res=%x{diff #{exampleFile} #{dataFile}}
      rows=[]
      cache=""
      diff_res.split("\n").each{|l|
        if (l[0,2] == "--")
          # "---" separator emitted by diff: ignore
        elsif(l[0,1] == "<")
          # line only in exampleFile (boilerplate): ignore
        elsif(l[0,1] == ">")
          # line only in dataFile: keep, stripping the ">" marker
          cache= cache + l[1,l.length]+"\n"
        else
          # hunk header: flush the accumulated row
          rows.push cache
          cache=""
        end
      }
      rows.push cache
      rows
    end

    # Walk every file in srcDir, diff it against a reference file (the last
    # file for the first entry, the first file for all others) and write the
    # extracted text to destDir under the same name.
    # In "PRO" mode files already present in destDir are skipped; otherwise
    # the diff rows of the first few files are dumped to stdout for debugging.
    def parseDir(srcDir,destDir)
      puts "now:parse Directory:#{srcDir}"
      files=[]
      Dir.foreach(srcDir){|f|
        files << f unless f == "." or f == ".."
      }
      first=files[0]
      last=files[files.length-1]
      k=0
      if $ENV == "PRO"
        files.each{|f|
          # BUG FIX: the original computed `w` only when the destination was
          # missing but wrote `w` whenever it was non-nil, so a stale value
          # from a previous iteration could overwrite an existing file.
          unless File.file?(destDir+f)
            example = (k==0) ? last : first
            w=parseText(srcDir+example,srcDir+f)
            # BUG FIX: File.open with a block closes the handle
            # (the original `open(...).puts` leaked one handle per file).
            File.open(destDir+f,"w+"){|io| io.puts w} unless w.nil?
          end
          k=k+1
        }
      else
        # debug mode: print the diff rows of the first four files
        i=0
        files.each{|f|
          break if i>3
          example = (k==0) ? last : first
          diffRows= getDiffRows(srcDir+example,srcDir+f)
          puts "\n\n\n ==============Diff Rows[#{i}]================\n"
          x=0
          diffRows.each{ |l|
            puts "\n+ rows[#{x}]:\n"
            puts l
            x=x+1
          }
          k=k+1
          i=i+1
        }
      end
    end

    # Extract title and body from a hexun.com (和讯理财) article page.
    # Diff row 1 holds the page title; body rows start after index 3 and the
    # "enter ... forum" link marks the end of the article.
    # Returns "title::==++\nbody".
    def parseTextHexun(exampleFile,dataFile)
      rows=getDiffRows(exampleFile,dataFile)
      i=0
      contents=[]
      rows.each{ |l|
        if l =~ %r{\s*进入.*吧}    # forum link marks the end of the body
          puts "got the end of content;#{l}"
          break
        end
        next if l =~ %r{^\s*[\d]{1}\*}
        next if l =~ %r{^\s*上一页\s*}   # "previous page" pager link
        next if l =~ %r{^\s*下一页\s*}   # "next page" pager link
        contents.push l if i>3
        i = i+1
      }
      returns=""
      returns << rows[1].sub("-理财频道-和讯网","")  # drop the site suffix from the title
      returns << "::==++\n"
      returns << contents.join("\n")
      returns
    end

    # Generic title/body extraction: diff row 1 is the title and the first
    # row longer than 150 characters is taken as the body.
    # Returns nil when no body row is found.
    # NOTE(review): a body located at row 0 is indistinguishable from
    # "not found" — preserved from the original behaviour.
    def parseText(exampleFile,dataFile)
      return parseTextHexun(exampleFile,dataFile) if $_SOURCE == "hexun"
      puts "not hexun"
      rows=getDiffRows(exampleFile,dataFile)
      i=0
      cur=0
      rows.each{ |l|
        if(l.length>150 )
          cur=i
          break
        end
        i=i+1
      }
      return nil if cur==0
      returns=""
      returns << rows[1]
      returns << "::==++\n"
      returns << rows[cur]
      returns
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Rspider
  # Generic parsed-document container.
  class Document < Hash
  end

  # HTML-specific document: pulls headlines and <head> metadata out of
  # raw markup with best-effort regex scans.
  class HTMLDocument < Hash
    # Collect the inner text of every <h1>..<h9> element.
    # Returns the headline texts joined by newlines ("" when none match).
    def get_headlines(html)
      headline_re=Regexp.compile('<h[0-9][^>]*>(.*?)<\/h[0-9]>',Regexp::IGNORECASE|Regexp::MULTILINE)
      matches=html.scan(headline_re)
      return nil if matches.nil?
      found=[]
      matches.each{|m| found.push m[0] }
      found.join("\n")
    end

    # Parse the <head> section of +html+ and return a Hash with
    # :title, :keywords, :robots, :description, :nofollow, :noindex and
    # :base (plus :charset, :archives and :rss_links when present).
    # Returns nil when the document has no <head> section.
    def get_head_data(html)
      head_re=Regexp.compile('<head[^>]*>(.*?)<\/head>',Regexp::IGNORECASE|Regexp::MULTILINE)
      m=html.scan(head_re)
      return nil if m.nil?
      return nil if m[0].nil?
      head={}
      head[:title]=""
      head[:keywords]=""
      head[:robots]=""
      head[:description]=""
      head[:nofollow]=false
      head[:noindex]=false
      head[:base]=""

      inner=m[0][0]
      # Every extraction below is best-effort: a failed scan raises
      # NoMethodError on nil and the bare rescue leaves the default value.
      begin
        r_robots=/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
        robots=inner.scan(r_robots)[0][0]
        head[:robots]=robots
      rescue
      end
      begin
        r_desc=/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
        head[:description]=inner.scan(r_desc)[0][0]
      rescue
      end
      begin
        r_keys=/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
        head[:keywords]=inner.scan(r_keys)[0][0]
      rescue
      end
      begin
        # charset is the value after "=" inside the Content-Type meta tag
        r_charset=/<meta +http\-equiv*=[\"']?Content-Type[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
        head[:charset]=inner.scan(r_charset)[0][0].split("=").pop
      rescue
      end
      begin
        r_base=/<base +href *= *[\"']?([^<>'\"]+)[\"']?/im
        head[:base]=inner.scan(r_base)[0][0]
      rescue
      end
      begin
        r_title=/<title *>(.*?)<\/title*>/im
        head[:title]=inner.scan(r_title)[0][0].gsub("\n","")
      rescue
      end
      begin
        archives=[]
        r_archives=/<link +rel*=[\"']?archives[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
        inner.scan(r_archives).each{ |l| archives.push l[0] }
        head[:archives]=archives
      rescue
      end
      begin
        links=[]
        r_alternates=/<link +rel*=[\"']?alternate[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
        inner.scan(r_alternates).each{ |l| links.push l[0] }
        head[:rss_links]=links
      rescue
      end
      # turn the robots meta value into the boolean flags
      unless robots.nil?
        robots.downcase.split(",").each{ |token|
          head[:noindex]=true if token=="noindex"
          head[:nofollow]=true if token=="nofollow"
        }
      end
      head
    end
  end

  # Placeholder document types and parsers for other content formats.
  class RSSItemDocument < Hash
  end
  class RSSParser
    def initialize(rss)
    end
  end
  class PDFDocument < Hash
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
=begin rdoc
Author:: aragorn(xurenlu@gmail.com)
URL:: http://www.162cm.com/
Version:: 1.0.0
License:: LGPL
=end
module Rspider
  # Base class for extractors that turn PDF, HTML, Mail, Doc and XML
  # files into Document objects.
  class DocumentExtractor
    def initialize
    end
  end

  # Format-specific extractors (skeletons, no behaviour yet).
  class PDFDocumentExtractor < DocumentExtractor
  end

  class DocDocumentExtractor < DocumentExtractor
  end

  class TEXTDocumentExtractor < DocumentExtractor
  end

  class XMLDocumentExtractor < DocumentExtractor
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
require "iconv"
|
8
|
+
module Rspider
|
9
|
+
#this class guesses the encoding of a html document,convert it from various encoding to UTF-8
|
10
|
+
class HtmlTidy
|
11
|
+
|
12
|
+
def initialize()
|
13
|
+
#@defaultCharset="UTF-8"
|
14
|
+
end
|
15
|
+
# guess the encoding of html document
|
16
|
+
def guess_encoding(resp)
|
17
|
+
resp[0..400].scan(/content="(.*)"/i).flatten.join("\t").scan(/charset=([a-z0-9\-]+)/i).flatten.join("\t")
|
18
|
+
end
|
19
|
+
#convert document from one encoding to another
|
20
|
+
def iconv(from,to,text)
|
21
|
+
Iconv.new(to.upcase+"//IGNORE",from.upcase+"//IGNORE").iconv(text)
|
22
|
+
end
|
23
|
+
#guess the encoding of the document and convert it to UTF-8
|
24
|
+
def tidy(html)
|
25
|
+
encoding = guess_encoding(html).upcase
|
26
|
+
return html if encoding == "UTF-8"
|
27
|
+
encoding="GBK" if encoding =~ /gb2312/
|
28
|
+
encoding="GBK" if encoding =~ /gbk/
|
29
|
+
iconv(encoding,"UTF-8",html)
|
30
|
+
end
|
31
|
+
def strip_tags()
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#! /usr/bin/ruby
=begin rdoc
Simple Logger
Written by aragorn(xurenlu@gmail.com)
Author:: aragorn(xurenlu@gmail.com)
Homepage:: http://www.162cm.com/
MSN:: helloasp@hotmail.com
=end

module Rspider
=begin rdoc
This class is written for Logging messages when program running.

Examples:

* $LOGGER=Logger.new("/var/run/log/cns.log")
* $LOGGER.log_msg("Program inited!")
* ....
* $LOGGER.log_msg("Sockets inited")
* $LOGGER.flush_msg();
=end
  class Logger
    attr_accessor :max,:msgs

    # path: the file the log lines are appended to
    def initialize(path)
      @max=10
      @log_path=path
      @msgs=[]
      puts "logs created :#{path}" if $DEBUG
    end

    # Queue one log line as "LEVEL:epoch-seconds:message".
    # The buffer is written to disk once it grows past #max entries,
    # or immediately when the level is "ERROR".
    def log_msg(log,level="NOTE")
      @msgs << (level+":"+Time.now.to_i.to_s+":"+log)
      flush_msg if @msgs.size>@max || level=="ERROR"
    end

    # Append the buffered lines to the log file and clear the buffer.
    def flush_msg()
      File.open(@log_path,"a+") do |handle|
        handle.puts @msgs.join("\n")
      end
      @msgs=[]
      puts "logs flushed ." if $DEBUG
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/ruby -w
require "digest/md5"
require "zlib"
module Rspider
  # Persists url -> referer relations into the MySQL `url_relations` table.
  class MysqlUrlRelationStorage
    # hash: connection settings ("host", "user", "pass", "db");
    # source: name of the crawling task.
    def initialize(hash,source="default")
      @seed=1024
      @source=source
      @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
      raise MysqlException if @my.nil?
    end

    # MD5 hex digest of +string+.
    def md5(string)
      digest=Digest::MD5.new
      digest << string
      digest.to_s
    end

    # Record that +url+ was discovered on page +referer+.
    # Database errors (e.g. duplicate rows) are deliberately swallowed.
    def save(referer,url)
      url_crc=Zlib::crc32(url,@seed)
      referer_crc=Zlib::crc32(referer,@seed)
      sql="INSERT INTO url_relations (url,referer,url_crc32,referer_crc32) values('#{@my.quote(url)}','#{@my.quote(referer)}','#{url_crc}','#{referer_crc}')"
      begin
        @my.query(sql)
      rescue Mysql::Error
      rescue
      end
    end
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/ruby -w
require "digest/md5"
require "zlib"
module Rspider
  # UrlStorage subclass used as an in-memory cache in front of the MySQL
  # storage; adds #urlStored? and bounds the size of the cached sets.
  class UrlStorageCache < UrlStorage
    MAX=512    # maximum cached entries before trimming kicks in
    STEP=128   # number of oldest entries dropped per trim
    @seed=1024 # NOTE(review): class-level ivar, never read — looks like a leftover
    # True when the url is already queued or already visited.
    # Trims the oldest entries whenever either cached set grows past MAX.
    def urlStored?(url)
      if @urls.length>MAX
        STEP.times { @urls.shift }
      end
      if @visitedUrls.length>MAX
        STEP.times { @visitedUrls.shift }
      end
      @urls.include?(url) || @visitedUrls.include?(url)
    end
  end
end
+
module Rspider
  # Stores the crawl frontier (discovered/visited urls) in the MySQL
  # `urls` table. An in-memory UrlStorageCache keeps hot urls to cut
  # database round-trips.
  class MysqlUrlStorage
    attr_accessor :cache
    # hash: connection settings ("host", "user", "pass", "db");
    # source: name of the crawling task.
    def initialize(hash,source="default")
      @source=source
      @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
      raise MysqlException if @my.nil?
      @cache=UrlStorageCache.new
    end

    # MD5 hex digest of +string+.
    def md5(string)
      t=Digest::MD5.new
      t << string
      t.to_s
    end

    # Has the url been visited already? Returns true or nil.
    def visited?(url)
      return true if @cache.visited?(url)
      ukey=md5(url)+@source
      crc=Zlib::crc32(url)
      # BUG FIX: the original SQL read "WHERE AND url_crc32=..." — invalid
      # syntax, so every lookup raised and was silently swallowed below,
      # making every url look unvisited.
      sql="SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
      begin
        rs=@my.query(sql)
        rs.each do |r|
          return true if r[0].to_i>0
        end
        return nil
      rescue Mysql::Error => e
        return nil
      end
    end

    # Record a newly discovered url (no-op when already cached).
    def <<(url)
      return nil if @cache.urlStored?(url)
      ukey=md5(url)+@source
      # BUG FIX: was Zlib::crc32(url,@seed) with @seed never initialized in
      # this class; visited()/error() hash without a seed, so the stored
      # CRC must be computed the same way or lookups never match.
      crc=Zlib::crc32(url)
      # url is now escaped via @my.quote, consistent with
      # MysqlUrlRelationStorage#save (urls may contain quote characters).
      sql="INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) VALUES ('"+@my.quote(url)+"','"+@source+"','"+Time.now().to_i.to_s+"','0','"+ukey+"','"+Rspider::UrlScorer.score(url).to_s+"','"+crc.to_s+"')";
      begin
        @my.query(sql)
        @cache.<<(url)
      rescue Mysql::Error,StandardError => e
        # duplicate keys / transient errors are ignored: the url will be
        # rediscovered on a later crawl. (The original also rescued
        # Exception, which would swallow SignalException/SystemExit.)
      end
    end

    # Pop the next url to crawl: cache first, then a random unvisited row.
    # Returns nil when nothing is available.
    def pop()
      url=@cache.pop
      return url unless url.nil?
      sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY RAND() LIMIT 1"
      begin
        rs=@my.query(sql)
        rs.each do |r|
          return r[0]
        end
      rescue Mysql::Error
        return nil
      end
    end

    # Mark the url as visited, storing the visit timestamp.
    # Returns true on success, false on a database error.
    def visited(url)
      @cache.visited(url)
      ukey=md5(url)+@source
      crc=Zlib::crc32(url)
      sql="UPDATE `urls` SET visited='"+Time.now.to_i.to_s+"' WHERE url_crc32='#{crc}' AND `ukey`='"+ukey+"' LIMIT 1"
      begin
        @my.query(sql)
      rescue Mysql::Error
        return false
      end
      return true
    end

    # Penalise a url whose download failed (lower score, bump error count).
    def error(url)
      @cache.error(url)
      ukey=md5(url)+@source
      crc=Zlib::crc32(url)
      sql="UPDATE `urls` SET score=score-3,errors=errors+1 WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
      @my.query(sql)
    end

    # Close the underlying MySQL connection.
    def close
      @my.close
    end
  end
end
|