rspider 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0

data/lib/rspider/DataWasher.rb

@@ -0,0 +1,129 @@
+module Rspider
+  class DataWasher
+    # Recover the title and content of an article from the diff result.
+    def getDiffRows(exampleFile,dataFile)
+      diff_res=%x{diff #{exampleFile} #{dataFile}}
+      lines=diff_res.split("\n")
+      row_id=1
+      rows=[]
+      cache=""
+      lines.each{|l|
+        if (l[0,2] == "--")
+        elsif(l[0,1] == "<")
+        elsif(l[0,1] == ">")
+          cache= cache + l[1,l.length]+"\n"
+        else
+          rows.push cache
+          cache=""
+        end
+      }
+      rows.push cache
+      rows
+    end
+
+    def parseDir(srcDir,destDir)
+      puts "now:parse Directory:#{srcDir}"
+      files=[]
+      Dir.foreach(srcDir){|f|
+        files << f unless f == "." or f == ".."
+      }
+      l=files.length
+      first=files[0]
+      last=files[l-1]
+      k=0
+      if $ENV == "PRO"
+        files.each{|f|
+          if (k==0)
+            w=parseText(srcDir+last,srcDir+f) unless File.file?(destDir+f)
+            open(destDir+f,"w+").puts w unless w.nil?
+          else
+            w=parseText(srcDir+first,srcDir+f) unless File.file?(destDir+f)
+            open(destDir+f,"w+").puts w unless w.nil?
+          end
+          k=k+1
+        }
+      else
+        i=0
+        files.each{|f|
+          if (i>3)
+            break
+          end
+          diffRows=[]
+          if (k==0)
+            diffRows= getDiffRows(srcDir+last,srcDir+f)
+          else
+            diffRows= getDiffRows(srcDir+first,srcDir+f)
+          end
+          puts "\n\n\n ==============Diff Rows[#{i}]================\n"
+          x=0
+          diffRows.each{ |l|
+            puts "\n+ rows[#{x}]:\n"
+            puts l
+            x=x+1
+          }
+          k=k+1
+          i=i+1
+        }
+      end
+    end
+
+    # Recover the title and content from the diff result
+    # (specific to Hexun's money channel).
+    def parseTextHexun(exampleFile,dataFile)
+      rows=getDiffRows(exampleFile,dataFile)
+      i=0
+      cur=0
+      contents=[]
+      rows.each{ |l|
+        if l =~ %r{\s*进入.*吧}
+          puts "got the end of content;#{l}"
+          break
+        end
+        if l =~ %r{^\s*[\d]{1}\*}
+          next
+        end
+        if l =~ %r{^\s*上一页\s*}
+          next
+        end
+        if l =~ %r{^\s*下一页\s*}
+          next
+        end
+        #if l =~ %r{^\s*第[\d]页} and l.length()<25
+        #  next
+        #end
+        contents.push l if i>3
+        i = i+1
+      }
+      returns=""
+      returns << rows[1].sub("-理财频道-和讯网","")
+      returns << "::==++\n"
+      returns << contents.join("\n")
+      returns
+    end
+    # Recover the title and content of an article from the diff result.
+    def parseText(exampleFile,dataFile)
+      if $_SOURCE == "hexun"
+        return parseTextHexun(exampleFile,dataFile)
+      else
+        puts "not hexun"
+      end
+      rows=getDiffRows(exampleFile,dataFile)
+      i=0
+      cur=0
+      rows.each{ |l|
+        if(l.length>150 )
+          cur=i
+          break
+        end
+        i=i+1
+      }
+      if cur==0
+        return nil
+      end
+      returns=""
+      returns << rows[1]
+      returns << "::==++\n"
+      returns << rows[cur]
+      returns
+    end
+  end
+end
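
DataWasher strips boilerplate by diffing an article page against a sibling page built from the same template: only the lines unique to the article (the "> " lines in diff output) survive, grouped into one string per changed region. A minimal sketch of that contract, assuming the gem's classes load via require "rspider"; the file contents here are illustrative:

    require "rspider"
    require "tempfile"

    # Two pages sharing a template; only the story text differs.
    template = Tempfile.new("template")
    template.puts ["<html>", "<div>site header</div>", "<div>TEMPLATE BODY</div>", "</html>"]
    template.flush
    article = Tempfile.new("article")
    article.puts ["<html>", "<div>site header</div>", "<div>the actual story text</div>", "</html>"]
    article.flush

    washer = Rspider::DataWasher.new
    rows = washer.getDiffRows(template.path, article.path)
    # One string per diff hunk; the shared header/footer lines are gone.
    rows.each_with_index { |r, i| puts "row #{i}: #{r.inspect}" }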

data/lib/rspider/Document.rb

@@ -0,0 +1,100 @@
+module Rspider
+  class Document < Hash
+  end
+  class HTMLDocument < Hash
+    # Extract headline-style outline elements (<h1>..<h9>).
+    def get_headlines(html)
+      r=Regexp.compile('<h[0-9][^>]*>(.*?)<\/h[0-9]>',Regexp::IGNORECASE|Regexp::MULTILINE)
+      lines=[]
+      ms=html.scan(r)
+      return nil if ms.nil?
+      ms.each{|m|
+        lines.push m[0]
+      }
+      lines.join("\n")
+    end
+    # Extract the contents of the <head> section.
+    def get_head_data(html)
+      r=Regexp.compile('<head[^>]*>(.*?)<\/head>',Regexp::IGNORECASE|Regexp::MULTILINE)
+      m=html.scan(r)
+      return nil if m.nil?
+      return nil if m[0].nil?
+      head={}
+      head[:title]=""
+      head[:keywords]=""
+      head[:robots]=""
+      head[:description]=""
+      head[:nofollow]=false
+      head[:noindex]=false
+      head[:base]=""
+
+
+      h=m[0][0]
+      begin
+        r_robots=/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+        robots=h.scan(r_robots)[0][0]
+        head[:robots]=robots
+      rescue
+      end
+      begin
+        r_desc=/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+        head[:description]=h.scan(r_desc)[0][0]
+      rescue
+      end
+      begin
+        r_keys=/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+        head[:keywords]=h.scan(r_keys)[0][0]
+      rescue
+      end
+
+      begin
+        r_charset=/<meta +http\-equiv*=[\"']?Content-Type[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+        head[:charset]=h.scan(r_charset)[0][0].split("=").pop
+      rescue
+      end
+
+      begin
+        r_base=/<base +href *= *[\"']?([^<>'\"]+)[\"']?/im
+        head[:base]=h.scan(r_base)[0][0]
+      rescue
+      end
+      begin
+        r_title=/<title *>(.*?)<\/title*>/im
+        head[:title]=h.scan(r_title)[0][0].gsub("\n","")
+      rescue
+      end
+
+      begin
+        archives=[]
+        r_archives=/<link +rel*=[\"']?archives[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+        h.scan(r_archives).each{ |l|
+          archives.push l[0]
+        }
+        head[:archives]=archives
+      rescue
+      end
+      begin
+        links=[]
+        r_alternates=/<link +rel*=[\"']?alternate[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+        h.scan(r_alternates).each{ |l|
+          links.push l[0]
+        }
+        head[:rss_links]=links
+      rescue
+      end
+      robots.downcase.split(",").each{ |j|
+        head[:noindex]=true if j=="noindex"
+        head[:nofollow]=true if j=="nofollow"
+      } unless robots.nil?
+      head
+    end
+  end
+  class RSSItemDocument < Hash
+  end
+  class RSSParser
+    def initialize(rss)
+    end
+  end
+  class PDFDocument < Hash
+  end
+end
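
A minimal sketch of HTMLDocument in use, assuming require "rspider" loads the class; the HTML string is illustrative:

    require "rspider"

    html = '<html><head><title>Example</title>' \
           '<meta name="robots" content="noindex,nofollow">' \
           '<meta name="keywords" content="spider,ruby">' \
           '</head><body><h1>Hello</h1></body></html>'

    doc  = Rspider::HTMLDocument.new
    head = doc.get_head_data(html)
    head[:title]            # => "Example"
    head[:keywords]         # => "spider,ruby"
    head[:noindex]          # => true (parsed from the robots meta tag)
    doc.get_headlines(html) # => "Hello"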

data/lib/rspider/DocumentExtractor.rb

@@ -0,0 +1,21 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL:: http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+module Rspider
+  # These classes extract a document from PDF, HTML, Mail, Doc and XML files.
+  class DocumentExtractor
+    def initialize
+    end
+  end
+  class PDFDocumentExtractor < DocumentExtractor
+  end
+  class DocDocumentExtractor < DocumentExtractor
+  end
+  class TEXTDocumentExtractor < DocumentExtractor
+  end
+  class XMLDocumentExtractor < DocumentExtractor
+  end
+end

data/lib/rspider/HtmlTidy.rb

@@ -0,0 +1,34 @@
+=begin rdoc
+Author:: aragorn(xurenlu@gmail.com)
+URL:: http://www.162cm.com/
+Version:: 1.0.0
+License:: LGPL
+=end
+require "iconv"
+module Rspider
+  # This class guesses the encoding of an HTML document and converts it
+  # from various encodings to UTF-8.
+  class HtmlTidy
+
+    def initialize()
+      #@defaultCharset="UTF-8"
+    end
+    # Guess the encoding of an HTML document.
+    def guess_encoding(resp)
+      resp[0..400].scan(/content="(.*)"/i).flatten.join("\t").scan(/charset=([a-z0-9\-]+)/i).flatten.join("\t")
+    end
+    # Convert a document from one encoding to another.
+    def iconv(from,to,text)
+      Iconv.new(to.upcase+"//IGNORE",from.upcase+"//IGNORE").iconv(text)
+    end
+    # Guess the encoding of the document and convert it to UTF-8.
+    def tidy(html)
+      encoding = guess_encoding(html).upcase
+      return html if encoding == "UTF-8"
+      encoding="GBK" if encoding =~ /gb2312/i
+      encoding="GBK" if encoding =~ /gbk/i
+      iconv(encoding,"UTF-8",html)
+    end
+    def strip_tags()
+    end
+  end
+end
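
HtmlTidy reads the charset out of the first few hundred bytes of markup and then converts through Iconv (which dates the gem to the Ruby 1.8 era; on modern Rubies String#encode replaces it). A minimal sketch, assuming require "rspider"; the page fragment is illustrative:

    require "rspider"

    page = '<head><meta http-equiv="Content-Type" ' \
           'content="text/html; charset=gb2312"></head>...'

    tidy = Rspider::HtmlTidy.new
    tidy.guess_encoding(page)  # => "gb2312"
    utf8 = tidy.tidy(page)     # re-encoded GBK -> UTF-8 via Iconv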

data/lib/rspider/Logger.rb

@@ -0,0 +1,49 @@
+#! /usr/bin/ruby
+=begin rdoc
+Simple Logger
+Written by aragorn(xurenlu@gmail.com)
+Author:: aragorn(xurenlu@gmail.com)
+Homepage:: http://www.162cm.com/
+MSN:: helloasp@hotmail.com
+=end
+
+module Rspider
+=begin rdoc
+This class logs messages while the program is running.
+
+Examples:
+
+* $LOGGER=Logger.new("/var/run/log/cns.log")
+* $LOGGER.log_msg("Program inited!")
+* ....
+* $LOGGER.log_msg("Sockets inited")
+* $LOGGER.flush_msg();
+=end
+  class Logger
+    attr_accessor :max,:msgs
+
+    # path: the file path where the logs are stored
+    def initialize(path)
+      @max=10
+      @log_path=path
+      @msgs=[]
+      puts "logs created :#{path}" if $DEBUG
+    end
+    # Add a new log entry. The entry may not be synced to disk
+    # immediately: the Logger flushes automatically once more than
+    # @max messages are buffered, or when an ERROR is logged.
+    def log_msg(log,level="NOTE")
+      @msgs.push(level+":"+Time.now.to_i.to_s+":"+log)
+      flush_msg if (@msgs.length>@max) or level=="ERROR"
+    end
+    # Sync the buffered logs to disk.
+    def flush_msg()
+      open(@log_path,"a+") do |f|
+        # f.puts "Generated at :"+Time.now.to_s+"\n"
+        f.puts @msgs.join("\n")
+      end
+      @msgs=[]
+      puts "logs flushed ." if $DEBUG
+    end
+  end
+end

data/lib/rspider/MysqlUrlRelationStorage.rb

@@ -0,0 +1,31 @@
+#!/usr/bin/ruby -w
+require "digest/md5"
+require "zlib"
+module Rspider
+  # The MysqlUrlRelationStorage class stores url relations in a MySQL
+  # database. For better performance, an UrlStorage object caches urls
+  # in memory.
+  class MysqlUrlRelationStorage
+    def initialize(hash,source="default")
+      @seed=1024
+      @source=source
+      @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+      raise MysqlException if @my.nil?
+    end
+    # Get the MD5 hash of the string param "string".
+    def md5(string)
+      t=Digest::MD5.new
+      t << string
+      t.to_s
+    end
+    # Record that the page at `referer` links to `url`.
+    def save(referer,url)
+      url_crc=Zlib::crc32(url,@seed)
+      referer_crc=Zlib::crc32(referer,@seed)
+      sql="INSERT INTO url_relations (url,referer,url_crc32,referer_crc32) values('#{@my.quote(url)}','#{@my.quote(referer)}','#{url_crc}','#{referer_crc}')"
+      begin
+        @my.query(sql)
+      rescue Mysql::Error
+      rescue
+      end
+    end
+  end
+end
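
A minimal usage sketch; the connection values are placeholders, and the url_relations table is presumably the one created by the schema in data/sql/db.sql:

    require "rspider"

    conf = { "host" => "localhost", "user" => "spider",
             "pass" => "secret",    "db"   => "rspider" }
    rel = Rspider::MysqlUrlRelationStorage.new(conf, "mytask")
    # Record that /a.html (the referer) links out to /b.html:
    rel.save("http://example.com/a.html", "http://example.com/b.html")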

data/lib/rspider/MysqlUrlStorage.rb

@@ -0,0 +1,107 @@
+#!/usr/bin/ruby -w
+require "digest/md5"
+require "zlib"
+module Rspider
+  # We re-define the UrlStorage class, adding the urlStored? method.
+  class UrlStorageCache < UrlStorage
+    MAX=512
+    STEP=128
+    @seed=1024
+    def urlStored?(url)
+      STEP.times do
+        @urls.shift
+      end if @urls.length>MAX
+      STEP.times do
+        @visitedUrls.shift
+      end if @visitedUrls.length>MAX
+      @visitedUrls.include?(url) or @urls.include?(url)
+    end
+  end
+  # The MysqlUrlStorage class stores urls in a MySQL database.
+  # For better performance, an UrlStorage object caches urls in memory.
+  class MysqlUrlStorage
+    attr_accessor :cache
+    # Param hash holds the mysql host, database name, user and password.
+    # Param source is the name of the crawling task.
+    def initialize(hash,source="default")
+      @source=source
+      @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+      raise MysqlException if @my.nil?
+      @cache=UrlStorageCache.new
+    end
+    # Get the MD5 hash of the string param "string".
+    def md5(string)
+      t=Digest::MD5.new
+      t << string
+      t.to_s
+    end
+    # Has this url been visited already?
+    def visited?(url)
+      return true if @cache.visited?(url)
+      ukey=md5(url)+@source
+      crc=Zlib::crc32(url)
+      sql="SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+      begin
+        rs=@my.query(sql)
+        rs.each do |r|
+          return true if r[0].to_i>0
+        end
+        return nil
+      rescue Mysql::Error => e
+        return nil
+      end
+    end
+    # We discovered a new url; record it.
+    def <<(url)
+      return nil if @cache.urlStored?(url)
+      ukey=md5(url)+@source
+      crc=Zlib::crc32(url,@seed)
+      sql="INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) VALUES ('"+url+"','"+@source+"','"+Time.now().to_i.to_s+"','0','"+ukey+"','"+Rspider::UrlScorer.score(url).to_s+"','"+crc.to_s+"')";
+      begin
+        @my.query(sql)
+        @cache.<<(url)
+      rescue Mysql::Error,StandardError,Exception => e
+      else
+      end
+    end
+    # Fetch a url to crawl.
+    def pop()
+      #sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY errors asc,score desc,RAND() LIMIT 1"
+      url=@cache.pop
+      return url unless url.nil?
+      sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY RAND() LIMIT 1"
+      begin
+        rs=@my.query(sql)
+        rs.each do |r|
+          return r[0]
+        end
+      rescue Mysql::Error
+        return nil
+      end
+    end
+    # We have crawled a url, so record it as visited.
+    def visited(url)
+      @cache.visited(url)
+      ukey=md5(url)+@source
+      crc=Zlib::crc32(url)
+      sql="UPDATE `urls` SET visited='"+Time.now.to_i.to_s+"' WHERE url_crc32='#{crc}' AND `ukey`='"+ukey+"' LIMIT 1"
+      begin
+        @my.query(sql)
+      rescue Mysql::Error
+        return false
+      end
+      return true
+    end
+    # We hit an error on this url; log it and lower its score.
+    def error(url)
+      @cache.error(url)
+      ukey=md5(url)+@source
+      crc=Zlib::crc32(url)
+      sql="UPDATE `urls` SET score=score-3,errors=errors+1 WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+      @my.query(sql)
+    end
+    def close
+      @my.close
+    end
+  end
+end
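
Taken together, these methods form the queue contract a crawl loop runs against. A minimal sketch with placeholder connection values and a stubbed-out fetch step:

    require "rspider"

    conf  = { "host" => "localhost", "user" => "spider",
              "pass" => "secret",    "db"   => "rspider" }
    store = Rspider::MysqlUrlStorage.new(conf, "mytask")

    store << "http://example.com/"      # enqueue a seed url
    while url = store.pop               # a random unvisited url, or nil
      begin
        # ... fetch `url`, extract links, feed them back via store << link
        store.visited(url)              # timestamp it as done
      rescue
        store.error(url)                # score -3, errors +1 in the urls table
      end
    end
    store.close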