rspider 0.8.4

@@ -0,0 +1,133 @@
+ module Rspider
+   class DataWasher
+     #Extract the title and body rows from the diff between an example article and a data article.
+     def getDiffRows(exampleFile,dataFile)
+       diff_res=%x{diff #{exampleFile} #{dataFile}}
+       lines=diff_res.split("\n")
+       rows=[]
+       cache=""
+       lines.each{|l|
+         if (l[0,2] == "--")
+           #hunk separator emitted by diff: ignore
+         elsif(l[0,1] == "<")
+           #line only present in the example file: ignore
+         elsif(l[0,1] == ">")
+           #line only present in the data file: collect it
+           cache = cache + l[1,l.length] + "\n"
+         else
+           rows.push cache
+           cache=""
+         end
+       }
+       rows.push cache
+       rows
+     end
+
+     def parseDir(srcDir,destDir)
+       puts "now:parse Directory:#{srcDir}"
+       files=[]
+       Dir.foreach(srcDir){|f|
+         files << f unless f == "." or f == ".."
+       }
+       l=files.length
+       first=files[0]
+       last=files[l-1]
+       k=0
+       if $ENV == "PRO"
+         files.each{|f|
+           #diff the first file against the last one, every other file against the first
+           w=nil
+           if (k==0)
+             w=parseText(srcDir+last,srcDir+f) unless File.file?(destDir+f)
+           else
+             w=parseText(srcDir+first,srcDir+f) unless File.file?(destDir+f)
+           end
+           File.open(destDir+f,"w+"){|o| o.puts w} unless w.nil?
+           k=k+1
+         }
+       else
+         i=0
+         files.each{|f|
+           if (i>3)
+             break
+           end
+           diffRows=[]
+           if (k==0)
+             diffRows= getDiffRows(srcDir+last,srcDir+f)
+           else
+             diffRows= getDiffRows(srcDir+first,srcDir+f)
+           end
+           puts "\n\n\n ==============Diff Rows[#{i}]================\n"
+           x=0
+           diffRows.each{ |l|
+             puts "\n+ rows[#{x}]:\n"
+             puts l
+             x=x+1
+           }
+           k=k+1
+           i=i+1
+         }
+       end
+     end
+
+     #Extract the title and content from the diff result (specific to the Hexun Money channel)
+     def parseTextHexun(exampleFile,dataFile)
+       rows=getDiffRows(exampleFile,dataFile)
+       i=0
+       contents=[]
+       rows.each{ |l|
+         if l =~ %r{\s*进入.*吧}
+           puts "got the end of content:#{l}"
+           break
+         end
+         if l =~ %r{^\s*[\d]{1}\*}
+           next
+         end
+         if l =~ %r{^\s*上一页\s*}
+           next
+         end
+         if l =~ %r{^\s*下一页\s*}
+           next
+         end
+         #if l =~ %r{^\s*第[\d]页} and l.length()<25
+         #  next
+         #end
+         contents.push l if i>3
+         i = i+1
+       }
+       returns=""
+       returns << rows[1].sub("-理财频道-和讯网","")
+       returns << "::==++\n"
+       returns << contents.join("\n")
+       returns
+     end
+
+     #Extract the title and content from the diff result of two articles.
+     def parseText(exampleFile,dataFile)
+       if $_SOURCE == "hexun"
+         return parseTextHexun(exampleFile,dataFile)
+       else
+         puts "not hexun"
+       end
+       rows=getDiffRows(exampleFile,dataFile)
+       i=0
+       cur=0
+       rows.each{ |l|
+         #the first row longer than 150 characters is taken as the article body
+         if(l.length>150)
+           cur=i
+           break
+         end
+         i=i+1
+       }
+       if cur==0
+         return nil
+       end
+       returns=""
+       returns << rows[1]
+       returns << "::==++\n"
+       returns << rows[cur]
+       returns
+     end
+   end
+ end
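
DataWasher shells out to the diff(1) command and keeps only the rows unique to the data page, so the shared site template drops away. A minimal usage sketch (the directory layout and file names are hypothetical; a Unix-like system with diff on the PATH is assumed):

washer = Rspider::DataWasher.new
rows = washer.getDiffRows("/tmp/pages/example.html", "/tmp/pages/article.html")
rows.each_with_index{ |row,i| puts "row[#{i}]: #{row}" }
article = washer.parseText("/tmp/pages/example.html", "/tmp/pages/article.html")

parseText returns the title and body joined by the "::==++\n" marker, or nil when no diff row looks long enough to be an article body.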
@@ -0,0 +1,101 @@
+ module Rspider
+   class Document < Hash
+   end
+   class HTMLDocument < Hash
+     #Extract headlines (the text of h1..h9 tags) as an outline
+     def get_headlines(html)
+       r=Regexp.compile('<h[0-9][^>]*>(.*?)<\/h[0-9]>',Regexp::IGNORECASE|Regexp::MULTILINE)
+       lines=[]
+       ms=html.scan(r)
+       return nil if ms.empty?
+       ms.each{|m|
+         lines.push m[0]
+       }
+       lines.join("\n")
+     end
+     #Extract the metadata of the head section
+     def get_head_data(html)
+       r=Regexp.compile('<head[^>]*>(.*?)<\/head>',Regexp::IGNORECASE|Regexp::MULTILINE)
+       m=html.scan(r)
+       return nil if m.empty?
+       return nil if m[0].nil?
+       head={}
+       head[:title]=""
+       head[:keywords]=""
+       head[:robots]=""
+       head[:description]=""
+       head[:nofollow]=false
+       head[:noindex]=false
+       head[:base]=""
+
+
+       h=m[0][0]
+       begin
+         r_robots=/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:robots]=h.scan(r_robots)[0][0]
+       rescue
+         #tag absent: scan[0] is nil and [0] raises, so the default value is kept
+       end
+       begin
+         r_desc=/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:description]=h.scan(r_desc)[0][0]
+       rescue
+       end
+       begin
+         r_keys=/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:keywords]=h.scan(r_keys)[0][0]
+       rescue
+       end
+
+       begin
+         r_charset=/<meta +http-equiv *=[\"']?Content-Type[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:charset]=h.scan(r_charset)[0][0].split("=").pop
+       rescue
+       end
+
+       begin
+         r_base=/<base +href *= *[\"']?([^<>'\"]+)[\"']?/im
+         head[:base]=h.scan(r_base)[0][0]
+       rescue
+       end
+       begin
+         r_title=/<title *>(.*?)<\/title *>/im
+         head[:title]=h.scan(r_title)[0][0].gsub("\n","")
+       rescue
+       end
+
+       begin
+         archives=[]
+         r_archives=/<link +rel *=[\"']?archives[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+         h.scan(r_archives).each{ |l|
+           archives.push l[0]
+         }
+         head[:archives]=archives
+       rescue
+       end
+       begin
+         links=[]
+         r_alternates=/<link +rel *=[\"']?alternate[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+         h.scan(r_alternates).each{ |l|
+           links.push l[0]
+         }
+         head[:rss_links]=links
+       rescue
+       end
+       #an empty :robots splits to [], so the flags keep their defaults
+       head[:robots].downcase.split(",").each{ |j|
+         head[:noindex]=true if j=="noindex"
+         head[:nofollow]=true if j=="nofollow"
+       }
+       head
+     end
+   end
+   class RSSItemDocument < Hash
+   end
+   class RSSParser
+     def initialize(rss)
+     end
+   end
+   class PDFDocument < Hash
+   end
+ end
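
The head parsing is regexp-based rather than a full HTML parse, so it tolerates pages that stricter parsers reject. A sketch of the call pattern (fetching with open-uri is just one assumed option; any HTTP client that yields the page body works):

require "open-uri"
doc = Rspider::HTMLDocument.new
html = open("http://www.162cm.com/").read
head = doc.get_head_data(html)
puts head[:title]
puts head[:keywords]
puts head[:charset]
puts doc.get_headlines(html)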
@@ -0,0 +1,21 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ module Rspider
+   #These classes extract a document from PDF, HTML, Mail, Doc and XML files
+   class DocumentExtractor
+     def initialize
+     end
+   end
+   class PDFDocumentExtractor < DocumentExtractor
+   end
+   class DocDocumentExtractor < DocumentExtractor
+   end
+   class TEXTDocumentExtractor < DocumentExtractor
+   end
+   class XMLDocumentExtractor < DocumentExtractor
+   end
+ end
@@ -0,0 +1,35 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ require "iconv"
+ module Rspider
+   #This class guesses the encoding of an HTML document and converts it from various encodings to UTF-8
+   class HtmlTidy
+
+     def initialize()
+       #@defaultCharset="UTF-8"
+     end
+     #guess the encoding of an HTML document from its meta tags
+     def guess_encoding(resp)
+       resp[0..400].scan(/content="(.*)"/i).flatten.join("\t").scan(/charset=([a-z0-9\-]+)/i).flatten.join("\t")
+     end
+     #convert document text from one encoding to another, dropping unconvertible bytes
+     def iconv(from,to,text)
+       Iconv.new(to.upcase+"//IGNORE",from.upcase+"//IGNORE").iconv(text)
+     end
+     #guess the encoding of the document and convert it to UTF-8
+     def tidy(html)
+       encoding = guess_encoding(html).upcase
+       return html if encoding == "UTF-8"
+       #encoding was upcased above, so match case-insensitively
+       encoding="GBK" if encoding =~ /gb2312/i
+       encoding="GBK" if encoding =~ /gbk/i
+       iconv(encoding,"UTF-8",html)
+     end
+     def strip_tags()
+     end
+   end
+ end
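
A sketch of normalizing a fetched page to UTF-8 (gbk_html stands for a hypothetical page body obtained elsewhere):

tidy = Rspider::HtmlTidy.new
utf8_html = tidy.tidy(gbk_html)
puts tidy.guess_encoding(gbk_html)   #=> e.g. "gb2312"

Note that guess_encoding only inspects the first 400 characters of the response, so a charset declared later in the head will not be found.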
@@ -0,0 +1,49 @@
+ #! /usr/bin/ruby
+ =begin rdoc
+ Simple Logger
+ Written by aragorn(xurenlu@gmail.com)
+ Author:: aragorn(xurenlu@gmail.com)
+ Homepage:: http://www.162cm.com/
+ MSN:: helloasp@hotmail.com
+ =end
+
+ module Rspider
+ =begin rdoc
+ This class logs messages while the program is running.
+
+ Examples:
+
+ * $LOGGER=Logger.new("/var/run/log/cns.log")
+ * $LOGGER.log_msg("Program inited!")
+ * ....
+ * $LOGGER.log_msg("Sockets inited")
+ * $LOGGER.flush_msg();
+ =end
+   class Logger
+     attr_accessor :max,:msgs
+
+     #path: the file path to store the logs
+     def initialize(path)
+       @max=10
+       @log_path=path
+       @msgs=[]
+       puts "logs created :#{path}" if $DEBUG
+     end
+     #add a new log entry
+     #the entry may not be synced to disk immediately:
+     #the Logger flushes automatically once more than @max messages are queued, or on ERROR
+     def log_msg(log,level="NOTE")
+       @msgs.push(level+":"+Time.now.to_i.to_s+":"+log)
+       flush_msg if (@msgs.length>@max) or level=="ERROR"
+     end
+     #sync the buffered logs to disk
+     def flush_msg()
+       open(@log_path,"a+") do |f|
+         # f.puts "Generated at :"+Time.now.to_s+"\n"
+         f.puts @msgs.join("\n")
+       end
+       @msgs=[]
+       puts "logs flushed ." if $DEBUG
+     end
+   end
+ end
@@ -0,0 +1,32 @@
+ #!/usr/bin/ruby -w
+ require "digest/md5"
+ require "zlib"
+ require "mysql"  #assumed: the classic ruby-mysql binding that provides Mysql::new and quote
+ module Rspider
+   #The class MysqlUrlRelationStorage stores url relations (referer,url pairs) in a Mysql database
+   class MysqlUrlRelationStorage
+     def initialize(hash,source="default")
+       @seed=1024
+       @source=source
+       @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+       raise MysqlException if @my.nil?
+     end
+     #get the MD5 hash of string param "string"
+     def md5(string)
+       t=Digest::MD5.new
+       t << string
+       t.to_s
+     end
+     def save(referer,url)
+       url_crc=Zlib::crc32(url,@seed)
+       referer_crc=Zlib::crc32(referer,@seed)
+       sql="INSERT INTO url_relations (url,referer,url_crc32,referer_crc32) values('#{@my.quote(url)}','#{@my.quote(referer)}','#{url_crc}','#{referer_crc}')"
+       begin
+         @my.query(sql)
+       rescue Mysql::Error
+         #duplicate or malformed rows are silently dropped
+       rescue
+       end
+     end
+   end
+ end
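
A usage sketch (the connection values are placeholders, and a url_relations table with the four columns named in the INSERT is assumed to exist):

conf = {"host"=>"localhost","user"=>"spider","pass"=>"secret","db"=>"rspider"}
rel = Rspider::MysqlUrlRelationStorage.new(conf)
rel.save("http://www.162cm.com/", "http://www.162cm.com/archives/1.html")

Failed inserts are swallowed by the rescue clauses, so save never raises on bad rows.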
@@ -0,0 +1,110 @@
+ #!/usr/bin/ruby -w
+ require "digest/md5"
+ require "zlib"
+ require "mysql"  #assumed: the classic ruby-mysql binding
+ module Rspider
+   #UrlStorageCache extends UrlStorage (defined elsewhere in Rspider) with the urlStored? method
+   class UrlStorageCache < UrlStorage
+     MAX=512
+     STEP=128
+     def urlStored?(url)
+       #evict the oldest entries so the in-memory cache stays bounded
+       STEP.times do
+         @urls.shift
+       end if @urls.length>MAX
+       STEP.times do
+         @visitedUrls.shift
+       end if @visitedUrls.length>MAX
+       @visitedUrls.include?(url) or @urls.include?(url)
+     end
+   end
+   #The class MysqlUrlStorage stores urls in a Mysql database
+   #For better performance, we create an UrlStorageCache object to cache urls in memory
+   class MysqlUrlStorage
+     attr_accessor :cache
+     #Param hash is a Hash with the keys "host","db","user" and "pass"
+     #Param source is the name of the crawling task
+     def initialize(hash,source="default")
+       @seed=1024
+       @source=source
+       @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+       raise MysqlException if @my.nil?
+       @cache=UrlStorageCache.new
+     end
+     #get the MD5 hash of string param "string"
+     def md5(string)
+       t=Digest::MD5.new
+       t << string
+       t.to_s
+     end
+     #ask whether the url has already been visited
+     def visited?(url)
+       return true if @cache.visited?(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+       begin
+         rs=@my.query(sql)
+         rs.each do |r|
+           return true if r[0].to_i>0
+         end
+         return nil
+       rescue Mysql::Error => e
+         return nil
+       end
+     end
+     #we discovered a new url, so we record it
+     def <<(url)
+       return nil if @cache.urlStored?(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url,@seed)
+       sql="INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) VALUES ('"+@my.quote(url)+"','"+@source+"','"+Time.now().to_i.to_s+"','0','"+ukey+"','"+Rspider::UrlScorer.score(url).to_s+"','"+crc.to_s+"')"
+       begin
+         @my.query(sql)
+         @cache.<<(url)
+       rescue Mysql::Error,StandardError,Exception => e
+         #duplicate urls violate the unique key and end up here; ignore them
+       end
+     end
+     #get a url to crawl
+     def pop()
+       #sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY errors asc,score desc,RAND() LIMIT 1"
+       url=@cache.pop
+       return url unless url.nil?
+       sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY RAND() LIMIT 1"
+       begin
+         rs=@my.query(sql)
+         rs.each do |r|
+           return r[0]
+         end
+         return nil
+       rescue Mysql::Error
+         return nil
+       end
+     end
+     #we have crawled a url, so we record it
+     def visited(url)
+       @cache.visited(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="UPDATE `urls` SET visited='"+Time.now.to_i.to_s+"' WHERE url_crc32='#{crc}' AND `ukey`='"+ukey+"' LIMIT 1"
+       begin
+         @my.query(sql)
+       rescue Mysql::Error
+         return false
+       end
+       return true
+     end
+     #we met an error, so we record it
+     def error(url)
+       @cache.error(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="UPDATE `urls` SET score=score-3,errors=errors+1 WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+       @my.query(sql)
+     end
+     def close
+       @my.close
+     end
+   end
+ end
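
A sketch of the crawl loop these methods are built for (connection values are placeholders; a urls table matching the INSERT/UPDATE columns above, plus the UrlStorage and UrlScorer classes from the rest of the gem, are assumed):

conf = {"host"=>"localhost","user"=>"spider","pass"=>"secret","db"=>"rspider"}
store = Rspider::MysqlUrlStorage.new(conf, "hexun")
store << "http://www.162cm.com/"        #enqueue a seed url
while url = store.pop                   #RAND() ordering spreads repeated failures
  begin
    #fetch url, extract new links, feed them back with store.<<
    store.visited(url)                  #mark it done
  rescue
    store.error(url)                    #lower its score and count the error
  end
end
store.close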