rspider 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
+ module Rspider
+   class DataWasher
+     # Retrieve an article's title and content from a diff result.
+     def getDiffRows(exampleFile,dataFile)
+       diff_res=%x{diff #{exampleFile} #{dataFile}}
+       lines=diff_res.split("\n")
+       rows=[]
+       cache=""
+       lines.each{|l|
+         if l[0,2] == "--"
+           # hunk separator: ignore
+         elsif l[0,1] == "<"
+           # line only in the example file: ignore
+         elsif l[0,1] == ">"
+           # line only in the data file: collect it
+           cache= cache + l[1,l.length]+"\n"
+         else
+           rows.push cache
+           cache=""
+         end
+       }
+       rows.push cache
+       rows
+     end
+
+     def parseDir(srcDir,destDir)
+       puts "now:parse Directory:#{srcDir}"
+       files=[]
+       Dir.foreach(srcDir){|f|
+         files << f unless f == "." or f == ".."
+       }
+       first=files[0]
+       last=files[files.length-1]
+       k=0
+       if $ENV == "PRO"
+         files.each{|f|
+           unless File.file?(destDir+f)
+             w= (k==0) ? parseText(srcDir+last,srcDir+f) : parseText(srcDir+first,srcDir+f)
+             open(destDir+f,"w+"){|out| out.puts w} unless w.nil?
+           end
+           k=k+1
+         }
+       else
+         i=0
+         files.each{|f|
+           break if i>3
+           diffRows= (k==0) ? getDiffRows(srcDir+last,srcDir+f) : getDiffRows(srcDir+first,srcDir+f)
+           puts "\n\n\n ==============Diff Rows[#{i}]================\n"
+           x=0
+           diffRows.each{ |l|
+             puts "\n+ rows[#{x}]:\n"
+             puts l
+             x=x+1
+           }
+           k=k+1
+           i=i+1
+         }
+       end
+     end
+
+     # Retrieve an article's title and content from a diff result (specific to the Hexun money channel).
+     def parseTextHexun(exampleFile,dataFile)
+       rows=getDiffRows(exampleFile,dataFile)
+       i=0
+       contents=[]
+       rows.each{ |l|
+         if l =~ %r{\s*进入.*吧}       # "enter the ... forum" link marks the end of the article body
+           puts "got the end of content;#{l}"
+           break
+         end
+         next if l =~ %r{^\s*[\d]{1}\*}
+         next if l =~ %r{^\s*上一页\s*}  # "previous page" pager link
+         next if l =~ %r{^\s*下一页\s*}  # "next page" pager link
+         #if l =~ %r{^\s*第[\d]页} and l.length()<25   # "page N" marker
+         #  next
+         #end
+         contents.push l if i>3
+         i = i+1
+       }
+       returns=""
+       returns << rows[1].sub("-理财频道-和讯网","")   # strip the site suffix from the title
+       returns << "::==++\n"
+       returns << contents.join("\n")
+       returns
+     end
+     # Retrieve an article's title and content from a diff result.
+     def parseText(exampleFile,dataFile)
+       if $_SOURCE == "hexun"
+         return parseTextHexun(exampleFile,dataFile)
+       else
+         puts "not hexun"
+       end
+       rows=getDiffRows(exampleFile,dataFile)
+       i=0
+       cur=0
+       rows.each{ |l|
+         if l.length>150
+           cur=i
+           break
+         end
+         i=i+1
+       }
+       return nil if cur==0
+       returns=""
+       returns << rows[1]
+       returns << "::==++\n"
+       returns << rows[cur]
+       returns
+     end
+   end
+ end
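
DataWasher isolates article text by diffing two pages rendered from the same template: shared boilerplate lines cancel out, and only the ">" lines (content unique to the data file) are collected. A minimal driving sketch, assuming the diff(1) command is on PATH and two locally saved pages with hypothetical names sample.html and article.html:

require "rspider"   # assumed top-level require for the gem

washer = Rspider::DataWasher.new
# rows holds runs of lines that appear only in article.html
rows = washer.getDiffRows("sample.html", "article.html")
rows.each_with_index { |row, idx| puts "row #{idx}:\n#{row}" }

$_SOURCE = "hexun"  # selects the Hexun-specific parser inside parseText
text = washer.parseText("sample.html", "article.html")
puts text unless text.nil?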
@@ -0,0 +1,97 @@
+ module Rspider
+   class Document < Hash
+   end
+   class HTMLDocument < Hash
+     # Extract the headline-style outline (contents of <h1>..<h9> tags).
+     def get_headlines(html)
+       r=Regexp.compile('<h[0-9][^>]*>(.*?)<\/h[0-9]>',Regexp::IGNORECASE|Regexp::MULTILINE)
+       lines=[]
+       ms=html.scan(r)
+       return nil if ms.empty?
+       ms.each{|m|
+         lines.push m[0]
+       }
+       lines.join("\n")
+     end
+     # Extract the contents of the <head> section.
+     def get_head_data(html)
+       r=Regexp.compile('<head[^>]*>(.*?)<\/head>',Regexp::IGNORECASE|Regexp::MULTILINE)
+       m=html.scan(r)
+       return nil if m.nil?
+       return nil if m[0].nil?
+       head={}
+       head[:title]=""
+       head[:keywords]=""
+       head[:robots]=""
+       head[:description]=""
+       head[:nofollow]=false
+       head[:noindex]=false
+       head[:base]=""
+
+       h=m[0][0]
+       begin
+         r_robots=/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         robots=h.scan(r_robots)[0][0]
+         head[:robots]=robots
+       rescue
+       end
+       begin
+         r_desc=/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:description]=h.scan(r_desc)[0][0]
+       rescue
+       end
+       begin
+         r_keys=/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:keywords]=h.scan(r_keys)[0][0]
+       rescue
+       end
+       begin
+         r_charset=/<meta +http-equiv *=[\"']?Content-Type[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
+         head[:charset]=h.scan(r_charset)[0][0].split("=").pop
+       rescue
+       end
+       begin
+         r_base=/<base +href *= *[\"']?([^<>'\"]+)[\"']?/im
+         head[:base]=h.scan(r_base)[0][0]
+       rescue
+       end
+       begin
+         r_title=/<title *>(.*?)<\/title *>/im
+         head[:title]=h.scan(r_title)[0][0].gsub("\n","")
+       rescue
+       end
+       begin
+         archives=[]
+         r_archives=/<link +rel *=[\"']?archives[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+         h.scan(r_archives).each{ |l|
+           archives.push l[0]
+         }
+         head[:archives]=archives
+       rescue
+       end
+       begin
+         links=[]
+         r_alternates=/<link +rel *=[\"']?alternate[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
+         h.scan(r_alternates).each{ |l|
+           links.push l[0]
+         }
+         head[:rss_links]=links
+       rescue
+       end
+       # robots is nil when no robots meta tag was captured above
+       robots.downcase.split(",").each{ |j|
+         head[:noindex]=true if j=="noindex"
+         head[:nofollow]=true if j=="nofollow"
+       } unless robots.nil?
+       head
+     end
+   end
+   class RSSItemDocument < Hash
+   end
+   class RSSParser
+     def initialize(rss)
+     end
+   end
+   class PDFDocument < Hash
+   end
+ end
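
A short usage sketch for HTMLDocument; the HTML string is invented for illustration:

doc = Rspider::HTMLDocument.new
html = '<html><head><title>Example</title>' +
       '<meta name="keywords" content="ruby,spider">' +
       '<meta name="robots" content="noindex,nofollow">' +
       '</head><body><h1>Top story</h1></body></html>'
head = doc.get_head_data(html)
puts head[:title]            # => "Example"
puts head[:keywords]         # => "ruby,spider"
puts head[:noindex]          # => true
puts doc.get_headlines(html) # => "Top story"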
@@ -0,0 +1,21 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ module Rspider
+   # These classes extract a document from PDF, HTML, Mail, Doc and XML files.
+   class DocumentExtractor
+     def initialize
+     end
+   end
+   class PDFDocumentExtractor < DocumentExtractor
+   end
+   class DocDocumentExtractor < DocumentExtractor
+   end
+   class TEXTDocumentExtractor < DocumentExtractor
+   end
+   class XMLDocumentExtractor < DocumentExtractor
+   end
+ end
@@ -0,0 +1,35 @@
+ =begin rdoc
+ Author:: aragorn(xurenlu@gmail.com)
+ URL:: http://www.162cm.com/
+ Version:: 1.0.0
+ License:: LGPL
+ =end
+ require "iconv"
+ module Rspider
+   # This class guesses the encoding of an HTML document and converts it from various encodings to UTF-8.
+   class HtmlTidy
+     def initialize()
+       #@defaultCharset="UTF-8"
+     end
+     # Guess the encoding of an HTML document from its meta tags.
+     def guess_encoding(resp)
+       resp[0..400].scan(/content="(.*)"/i).flatten.join("\t").scan(/charset=([a-z0-9\-]+)/i).flatten.join("\t")
+     end
+     # Convert a document from one encoding to another.
+     def iconv(from,to,text)
+       Iconv.new(to.upcase+"//IGNORE",from.upcase+"//IGNORE").iconv(text)
+     end
+     # Guess the encoding of the document and convert it to UTF-8.
+     def tidy(html)
+       encoding = guess_encoding(html).upcase
+       return html if encoding == "UTF-8"
+       # GB2312 pages are handled via the GBK superset; encoding was upcased
+       # above, so the match must be case-insensitive.
+       encoding="GBK" if encoding =~ /gb2312/i
+       encoding="GBK" if encoding =~ /gbk/i
+       iconv(encoding,"UTF-8",html)
+     end
+     def strip_tags()
+     end
+   end
+ end
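
A sketch of the intended use. Note that require "iconv" dates this code to Ruby 1.8; Iconv was deprecated in 1.9 and removed in 2.0, so on a modern Ruby the conversion would need porting to String#encode:

tidy = Rspider::HtmlTidy.new
page = '<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
puts tidy.guess_encoding(page)  # => "gb2312"
utf8 = tidy.tidy(page)          # treated as GBK and converted to UTF-8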
@@ -0,0 +1,49 @@
+ #! /usr/bin/ruby
+ =begin rdoc
+ Simple Logger
+ Written by aragorn(xurenlu@gmail.com)
+ Author:: aragorn(xurenlu@gmail.com)
+ Homepage:: http://www.162cm.com/
+ MSN:: helloasp@hotmail.com
+ =end
+
+ module Rspider
+ =begin rdoc
+ This class logs messages while the program is running.
+
+ Examples:
+
+ * $LOGGER=Logger.new("/var/run/log/cns.log")
+ * $LOGGER.log_msg("Program inited!")
+ * ....
+ * $LOGGER.log_msg("Sockets inited")
+ * $LOGGER.flush_msg();
+ =end
+   class Logger
+     attr_accessor :max,:msgs
+
+     # path: the file path where the logs are stored
+     def initialize(path)
+       @max=10
+       @log_path=path
+       @msgs=[]
+       puts "logs created :#{path}" if $DEBUG
+     end
+     # Add a new log entry. It may not be synced to disk immediately:
+     # the Logger flushes automatically once more than @max messages are
+     # buffered, or right away for an ERROR-level entry.
+     def log_msg(log,level="NOTE")
+       @msgs.push(level+":"+Time.now.to_i.to_s+":"+log)
+       flush_msg if (@msgs.length>@max) or level=="ERROR"
+     end
+     # Sync the buffered logs to disk.
+     def flush_msg()
+       open(@log_path,"a+") do |f|
+         # f.puts "Generated at :"+Time.now.to_s+"\n"
+         f.puts @msgs.join("\n")
+       end
+       @msgs=[]
+       puts "logs flushed ." if $DEBUG
+     end
+   end
+ end
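
A runnable variant of the rdoc example above (the log path is illustrative):

logger = Rspider::Logger.new("/tmp/rspider.log")
logger.max = 2                            # flush once more than 2 messages are buffered
logger.log_msg("Program inited!")
logger.log_msg("Sockets inited")
logger.log_msg("fetch failed", "ERROR")   # ERROR entries force an immediate flush
logger.flush_msg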
@@ -0,0 +1,33 @@
+ #!/usr/bin/ruby -w
+ require "digest/md5"
+ require "zlib"
+ require "mysql"   # ruby-mysql bindings, used below via Mysql::new
+ module Rspider
+   # The MysqlUrlRelationStorage class stores url relations in a Mysql database.
+   # For better performance, we create an UrlStorage object to cache urls in memory.
+   class MysqlUrlRelationStorage
+     def initialize(hash,source="default")
+       @seed=1024
+       @source=source
+       @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+       raise MysqlException if @my.nil?
+     end
+     # Get the MD5 hash of the string param "string".
+     def md5(string)
+       t=Digest::MD5.new
+       t << string
+       t.to_s
+     end
+     # Record that "url" was discovered on the page "referer".
+     def save(referer,url)
+       url_crc=Zlib::crc32(url,@seed)
+       referer_crc=Zlib::crc32(referer,@seed)
+       sql="INSERT INTO url_relations (url,referer,url_crc32,referer_crc32) values('#{@my.quote(url)}','#{@my.quote(referer)}','#{url_crc}','#{referer_crc}')"
+       begin
+         @my.query(sql)
+       rescue Mysql::Error
+       rescue
+       end
+     end
+   end
+ end
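
A usage sketch under assumed connection settings. The class does not create its own table, so the schema in the comment is only inferred from the INSERT statement above:

# Inferred, not shipped with the gem:
#   CREATE TABLE url_relations (url TEXT, referer TEXT,
#                               url_crc32 BIGINT, referer_crc32 BIGINT);
conf = { "host" => "localhost", "user" => "spider",
         "pass" => "secret", "db" => "rspider" }    # hypothetical credentials
store = Rspider::MysqlUrlRelationStorage.new(conf, "demo")
store.save("http://www.162cm.com/", "http://www.162cm.com/about.html")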
@@ -0,0 +1,109 @@
+ #!/usr/bin/ruby -w
+ require "digest/md5"
+ require "zlib"
+ require "mysql"   # ruby-mysql bindings, used below via Mysql::new
+ module Rspider
+   # Extends UrlStorage (defined elsewhere in the gem) with an urlStored? method.
+   class UrlStorageCache < UrlStorage
+     MAX=512
+     STEP=128
+     def urlStored?(url)
+       # Keep the in-memory queues bounded: drop the oldest STEP entries
+       # whenever a queue grows past MAX.
+       STEP.times do
+         @urls.shift
+       end if @urls.length>MAX
+       STEP.times do
+         @visitedUrls.shift
+       end if @visitedUrls.length>MAX
+       @visitedUrls.include?(url) or @urls.include?(url)
+     end
+   end
+   # The MysqlUrlStorage class stores urls in a Mysql database.
+   # For better performance, we create an UrlStorage object to cache urls in memory.
+   class MysqlUrlStorage
+     attr_accessor :cache
+     # Param hash holds the mysql host, database name, user and password.
+     # Param source is the name of the crawling task.
+     def initialize(hash,source="default")
+       @source=source
+       @my=Mysql::new(hash["host"],hash["user"],hash["pass"],hash["db"])
+       raise MysqlException if @my.nil?
+       @cache=UrlStorageCache.new
+     end
+     # Get the MD5 hash of the string param "string".
+     def md5(string)
+       t=Digest::MD5.new
+       t << string
+       t.to_s
+     end
+     # Has the url been visited already?
+     def visited?(url)
+       return true if @cache.visited?(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+       begin
+         rs=@my.query(sql)
+         rs.each do |r|
+           return true if r[0].to_i>0
+         end
+         return nil
+       rescue Mysql::Error => e
+         return nil
+       end
+     end
+     # We discovered a new url, so record it.
+     def <<(url)
+       return nil if @cache.urlStored?(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) VALUES ('"+@my.quote(url)+"','"+@source+"','"+Time.now().to_i.to_s+"','0','"+ukey+"','"+Rspider::UrlScorer.score(url).to_s+"','"+crc.to_s+"')"
+       begin
+         @my.query(sql)
+         @cache.<<(url)
+       rescue Mysql::Error,StandardError => e
+       end
+     end
+     # Get a url to crawl.
+     def pop()
+       #sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY errors asc,score desc,RAND() LIMIT 1"
+       url=@cache.pop
+       return url unless url.nil?
+       sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY RAND() LIMIT 1"
+       begin
+         rs=@my.query(sql)
+         rs.each do |r|
+           return r[0]
+         end
+         return nil
+       rescue Mysql::Error
+         return nil
+       end
+     end
+     # We have crawled a url, so record it.
+     def visited(url)
+       @cache.visited(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="UPDATE `urls` SET visited='"+Time.now.to_i.to_s+"' WHERE url_crc32='#{crc}' AND `ukey`='"+ukey+"' LIMIT 1"
+       begin
+         @my.query(sql)
+       rescue Mysql::Error
+         return false
+       end
+       return true
+     end
+     # We hit an error, so log it against the url.
+     def error(url)
+       @cache.error(url)
+       ukey=md5(url)+@source
+       crc=Zlib::crc32(url)
+       sql="UPDATE `urls` SET score=score-3,errors=errors+1 WHERE url_crc32=#{crc} AND `ukey`='"+ukey+"' LIMIT 1"
+       @my.query(sql)
+     end
+     def close
+       @my.close
+     end
+   end
+ end
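
Putting it together, a sketch of the fetch/mark cycle a crawler might run on top of MysqlUrlStorage; the connection settings and the fetch step are placeholders:

conf = { "host" => "localhost", "user" => "spider",
         "pass" => "secret", "db" => "rspider" }    # hypothetical credentials
urls = Rspider::MysqlUrlStorage.new(conf, "demo")
urls << "http://www.162cm.com/"                     # seed the queue
while url = urls.pop
  begin
    html = fetch(url)     # fetch stands in for the crawler's HTTP client
    urls.visited(url)     # mark done so it is not handed out again
  rescue
    urls.error(url)       # lower its score and count the failure
  end
end
urls.close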