rspider 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
data/Changelog
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
--0.8.3 Thu Sep 11 21:22:02 CST 2008
|
3
|
+
1.add timeout field when fetching a page
|
4
|
+
2.package published as gem
|
5
|
+
--0.8.2 Tue Sep 9 00:18:10 CST 2008
|
6
|
+
1.move url Grabber functions to Spider class, so a Spider object can extract links itself;
|
7
|
+
2.Add Url-relation-Storage support
|
8
|
+
--0.8.1
|
9
|
+
1. Add range field for Spider
|
10
|
+
So we avoid downloading very large files
|
11
|
+
2. Add Cookie support for spider
|
12
|
+
--0.8.0 Sat Sep 6 00:35:01 CST 2008
|
13
|
+
1.optimized MysqlUrlStorage: added a memory cache, so it does not trigger so many mysql
|
14
|
+
duplicate key errors
|
15
|
+
2.Optimized urlGrabber, which now generates fewer wrong urls.
|
16
|
+
--0.7.9 Wed Sep 3 02:06:13 CST 2008
|
17
|
+
1.add Local MysqlUrlStorage
|
18
|
+
2.Add local Mysql Content Storage
|
19
|
+
3.add Url score support; low-score urls have a lower chance of being crawled.
|
20
|
+
--0.7.8
|
21
|
+
1.Add logger to log the user interrupts,download failings...
|
22
|
+
2.Add SiteLocker to ensure the same site is not crawled too frequently.
|
23
|
+
--0.7.7
|
24
|
+
CharsetGuess added; you can now handle GBK files and store them as UTF-8
|
25
|
+
--0.7.6
|
26
|
+
ContentStorage added,Downloader switch to Net/HTTP
|
27
|
+
--0.7.5
|
28
|
+
added callback support (not completed)
|
29
|
+
--0.7.4
|
30
|
+
add UrlStorage callback object
|
31
|
+
--0.7.3
|
32
|
+
robotRules added
|
data/Rakefile
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Rakefile: builds the rspider gem (plus zip and tar archives) from the
# files listed in DIST_FILES.
require 'rake/gempackagetask'

# The name of your project
PROJECT = "rspider"
# Your name, used in packaging.
MY_NAME = "Renlu Xu"
# Your email address, used in packaging.
MY_EMAIL = "xurenlu@gmail.com"
# Short summary of your project, used in packaging.
PROJECT_SUMMARY = "Web cralwer"
# The project's package name (as opposed to its display name). Used for
# RubyForge connectivity and packaging.
UNIX_NAME = "rspider"
# Your RubyForge user name. This assignment was already disabled in the
# original file (it sat inside a trailing comment), so it stays disabled.
# RUBYFORGE_USER = ENV["RUBYFORGE_USER"] || "iam162"
# Output directory for the rdoc html files.
# If you don't have a custom homepage, and want to use the RDoc
RDOC_FILES = FileList[]
BIN_FILES = FileList["bin/*.rb"]
GENERAL_RDOC_OPTS = ""
# Variable settings for extension support.
EXT_DIR = "ext"
HAVE_EXT = File.directory?(EXT_DIR)
EXTCONF_FILES = FileList["#{EXT_DIR}/**/extconf.rb"]
# Eventually add other files from EXT_DIR, like "MANIFEST"
TEST_FILES = FileList["test/**/tc_*.rb"]

DIST_FILES = FileList["lib/*/*.rb", "lib/rspider.rb", "sql/*.sql", "Changelog", "ToDo", "conf/local.conf"]
DIST_FILES.include("Rakefile")
# Don't package files which are autogenerated by RDocTask
# Include extension source files.
# Don't package temporary files, perhaps created by tests.
DIST_FILES.exclude("**/temp_*", "**/*.tmp")
# Don't get into recursion…
DIST_FILES.exclude(/^(\.\/)?pkg(\/|$)/)

REQUIRE_PATHS = ["lib"]
REQUIRE_PATHS << EXT_DIR if HAVE_EXT
$LOAD_PATH.concat(REQUIRE_PATHS)
# Load the library itself (lib/rspider.rb) so packaging fails early if it
# cannot be required.
require "#{UNIX_NAME}"
#PROJECT_VERSION = "#{PROJECT}::#{VERSION}" # e.g., "1.0.2"
PROJECT_VERSION = "0.8.4"

# Gem metadata assembled from the constants above.
GEM_SPEC = Gem::Specification.new do |s|
  s.name = UNIX_NAME
  s.version = PROJECT_VERSION
  s.summary = PROJECT_SUMMARY
  s.rubyforge_project = UNIX_NAME
  #s.homepage = "http://#{UNIX_NAME}.rubyforge.org/"
  s.homepage = "http://www.162cm.com/"
  s.author = MY_NAME
  s.email = MY_EMAIL
  s.files = DIST_FILES
  s.test_files = TEST_FILES
  s.executables = BIN_FILES.map { |fn| File.basename(fn) }
  s.has_rdoc = true
  s.extra_rdoc_files = RDOC_FILES
  s.rdoc_options = GENERAL_RDOC_OPTS.to_a.flatten
  if HAVE_EXT
    s.extensions = EXTCONF_FILES
    # Append the extension directory to the require paths. The original
    # used `>>`, which Array does not define, so packaging raised
    # NoMethodError as soon as an ext/ directory existed.
    s.require_paths << EXT_DIR
  end
end
# Now we can generate the package-related tasks.
Rake::GemPackageTask.new(GEM_SPEC) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end
|
data/ToDo
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Mon Sep 1 10:52:10 CST 2008:
|
2
|
+
0.7.9 Rdoc++
|
3
|
+
0.7.10 全面更换为支持Mysql的存储(替换掉HDB,提高通用性)
|
4
|
+
fixed --- 0.8.0 加上带Cookie抓取功能
|
5
|
+
We can crawl sites with cookies
|
6
|
+
|
7
|
+
0.8.1 Add Gem,Makefile and so on
|
8
|
+
0.8.2 Add Url relations storage
|
9
|
+
fixed ---设定最长Url限制
|
10
|
+
设定最大文档限制 避免下载rm,avi等大文件
|
11
|
+
#记录300 Redirect 的URL
|
12
|
+
HTMLTidy部分严重地泄露内存。考虑替换方案。
|
13
|
+
#将UrlGrabber中的功能植入到spider类中来
|
14
|
+
|
15
|
+
将仅记录url,content改为记录:
|
16
|
+
Keywords,Description,Charset...Summary
|
17
|
+
除HTML之外,增加对Doc,XML,PDF,text的解析
|
18
|
+
核查Cookie功能,并加入Load-cookie功能,将用文件记录Cookies
|
19
|
+
发现Blog feed的功能
|
data/bin/linkcheck.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
require "lib/rspider"
|
3
|
+
require "optparse"
|
4
|
+
|
5
|
+
#require "profile"
|
6
|
+
#debug $mem_profiler = MemoryProfiler.new
|
7
|
+
# Parse the command line; $OPT[:debug] toggles Ruby's $DEBUG flag and
# $OPT[:conf] names the spider configuration file.
$OPT = Rspider::OptParser.new(ARGV)
$DEBUG = $OPT[:debug] == "on" ? true : false
settings = Rspider::SpiderConfParser.new($OPT[:conf])

puts "Configuration file parsed!"

# Wire a spider to its logger, HTTP browser and the MySQL-backed storages.
crawler = Rspider::Spider.new(settings)
#spider.urlStorage= Rspider::UrlDispatcherClient.new("127.0.0.1",10001,conf["source"])
#spider.urlStorage=Rspider::UrlStorage.new
crawler.logger = Rspider::Logger.new(settings["logger"])
crawler.browser = HTTPal::Browser.new(settings["agent"], settings["max_document_length"])
#spider.contentStorage=Rspider::ContentStorage.new
#spider.contentStorage=Rspider::HDBContentStorage.new(conf["save_path"])
crawler.contentStorage = Rspider::MysqlContentStorage.new(settings, settings["source"])
crawler.urlStorage = Rspider::MysqlUrlStorage.new(settings, settings["source"])
crawler.relationStorage = Rspider::MysqlUrlRelationStorage.new(settings, settings["source"])

# Report every URL for which the spider fires its :failure callback.
crawler.on(:failure) { |url, _resp| puts "ERROR:#{url}" }
#$tracker = CallTracker.new
#$tracker.register(String, :new)

# Hand the crawl to a two-thread pool, then shut the pool down.
workers = Rspider::ThreadPool.new(2)
workers.dispatch do
  crawler.start_from settings["urls"]
end
puts "threads inited!"
workers.shutdown
|
data/bin/main.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
require "lib/rspider"
|
3
|
+
require "optparse"
|
4
|
+
require "lib/Gc"
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'bleak_house'
|
8
|
+
|
9
|
+
#require "profile"
|
10
|
+
# Global memory profiler whose report is printed before every crawl pass.
$mem = MemoryProfiler.new
# Parse the command line; $OPT[:debug] toggles Ruby's $DEBUG flag.
$OPT = Rspider::OptParser.new(ARGV)
$DEBUG = $OPT[:debug] == "on" ? true : false
puts "Configuration file parsed!"
# Set by the SIGINT handler; the driver loop exits once it is true.
interrupted = false
Signal.trap("SIGINT") do
  interrupted = true
end
|
20
|
+
# Performs one crawl pass.
#
# Builds a fresh spider from the configuration file named by $OPT[:conf],
# attaches the logger, browser and MySQL-backed storages, and calls
# spider.run(4). The crawl is seeded with the hard-coded localhost start
# URL only when +x+ is 2.
#
# x:: pass counter supplied by the driver loop (defaults to 0).
#
# The content and URL storages are closed in an +ensure+ clause so their
# handles are released even when the crawl raises; the original closed
# them only on the success path.
def run(x=0)
  conf = Rspider::SpiderConfParser.new($OPT[:conf])
  spider = Rspider::Spider.new(conf)
  spider.logger = Rspider::Logger.new(conf["logger"])
  spider.browser = HTTPal::Browser.new(conf["agent"], conf["max_document_length"])
  spider.contentStorage = Rspider::MysqlContentStorage.new(conf, conf["source"])
  spider.urlStorage = Rspider::MysqlUrlStorage.new(conf, conf["source"])
  spider.relationStorage = Rspider::MysqlUrlRelationStorage.new(conf, conf["source"])
  begin
    spider.start_from("http://localhost/search_doc/") if x == 2
    spider.run(4)
  ensure
    # Nested so that a failure while closing the content storage still
    # lets the URL storage close.
    begin
      spider.contentStorage.close
    ensure
      spider.urlStorage.close
    end
  end
end
|
33
|
+
# Driver loop: the counter is bumped before each pass, so run is called
# with 2, 3, 4 and 5. The process exits after the pass where the counter
# exceeds 4, or before a pass once SIGINT has been received.
pass = 1
while true
  pass += 1
  puts "-" * 30
  exit if interrupted
  $mem.report
  run(pass)
  exit if pass > 4
end
|
data/conf/local.conf
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#Conf for spider site :chin.bokee.com
|
2
|
+
#urls="http://localhost/search_doc/"
|
3
|
+
#urls="http://localhost/search_doc/soft/apache2.0/vhosts"
|
4
|
+
#urls="http://localhost/search_doc/soft/apache2.0/sitemap.html"
|
5
|
+
urls="http://localhost/search_doc/"
|
6
|
+
#can_leave_domain must be "yes|no"
|
7
|
+
can_leave_domain= "no"
|
8
|
+
max_depth=4
|
9
|
+
max_redirects=4
|
10
|
+
save_path="./testdata/local.hdb"
|
11
|
+
buckets=128
|
12
|
+
source="localdb2"
|
13
|
+
threads=10
|
14
|
+
same_domain_regexp="localhost"
|
15
|
+
logger="./testdata/local.log"
|
16
|
+
agent="Rspider/1.0 (build 20080824,+http://www.162cm.com/)"
|
17
|
+
url_max_length=130
|
18
|
+
max_document_length=204800
|
19
|
+
host="localhost"
|
20
|
+
user="root"
|
21
|
+
pass=""
|
22
|
+
db="sphider2"
|
23
|
+
timeout=3
|
data/lib/rspider.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
#
|
8
|
+
require "optparse"
|
9
|
+
# Absolute directory of this file, with a trailing slash.
rspider_lib_dir = File.expand_path(File.dirname(__FILE__)) + "/"

# Library files, listed in the order they are required.
rspider_lib_files = %w(
  rspider/ConfParser
  rspider/DataWasher
  rspider/cookie
  rspider/browser
  rspider/HtmlTidy
  rspider/Logger
  rspider/OptParser
  rspider/UrlDispatcher
  rspider/Spider
  rspider/SiteLocker
  rspider/ThreadPool
  rspider/RobotRules
  rspider/UrlStorage
  rspider/UrlScorer
  rspider/ContentStorage
  rspider/MysqlUrlStorage
  rspider/mysql
  rspider/MysqlUrlRelationStorage
).map { |relative| rspider_lib_dir + relative }

# Require each file by absolute path so loading does not depend on the
# current working directory or $LOAD_PATH.
rspider_lib_files.each { |path| require path }
|
@@ -0,0 +1,149 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# $Id: parseconfig.rb 37 2008-02-29 07:27:33Z wdierkes $
|
3
|
+
#
|
4
|
+
# Author:: BJ Dierkes <wdierkes@5dollarwhitebox.org>
|
5
|
+
# Copyright:: Copyright (c) 2006,2007 5dollarwhitebox.org
|
6
|
+
# License:: GPL
|
7
|
+
# URL:: http://www.5dollarwhitebox.org
|
8
|
+
#
|
9
|
+
|
10
|
+
# This class was written to simplify the parsing of configuration
|
11
|
+
# files in the format of "param = value". Please review the
|
12
|
+
# demo files included with this package.
|
13
|
+
#
|
14
|
+
# For further information please refer to the './doc' directory
|
15
|
+
# as well as the ChangeLog and README files included.
|
16
|
+
#
|
17
|
+
module Rspider
  # Error type for configuration parsing failures.
  #
  # Carries a caller-supplied +errno+ code and an +error+ message. It
  # derives from StandardError (the original derived from Exception) so
  # that a plain +rescue+ can handle it.
  class ConfParseError < StandardError
    attr_reader :errno, :error

    # errno:: caller-defined error code
    # error:: human-readable message, also used as the exception message
    def initialize(errno, error)
      @errno = errno
      @error = error
      super(error)
    end

    # Returns the error message as a String.
    def to_s
      @error.to_s
    end
  end

  # Hash-based parser for "param = value" configuration files.
  #
  # Every parameter maps to an Array of the values found for it, in file
  # order, so a parameter may be repeated. Lines beginning with '#' and
  # lines without '=' are ignored. One pair of surrounding quotes
  # (single or double) is stripped from each value.
  class ConfParser < Hash

    Version = '0.4.2'

    # Path of the configuration file this parser was built from.
    attr_accessor :config_file

    # Reads and parses +config_file+.
    #
    # Raises Errno::EACCES when the file is not readable.
    def initialize(config_file)
      super()
      @config_file = config_file
      raise Errno::EACCES, "#{config_file} is not readable" unless File.readable?(config_file)
      # File.foreach closes the file when iteration ends; the original
      # `open(path).each` left the handle open.
      File.foreach(config_file) { |line| parse_line(line) }
    end

    # Returns the value stored for +param+ (an Array of Strings straight
    # after parsing), or nil when the parameter is unknown.
    def get_value(param)
      self[param]
    end

    # Replaces the value stored for +param+ with +value+. Only the
    # in-memory hash changes; the file is not rewritten.
    def override_value(param, value)
      self[param] = value
    end

    # Sets the value of +param+ to nil (in the app only, not in the file).
    def nil_value(param)
      self[param] = nil
    end

    # Returns one "key:=> value" line per parameter as a single String.
    # The original printed these lines and returned the Hash itself, which
    # is not a valid to_s result.
    def to_s
      map { |k, v| "#{k}:=> #{v}\n" }.join
    end

    private

    # Parses one line of the file and appends its value to the list kept
    # for the parameter. Comment lines and lines without '=' are skipped.
    def parse_line(line)
      return if line =~ /^\#/
      return unless line =~ /\s*=\s*/
      param, value = line.split(/\s*=\s*/, 2)
      name = param.strip
      value = value.strip
      # Strip one pair of surrounding quotes when present.
      value = value[/^['"](.*)['"]$/, 1] || value
      (self[name] ||= []) << value
    end
  end

  # ConfParser specialised for the spider's settings.
  #
  # After parsing, every known setting is collapsed from its Array of
  # values to a single scalar (the last value in the file wins):
  # "can_leave_domain" becomes true/false, the settings in
  # INTEGER_SETTINGS become Integers, and those in STRING_SETTINGS stay
  # Strings. Parameters not listed here are left as Arrays.
  class SpiderConfParser < ConfParser
    # Settings converted with to_i.
    INTEGER_SETTINGS = %w(max_depth max_redirects buckets threads
                          url_max_length max_document_length timeout).freeze
    # Settings kept as Strings (general options, then MySQL connection).
    STRING_SETTINGS = %w(save_path source same_domain_regexp agent urls logger
                         host db user pass).freeze

    # Parses +config_file+ and normalises the spider settings.
    #
    # Raises RuntimeError naming the first required setting that is
    # missing. The original raised the same error class with a generic
    # message, followed by an unreachable +exit+.
    def initialize(config_file)
      super(config_file)
      self["can_leave_domain"] = required_setting("can_leave_domain").upcase == "YES"
      INTEGER_SETTINGS.each { |key| self[key] = required_setting(key).to_i }
      STRING_SETTINGS.each { |key| self[key] = required_setting(key) }
    end

    private

    # Returns the last value parsed for +key+, or raises when the setting
    # is absent from the file.
    def required_setting(key)
      values = self[key]
      unless values.is_a?(Array) && !values.empty?
        raise "Some thing error while conf pop: required setting \"#{key}\" is missing from #{config_file}"
      end
      values.last
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Author:: aragorn(xurenlu@gmail.com)
|
3
|
+
URL:: http://www.162cm.com/
|
4
|
+
Version:: 1.0.0
|
5
|
+
License:: LGPL
|
6
|
+
=end
|
7
|
+
module Rspider
  # When you need tokyocabinet (for HDBContentStorage), enable this require.
  # require 'tokyocabinet'
  require "digest/md5"

  # Stores document contents in a Hash (memory), keyed by URL.
  # Fast, but the whole crawl lives in memory.
  class ContentStorage < Hash
    def initialize()
      super()
    end

    # Stores +content+ under +url+.
    def add(url, content)
      self[url] = content
    end

    # Returns the list of stored URLs.
    def urls
      keys
    end

    # Nothing to release for the in-memory store; present so all storages
    # share the same interface.
    def close
    end

    # Returns the content stored for +url+, or nil when unknown.
    def get(url)
      self[url]
    end
  end

  # Stores document contents in a Tokyo Cabinet hash database file.
  class HDBContentStorage
    # path:: file path of the HDB file, opened for writing and created
    #        when missing. An open failure is reported on STDERR and does
    #        not raise.
    def initialize(path)
      @hdb = TokyoCabinet::HDB::new
      if(!@hdb.open(path, TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT))
        ecode = @hdb.ecode
        STDERR.printf("open error: %s\n", @hdb.errmsg(ecode))
      end
    end

    # Stores +content+ under +url+.
    def add(url, content)
      @hdb.put(url, content)
    end

    # Closes the database.
    def close
      @hdb.close
    end

    # Returns all stored URLs by walking the database iterator.
    def urls
      @hdb.iterinit
      keys = []
      while (key = @hdb.iternext)
        keys << key
      end
      keys
    end

    # Returns the content stored for +url+.
    def get(url)
      @hdb.get(url)
    end
  end

  # Raised when the MySQL connection object could not be created.
  # Derives from StandardError (the original derived from Exception) so a
  # plain +rescue+ can handle it.
  class MysqlException < StandardError
    def to_s
      "Can't connect to mysql "
    end
  end

  # Stores document contents in the `htmls` table of a MySQL database.
  # Rows are addressed by `ukey`: the MD5 hex digest of the URL followed
  # by the source (task) name.
  class MysqlContentStorage
    # Returns the MD5 hex digest of +string+.
    def md5(string)
      Digest::MD5.hexdigest(string)
    end

    # hash::   must respond to [] for "host", "user", "pass" and "db"
    # source:: task name used to partition rows
    #
    # Raises MysqlException when no connection object is returned.
    def initialize(hash, source="default")
      @my = Mysql::new(hash["host"], hash["user"], hash["pass"], hash["db"])
      raise MysqlException if @my.nil?
      @source = source
    end

    # Inserts +content+ for +url+.
    # Returns true on success and nil when MySQL rejects the statement
    # (best effort, e.g. for duplicate keys).
    def add(url, content)
      sql = "INSERT INTO `htmls` (`source`,`url`,`url_crc32`,`html`,`html_crc32`,`created`,`ukey`) " \
            "VALUES ('#{@my.quote(@source)}','#{@my.quote(url)}','0','#{@my.quote(content)}','0'," \
            "'#{Time.now.to_i}','#{@my.quote(storage_key(url))}')"
      @my.query(sql)
      true
    rescue Mysql::Error
      nil
    end

    # Returns the stored html for +url+, or nil when the URL is unknown or
    # the query fails. The original fell out of `rs.each` and returned the
    # result-set object itself when no row matched.
    def get(url)
      sql = "select html from htmls where ukey='#{@my.quote(storage_key(url))}'"
      rs = @my.query(sql)
      rs.each { |row| return row[0] }
      nil
    rescue Mysql::Error
      nil
    end

    # Returns an Array of the URLs stored for this source; an empty Array
    # when the query fails.
    def urls()
      sql = "select url from htmls where source='#{@my.quote(@source)}'"
      rs = @my.query(sql)
      keys = []
      rs.each { |row| keys.push(row[0]) }
      keys
    rescue Mysql::Error
      []
    end

    # Closes the database connection.
    def close()
      @my.close
    end

    private

    # Value of the `ukey` column for +url+: md5(url) followed by the source.
    def storage_key(url)
      md5(url) + @source
    end
  end
end
|