spider2 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +15 -0
- data/Rakefile +23 -0
- data/init.rb +3 -0
- data/install.rb +2 -0
- data/lib/generators/spider/spider_generator.rb +42 -0
- data/lib/generators/spider/templates/base_page.rb +6 -0
- data/lib/generators/spider/templates/base_page_spec.rb +13 -0
- data/lib/generators/spider/templates/index_page.rb +6 -0
- data/lib/generators/spider/templates/index_page_spec.rb +14 -0
- data/lib/generators/spider/templates/index_page_test.rb +10 -0
- data/lib/generators/spider/templates/list_page.rb +6 -0
- data/lib/generators/spider/templates/list_page_spec.rb +22 -0
- data/lib/generators/spider/templates/list_page_test.rb +10 -0
- data/lib/generators/spider/templates/show_page.rb +14 -0
- data/lib/generators/spider/templates/show_page_spec.rb +19 -0
- data/lib/generators/spider/templates/show_page_test.rb +10 -0
- data/lib/generators/spider/templates/site.rb +7 -0
- data/lib/generators/spider/templates/site_spec.rb +13 -0
- data/lib/generators/spider/templates/test.rb +10 -0
- data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
- data/lib/generators/spider_migration/templates/migration.rb +42 -0
- data/lib/spider/active_record_methods.rb +60 -0
- data/lib/spider/http.rb +43 -0
- data/lib/spider/page/filter.rb +132 -0
- data/lib/spider/page/label.rb +28 -0
- data/lib/spider/page/pagination.rb +142 -0
- data/lib/spider/page/proxy.rb +149 -0
- data/lib/spider/page/publish.rb +78 -0
- data/lib/spider/page/validation.rb +136 -0
- data/lib/spider/page.rb +759 -0
- data/lib/spider/site.rb +225 -0
- data/lib/spider/spider_page.rb +18 -0
- data/lib/spider/spider_page_label.rb +5 -0
- data/lib/spider/version.rb +3 -0
- data/lib/spider.rb +81 -0
- data/lib/tasks/spider_tasks.rake +86 -0
- data/test/spider_fu_test.rb +9 -0
- data/test/test_helper.rb +4 -0
- data/uninstall.rb +2 -0
- metadata +151 -0
data/lib/spider/page/pagination.rb
@@ -0,0 +1,142 @@
# encoding: utf-8
# Pagination handling.
# Pagination usually comes in a few forms.
# 1)
#   prev 1 2 3 4 5 6 next
#   which can be handled with
#   paginate :scope=>"#paginate-div"
# Another form only states the total number of pages,
# e.g. "88 pages".

module Spider::Page::Pagination
  module ClassMethods
    # A simple implementation that covers most pagination cases.
    # paginate :body,:links=>".paginate",:to=>[:body] do |page|
    # end
    # options:
    #   scope: limits where pagination links are collected.
    #     For example, given <div class="pagination"><a href="?1">1</a></div>
    #     you can set scope to div.pagination and every link inside
    #     div.pagination is treated as a pagination link.
    #   include_self: boolean
    #     whether the current page is included; defaults to false.
    #   sort: boolean
    #     whether the links are sorted; defaults to false.
    def paginate(to=:body,options={},&block)
      options.assert_valid_keys :scope,:include_self,:sort
      options.reverse_merge! :include_self=>false,:sort=>false,:scope=>".pagination"

      partial_name = "#{to}_partial"
      paginated_pages_name = "#{to}_paginated_pages"

      # define_method partial_name,&block # extract this page's part of the body

      define_method paginated_pages_name do
        pages = self.pages(:scope=>options[:scope])

        if options[:include_self]
          pages.unshift(self)
        end

        if options[:sort]
          pages = pages.sort_by{|page| page.url }
        end
        pages
      end

      # define the method that assembles the full text
      define_method to do
        send(paginated_pages_name).collect{|page|
          page.send(partial_name)
        } * Spider::Page.paginate_symbol
      end
    end

    # Generate pagination urls.
    # generate_paginated_urls 10,"http://www.google.com/article_%d.html",:first=>"http://www.google.com/article.html"
    def generate_paginated_urls(page_count,url,options={})
      options.assert_valid_keys :first,:until
      page_count = 1000 if options[:until] # 1000 pages should be more than enough
      urls = []
      page_count.times do |i|
        if options[:first] && i.zero?
          urls << options[:first]
        else
          urls << url % i
        end
      end
      urls
    end

  end

  module InstanceMethods
    # http://www.google.com/article_1.html
    # page.generate_paginated_urls(10,/article(_\d+?)\.html/)
    # Urls usually follow one of these patterns:
    # 1):
    #   xxxxx.html
    #   xxxxx_1.html
    #   xxxxx_2.html
    # 2):
    #   xxx.asp
    #   xxx.asp?page=2
    #   xxx.asp?page=3
    # i.e. xxxxx(_%d).html
    #      xxx.asp(?page=%d)
    # For example http://www.powerapple.com/articles/11111.html
    #             http://www.powerapple.com/articles/11111_1.html
    # If the article is known to have 5 pages, the urls are:
    #   http://www.powerapple.com/articles/11111.html
    #   http://www.powerapple.com/articles/11111_2.html
    #   http://www.powerapple.com/articles/11111_3.html
    #   http://www.powerapple.com/articles/11111_4.html
    #   http://www.powerapple.com/articles/11111_5.html
    # generate_paginated_urls 5,
    #   :url=>"http://www.powerapple.com/articles/1111_%d.html",
    #   :start=>1,
    #   :unshift=>"http://www.powerapple.com/articles/1111.html"
    def generate_paginated_urls(url,options={})
      options.assert_valid_keys :start,:prepend,:append,:until_failure,:count
      number = options[:count].to_i
      number = 1000 if options[:until_failure] # keep fetching pages until one fails
      new_url = url
      start = options[:start]
      urls = []
      number.times do |index|
        index += 1
        next if start && index < start
        page_url = (url % index)
        if options[:until_failure]
          response = Spider::Http.head(page_url)
          unless response.code == 200
            break # stop as soon as a page returns 404
          end
        end
        urls << page_url
      end
      if options[:prepend]
        [options[:prepend]].flatten.reverse.each do |url|
          urls.unshift(url)
        end
      end
      if options[:append]
        [options[:append]].flatten.reverse.each do |url|
          urls.push(url)
        end
      end
      urls
    end

    def generate_paginated_pages(*args)
      urls = generate_paginated_urls(*args)
      urls.collect{|url| go(url) }
    end
  end

  def self.included(base)
    base.send(:include,InstanceMethods)
    base.send(:extend,ClassMethods)
  end

end
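For orientation, a minimal usage sketch of the pagination helpers above. The ArticlePage class and the CSS selector are hypothetical and not part of the gem; only paginate and generate_paginated_urls come from the file shown.

# Hypothetical page class; assumes it picks up Spider::Page::Pagination via Spider::Page.
class ArticlePage < Spider::Page
  # collect every link inside div.pagination, include the current page,
  # sort the pages by url and join their bodies into :body
  paginate :body, :scope => "div.pagination", :include_self => true, :sort => true
end

# Class-level helper: build the urls for an article known to have 5 pages.
ArticlePage.generate_paginated_urls(
  5, "http://www.powerapple.com/articles/11111_%d.html",
  :first => "http://www.powerapple.com/articles/11111.html"
)
# => ["http://www.powerapple.com/articles/11111.html",
#     "http://www.powerapple.com/articles/11111_1.html", ...,
#     "http://www.powerapple.com/articles/11111_4.html"]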
data/lib/spider/page/proxy.rb
@@ -0,0 +1,149 @@
# encoding: utf-8
module Spider::Page::Proxy
  def self.included(base)
    base.send(:include,InstanceMethods)
    base.send(:extend,ClassMethods)
    base.class_eval do
      class_attribute :proxies
      class_attribute :disabled_proxies
      self.proxies = []
      self.disabled_proxies = []

      before_fetch do |page|
        proxies.compact!
        proxies.uniq!
        host,port = proxies.shuffle.first
        port ||= 80
        if host
          logger.debug "set proxy: #{host}:#{port}"
          Spider::Http.http_proxy host,port
        else
          Spider::Http.clear_proxy
        end
      end

      after_fetch do |page|
        logger.debug "reset proxy"
        # Spider::Http.http_proxy old_host,old_port
        if page.content.blank?
          # retry, and set proxy to disabled
          # proxies
          puts "proxies before:#{self.proxies.inspect}"
          disabled_proxy = proxies.find{|proxy| proxy.first == Spider::Http.default_options[:http_proxyaddr] }
          proxies.delete disabled_proxy
          self.disabled_proxies += [disabled_proxy]
          puts "proxies after:#{self.proxies.inspect}"
          unless proxies.empty?
            puts 'retry'
            page.request
            next
          else
            puts 'finished retry.'
            # no proxies available
            # recover the disabled proxies so they can be used next time
            # (avoid excluding a proxy forever because of a single failure)
            self.proxies += self.disabled_proxies
            self.disabled_proxies = []
            # fetch without a proxy server
          end
        end
        Spider::Http.clear_proxy
      end

    end
  end

  module ClassMethods

    def disable_proxy
      proxy(nil,nil)
    end

    def validate_proxies
      valid_proxies = proxies.find_all do |proxy|
        valid_proxy?(*proxy)
      end
      invalid_proxies = proxies - valid_proxies
      {:valid => valid_proxies,:invalid => invalid_proxies}
    end

    # Use a file as the proxy source, one ip:port per line.
    def proxy_file(file)
      config_root = File.join(Rails.root,"config","spiders")
      if file =~ /^\//
        # absolute path
        content = File.read file
      else
        content = File.read(File.join(config_root,file))
      end
      proxies = []
      content.each_line do |line|
        line = line.strip
        if line =~ /^\s*#/
          # comment line
        else
          if line =~ /\d+?\.\d+?\.\d+?\.\d+?/
            ip,port = line.split(":")
            port ||= 80
            proxies += [[ip,port]]
          end
        end
      end
      self.proxy do |the_proxies|
        proxies.each do |p|
          the_proxies += [p]
        end
      end
    end

    def valid_proxy?(ip,port = 80,options = {})
      options[:url] ||= "http://www.google.com"
      options[:code] ||= 200
      options[:timeout] ||= 10
      # options[:match] ||= //
      Spider::Http.with_proxy ip,port do
        begin
          timeout options[:timeout] do
            response = Spider::Http.get options[:url]
            r = response.code == options[:code]
            if options[:match]
              r && (response.to_s =~ options[:match])
            else
              r
            end
          end
        rescue Exception => e
          false
        end
      end
    end

    # Set the proxies directly.
    def proxies=(arr)
      proxy do |ps|
        arr.each do |a|
          ps += [a]
        end
      end
    end

    def proxy(host = nil,port = 80,&block)
      self.proxies += [[host,port]] if host

      if block_given?
        yield self.proxies
      end

    end

  end

  module InstanceMethods

  end

end
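A rough usage sketch of the proxy rotation above. The ArticlePage class and the proxies.txt file name are hypothetical; proxy, proxy_file and validate_proxies are the class methods defined in this file.

# Hypothetical page class wiring up proxy rotation.
class ArticlePage < Spider::Page
  proxy "127.0.0.1", 8118    # register a single proxy
  proxy_file "proxies.txt"   # or load ip:port pairs from config/spiders/proxies.txt (assumed file)
end

# Check which registered proxies can actually fetch the probe url:
ArticlePage.validate_proxies
# => {:valid => [["127.0.0.1", 8118], ...], :invalid => [...]}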
data/lib/spider/page/publish.rb
@@ -0,0 +1,78 @@
# encoding: utf-8
module Spider::Page::Publish

  extend ActiveSupport::Concern

  included do

    define_model_callbacks :publish

    cattr_accessor :publishers
    self.publishers = []
    after_crawl :publish

  end

  module ClassMethods

    # publish_to Article
    # registers Article as a publisher; the registered publishers
    # are notified automatically when #crawl runs
    def publish_to(*publishers)
      logger.debug "[#{self}] set publisher: #{publishers}"
      self.publishers += publishers
    end

  end

  module InstanceMethods

    def publish_to(*publishers)
      run_callbacks :publish do
        logger.debug "publish to #{publishers}"
        results = []
        [publishers].flatten.each do |publisher|
          logger.info "send self to #{publisher}"
          logger.debug "class:#{publisher.class.name}"
          publisher = case publisher
          when String,Symbol
            publisher.to_s.classify.constantize
          else
            # puts "default: #{publisher}"
            publisher
          end
          logger.debug "publisher: #{publisher}"
          result = nil
          begin

            if publisher.respond_to?(:receive_spider_page)
              logger.debug "#{publisher} receive spider page #{self}"
              result = publisher.receive_spider_page self
              logger.debug "#{publisher} return #{result}"
            else
              logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
            end
          rescue Exception=>e
            logger.error e.message
            logger.error e.backtrace.join("\n")
          end
          results << result
        end
        results
      end
    end

    def publish
      publishers = self.publishers.uniq
      if [:title,:body].all?{|name| attribute_names.include?(name) }
        logger.debug "[#{self} publish to #{publishers}"
        publish_to(publishers)
      else
        logger.debug "attribute names not include :title, :body,so publish canceled."
      end
    end

  end

end
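A minimal sketch of how a publisher plugs into the module above. The Article model and its attributes are hypothetical; the only contract the code requires is a receive_spider_page class method, and a page only auto-publishes when it defines :title and :body attributes.

# Hypothetical publisher: anything responding to receive_spider_page works.
class Article < ActiveRecord::Base
  def self.receive_spider_page(page)
    # assumes the page exposes title/body readers for its attributes
    create!(:title => page.title, :body => page.body)
  end
end

# Hypothetical page class: after_crawl :publish sends it to every registered publisher.
class ArticlePage < Spider::Page
  publish_to Article   # a class, or a String/Symbol that classifies to one
end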
data/lib/spider/page/validation.rb
@@ -0,0 +1,136 @@
# encoding: utf-8
module Spider::Page::Validation

  class ValidationError < Exception; end

  module InstanceMethods

    def valid_url?
      self.class.valid_url?(url)
    end

  end

  module ClassMethods

    # Return the Spider::Page class that can handle the url.
    def find_by_url(url)
      Spider::Site.find_pages.find do |page|
        page.valid_url?(url)
      end
    end

    # Return an array of all page classes that match the url.
    def find_all_by_url(url)
      pages = Spider::Site.find_pages.find_all do |page|
        page.valid_url?(url)
      end
      if pages.empty?
        # fall back to every page of the sites whose domain matches
        sites = Spider::Site.all.find_all do |site|
          site.valid_domain?(url)
        end
        pages = sites.collect{|site| site.pages }.flatten
      end
      pages
    end

    # Build page instances directly.
    def create_all_by_url(url)
      find_all_by_url(url).collect{|klass| klass.new url }
    end

    # Build a page instance directly.
    def create_by_url(url)
      find_by_url(url).try(:new,url)
    end

    # Check whether this class can handle the url.
    def valid_url?(url)
      valid = false
      self.validate_url_procs.each do |p|

        if p.call(url)
          valid = true
          break
        end

      end
      valid
    end

    # Examples:
    # validate_url "china.com",:match=>/suzhou/,:unmatch=>/beijing/
    # validate_url "google.com",:match=>/baidu/,:unmatch=>/yahoo/
    def validate_url(options={},&block)
      options.assert_valid_keys :match,:unmatch,:example,:domain
      p = lambda do |url|

        begin
          uri = URI(url)

          domain = [options[:domain]].flatten.compact.uniq
          unless domain.empty?
            unless domain.any?{|d| uri.host.end_with? d }
              raise ValidationError.new(" domain: #{uri.host} not in #{domain.inspect} ")
            end
          end

          match = [options[:match]].flatten.compact.uniq
          unmatch = [options[:unmatch]].flatten.compact.uniq

          unless match.all?{ |regexp|
            next regexp.call(url) if regexp.is_a? Proc
            regexp = Regexp.escape regexp if regexp.is_a? String
            regexp =~ url
          }
            raise ValidationError.new("#{url} not match #{match.inspect}")
          end

          if unmatch.any?{|regexp|
            regexp = Regexp.escape regexp if regexp.is_a? String
            regexp =~ url
          }
            raise ValidationError.new("#{url} match #{unmatch.inspect}")
          end
        rescue ValidationError=>e
          logger.debug e.message
          next false
        rescue Exception=>e
          logger.error e.message
          logger.error e.backtrace.join("\n")
          next false
        end

        true
      end

      if options[:example]
        set_example_url options[:example]
        # self-check against the example urls
        [options[:example]].flatten.each do |url|
          unless p.call(url)
            raise ValidationError.new("#{url} is not a valid url for me.")
          end
        end
      end

      self.validate_url_procs += [p]
    end

  end

  def self.included(base)
    base.class_eval do
      class_attribute :validate_url_procs
      self.validate_url_procs = []

      include InstanceMethods
      extend ClassMethods

    end
  end

end
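To illustrate the validation DSL above, a small sketch. The ArticlePage class and the regular expressions are hypothetical; find_by_url would only return the class once it is registered through a Spider::Site.

# Hypothetical page class declaring which urls it accepts.
class ArticlePage < Spider::Page
  validate_url :domain  => "powerapple.com",
               :match   => /articles\/\d+\.html/,
               :unmatch => /\?page=/,
               :example => "http://www.powerapple.com/articles/11111.html" # self-checked when declared
end

ArticlePage.valid_url?("http://www.powerapple.com/articles/11111.html") # => true
ArticlePage.valid_url?("http://www.powerapple.com/forum/1.html")        # => false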