spider2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. data/MIT-LICENSE +20 -0
  2. data/README +15 -0
  3. data/Rakefile +23 -0
  4. data/init.rb +3 -0
  5. data/install.rb +2 -0
  6. data/lib/generators/spider/spider_generator.rb +42 -0
  7. data/lib/generators/spider/templates/base_page.rb +6 -0
  8. data/lib/generators/spider/templates/base_page_spec.rb +13 -0
  9. data/lib/generators/spider/templates/index_page.rb +6 -0
  10. data/lib/generators/spider/templates/index_page_spec.rb +14 -0
  11. data/lib/generators/spider/templates/index_page_test.rb +10 -0
  12. data/lib/generators/spider/templates/list_page.rb +6 -0
  13. data/lib/generators/spider/templates/list_page_spec.rb +22 -0
  14. data/lib/generators/spider/templates/list_page_test.rb +10 -0
  15. data/lib/generators/spider/templates/show_page.rb +14 -0
  16. data/lib/generators/spider/templates/show_page_spec.rb +19 -0
  17. data/lib/generators/spider/templates/show_page_test.rb +10 -0
  18. data/lib/generators/spider/templates/site.rb +7 -0
  19. data/lib/generators/spider/templates/site_spec.rb +13 -0
  20. data/lib/generators/spider/templates/test.rb +10 -0
  21. data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
  22. data/lib/generators/spider_migration/templates/migration.rb +42 -0
  23. data/lib/spider/active_record_methods.rb +60 -0
  24. data/lib/spider/http.rb +43 -0
  25. data/lib/spider/page/filter.rb +132 -0
  26. data/lib/spider/page/label.rb +28 -0
  27. data/lib/spider/page/pagination.rb +142 -0
  28. data/lib/spider/page/proxy.rb +149 -0
  29. data/lib/spider/page/publish.rb +78 -0
  30. data/lib/spider/page/validation.rb +136 -0
  31. data/lib/spider/page.rb +759 -0
  32. data/lib/spider/site.rb +225 -0
  33. data/lib/spider/spider_page.rb +18 -0
  34. data/lib/spider/spider_page_label.rb +5 -0
  35. data/lib/spider/version.rb +3 -0
  36. data/lib/spider.rb +81 -0
  37. data/lib/tasks/spider_tasks.rake +86 -0
  38. data/test/spider_fu_test.rb +9 -0
  39. data/test/test_helper.rb +4 -0
  40. data/uninstall.rb +2 -0
  41. metadata +151 -0
data/lib/spider/page/pagination.rb
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+ # Pagination handling.
+ # Pagination generally comes in a few forms:
+ # 1)
+ #   prev 1 2 3 4 5 6 next
+ #   which can be handled with
+ #   paginate :scope=>"#paginate-div"
+ # 2) only a total page count is given,
+ #   e.g. "88 pages"
+
+ module Spider::Page::Pagination
+   module ClassMethods
+     # A simple implementation that covers most pagination cases.
+     #   paginate :body,:scope=>".paginate" do |page|
+     #   end
+     # options:
+     #   scope: restricts where pagination links are looked up.
+     #     e.g. given <div class="pagination"><a href="?1">1</a></div>
+     #     set scope to div.pagination
+     #     and every link under div.pagination is treated as a pagination link.
+     #   include_self: boolean
+     #     whether the current page itself is included; defaults to false.
+     #   sort: boolean
+     #     whether the links are sorted; defaults to false.
+     def paginate(to=:body,options={},&block)
+       options.assert_valid_keys :scope,:include_self,:sort
+       options.reverse_merge! :include_self=>false,:sort=>false,:scope=>".pagination"
+
+       partial_name = "#{to}_partial"
+       paginated_pages_name = "#{to}_paginated_pages"
+
+       # define_method partial_name,&block # extracts the partial body of the page
+
+       define_method paginated_pages_name do
+         pages = self.pages(:scope=>options[:scope])
+
+         if options[:include_self]
+           pages.unshift(self)
+         end
+
+         if options[:sort]
+           pages = pages.sort_by{|page| page.url }
+         end
+         pages
+       end
+
+       # define the method that returns the full, concatenated content
+       define_method to do
+         send(paginated_pages_name).collect{|page|
+           page.send(partial_name)
+         } * Spider::Page.paginate_symbol
+       end
+     end
+
+     # Generate pagination URLs.
+     #   generate_paginated_urls 10,"http://www.google.com/article_%d.html",:first=>"http://www.google.com/article.html"
+     def generate_paginated_urls(page_count,url,options={})
+       options.assert_valid_keys :first,:until
+       page_count = 1000 if options[:until] # 1000 pages should be more than enough
+       urls = []
+       page_count.times do |i|
+         if options[:first] && i.zero?
+           urls << options[:first]
+         else
+           urls << url % i
+         end
+       end
+       urls
+     end
+
+   end
+
+   module InstanceMethods
+     # http://www.google.com/article_1.html
+     #   page.generate_paginated_urls(10,/article(_\d+?)\.html/)
+     # Common URL patterns:
+     # 1):
+     #   xxxxx.html
+     #   xxxxx_1.html
+     #   xxxxx_2.html
+     # 2):
+     #   xxx.asp
+     #   xxx.asp?page=2
+     #   xxx.asp?page=3
+     # i.e. xxxxx(_%d).html
+     #      xxx.asp(?page=%d)
+     # Example: http://www.powerapple.com/articles/11111.html
+     #          http://www.powerapple.com/articles/11111_1.html
+     # If we already know there are 5 pages, the URLs are:
+     #   http://www.powerapple.com/articles/11111.html
+     #   http://www.powerapple.com/articles/11111_2.html
+     #   http://www.powerapple.com/articles/11111_3.html
+     #   http://www.powerapple.com/articles/11111_4.html
+     #   http://www.powerapple.com/articles/11111_5.html
+     # generate_paginated_urls "http://www.powerapple.com/articles/1111_%d.html",
+     #   :count=>5,
+     #   :start=>2,
+     #   :prepend=>"http://www.powerapple.com/articles/1111.html"
+     def generate_paginated_urls(url,options={})
+       options.assert_valid_keys :start,:prepend,:append,:until_failure,:count
+       number = options[:count].to_i
+       number = 1000 if options[:until_failure] # keep fetching pages until a request fails
+       new_url = url
+       start = options[:start]
+       urls = []
+       number.times do |index|
+         index += 1
+         next if start && index < start
+         page_url = (url % index)
+         if options[:until_failure]
+           response = Spider::Http.head(page_url)
+           unless response.code == 200
+             break # stop as soon as we hit a 404 (non-200 response)
+           end
+         end
+         urls << page_url
+       end
+       if options[:prepend]
+         [options[:prepend]].flatten.reverse.each do |prepended_url|
+           urls.unshift(prepended_url)
+         end
+       end
+       if options[:append]
+         [options[:append]].flatten.reverse.each do |appended_url|
+           urls.push(appended_url)
+         end
+       end
+       urls
+     end
+
+     def generate_paginated_pages(*args)
+       urls = generate_paginated_urls(*args)
+       urls.collect{|url| go(url) }
+     end
+   end
+
+   def self.included(base)
+     base.send(:include,InstanceMethods)
+     base.send(:extend,ClassMethods)
+   end
+
+ end
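
For reference, here is a minimal usage sketch of the pagination API above. The ListPage class, the selector, and the URLs are illustrative assumptions; only paginate, generate_paginated_urls, and their options come from the code in this hunk.

    # Hypothetical page class; assumes Spider::Page::Pagination is mixed into Spider::Page.
    class ListPage < Spider::Page
      # Treat every link under div.pagination as a pagination link,
      # include the current page itself, and sort the collected pages by URL.
      paginate :body, :scope => "div.pagination", :include_self => true, :sort => true
    end

    page = ListPage.new("http://example.com/articles.html")   # assumed constructor
    page.generate_paginated_urls("http://example.com/articles_%d.html",
                                 :count   => 5,
                                 :start   => 2,
                                 :prepend => "http://example.com/articles.html")
    # => ["http://example.com/articles.html",
    #     "http://example.com/articles_2.html", ... "http://example.com/articles_5.html"]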
data/lib/spider/page/proxy.rb
@@ -0,0 +1,149 @@
+ # encoding: utf-8
+ module Spider::Page::Proxy
+   def self.included(base)
+     base.send(:include,InstanceMethods)
+     base.send(:extend,ClassMethods)
+     base.class_eval do
+       class_attribute :proxies
+       class_attribute :disabled_proxies
+       self.proxies = []
+       self.disabled_proxies = []
+
+       before_fetch do |page|
+         proxies.compact!
+         proxies.uniq!
+         host,port = proxies.shuffle.first
+         port ||= 80
+         if host
+           logger.debug "set proxy: #{host}:#{port}"
+           Spider::Http.http_proxy host,port
+         else
+           Spider::Http.clear_proxy
+         end
+       end
+
+       after_fetch do |page|
+         logger.debug "reset proxy"
+         # Spider::Http.http_proxy old_host,old_port
+         if page.content.blank?
+           # retry, and move the proxy that was used
+           # to the disabled proxies list
+           puts "proxies before:#{self.proxies.inspect}"
+           disabled_proxy = proxies.find{|proxy| proxy.first == Spider::Http.default_options[:http_proxyaddr] }
+           proxies.delete disabled_proxy
+           self.disabled_proxies += [disabled_proxy]
+           puts "proxies after:#{self.proxies.inspect}"
+           unless proxies.empty?
+             puts 'retry'
+             page.request
+             next
+           else
+             puts 'finished retry.'
+             # no proxies available
+             # recover the proxies so they can still be used next time
+             # (avoids excluding a proxy forever because of a single failure)
+             self.proxies += self.disabled_proxies
+             self.disabled_proxies = []
+             # fall back to fetching directly without a proxy
+           end
+         end
+         Spider::Http.clear_proxy
+       end
+
+
+     end
+   end
+
+   module ClassMethods
+
+     def disable_proxy
+       proxy(nil,nil)
+     end
+
+     def validate_proxies
+       valid_proxies = proxies.find_all do |proxy|
+         valid_proxy?(*proxy)
+       end
+       invalid_proxies = proxies - valid_proxies
+       {:valid => valid_proxies,:invalid => invalid_proxies}
+     end
+
+     # Use a file as the proxy source,
+     # with one "ip:port" entry per line.
+     def proxy_file(file)
+       config_root = File.join(Rails.root,"config","spiders")
+       if file =~ /^\//
+         # absolute path
+         content = File.read file
+       else
+         content = File.read(File.join(config_root,file))
+       end
+       proxies = []
+       content.each_line do |line|
+         line = line.strip
+         if line =~ /^\s*#/
+           # comment line
+         else
+           if line =~ /\d+?\.\d+?\.\d+?\.\d+?/
+             ip,port = line.split(":")
+             port ||= 80
+             proxies += [[ip,port]]
+           end
+         end
+       end
+       self.proxy do |the_proxies|
+         proxies.each do |p|
+           the_proxies += [p]
+         end
+       end
+     end
+
+     def valid_proxy?(ip,port = 80,options = {})
+       options[:url] ||= "http://www.google.com"
+       options[:code] ||= 200
+       options[:timeout] ||= 10
+       # options[:match] ||= //
+       Spider::Http.with_proxy ip,port do
+         begin
+           timeout options[:timeout] do
+             response = Spider::Http.get options[:url]
+             r = response.code == options[:code]
+             if options[:match]
+               r && (response.to_s =~ options[:match])
+             else
+               r
+             end
+           end
+         rescue Exception => e
+           false
+         end
+       end
+     end
+
+     # Set the proxies directly.
+     def proxies=(arr)
+       proxy do |ps|
+         arr.each do |a|
+           ps += [a]
+         end
+       end
+     end
+
+     def proxy(host = nil,port = 80,&block)
+       self.proxies += [[host,port]] if host
+
+       if block_given?
+         yield self.proxies
+       end
+
+     end
+
+   end
+
+
+
+   module InstanceMethods
+
+   end
+
+ end
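
For reference, a minimal sketch of wiring up the proxy hooks above. The ShowPage class and the proxies.txt file name are assumptions; proxy, proxy_file, and validate_proxies come from this hunk.

    # Hypothetical page class; assumes Spider::Page::Proxy is mixed into Spider::Page.
    class ShowPage < Spider::Page
      proxy "127.0.0.1", 8087      # register a single proxy as [host, port]
      proxy_file "proxies.txt"     # load "ip:port" lines from config/spiders/proxies.txt
    end

    # Probe every registered proxy and split them into usable and unusable ones.
    report = ShowPage.validate_proxies
    puts "valid:   #{report[:valid].inspect}"
    puts "invalid: #{report[:invalid].inspect}"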
data/lib/spider/page/publish.rb
@@ -0,0 +1,78 @@
+ # encoding: utf-8
+ module Spider::Page::Publish
+
+   extend ActiveSupport::Concern
+
+   included do
+
+     define_model_callbacks :publish
+
+     cattr_accessor :publishers
+     self.publishers = []
+     after_crawl :publish
+
+   end
+
+   module ClassMethods
+
+     # publish_to Article
+     # Article will be registered as a publisher,
+     # and publish_to Article will be called during #crawl
+     def publish_to(*publishers)
+       logger.debug "[#{self}] set publisher: #{publishers}"
+       self.publishers += publishers
+     end
+
+   end
+
+   module InstanceMethods
+
+     def publish_to(*publishers)
+       run_callbacks :publish do
+         logger.debug "publish to #{publishers}"
+         results = []
+         [publishers].flatten.each do |publisher|
+           logger.info "send self to #{publisher}"
+           logger.debug "class:#{publisher.class.name}"
+           publisher = case publisher
+                       when String,Symbol
+                         publisher.to_s.classify.constantize
+                       else
+                         # puts "default: #{publisher}"
+                         publisher
+                       end
+           logger.debug "publisher: #{publisher}"
+           result = nil
+           begin
+
+             if publisher.respond_to?(:receive_spider_page)
+               logger.debug "#{publisher} receive spider page #{self}"
+               result = publisher.receive_spider_page self
+               logger.debug "#{publisher} return #{result}"
+             else
+               logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
+             end
+           rescue Exception=>e
+             logger.error e.message
+             logger.error e.backtrace.join("\n")
+           end
+           results << result
+         end
+         results
+       end
+     end
+
+     def publish
+       publishers = self.publishers.uniq
+       if [:title,:body].all?{|name| attribute_names.include?(name) }
+         logger.debug "[#{self}] publish to #{publishers}"
+         publish_to(publishers)
+       else
+         logger.debug "attribute names do not include :title and :body, so publish was canceled."
+       end
+     end
+
+
+   end
+
+ end
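
The publish hook above hands every crawled page to any publisher that responds to receive_spider_page. A minimal sketch of such a publisher follows; the Article ActiveRecord model and its title/body columns are assumptions made for illustration.

    # Hypothetical ActiveRecord model used as a publisher.
    class Article < ActiveRecord::Base
      # Invoked by Spider::Page::Publish for each crawled page.
      def self.receive_spider_page(page)
        create(:title => page.title, :body => page.body)
      end
    end

    # Hypothetical page class wiring the publisher in; Article.receive_spider_page
    # then runs automatically after #crawl via the after_crawl :publish callback.
    class ShowPage < Spider::Page
      publish_to Article
    end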
data/lib/spider/page/validation.rb
@@ -0,0 +1,136 @@
+ # encoding: utf-8
+ module Spider::Page::Validation
+
+   class ValidationError < Exception; end
+
+   module InstanceMethods
+
+     def valid_url?
+       self.class.valid_url?(url)
+     end
+
+   end
+
+   module ClassMethods
+
+     # Return the Spider::Page class that matches the url.
+     def find_by_url(url)
+       Spider::Site.find_pages.find do |page|
+         page.valid_url?(url)
+       end
+     end
+
+     # Return an array of all pages that match the url.
+     def find_all_by_url(url)
+       pages = Spider::Site.find_pages.find_all do |page|
+         page.valid_url?(url)
+       end
+       if pages.empty?
+         # fall back to all pages under the url's domain
+         sites = Spider::Site.all.find_all do |site|
+           site.valid_domain?(url)
+         end
+         pages = sites.collect{|site| site.pages }.flatten
+       end
+       pages
+     end
+
+     # Create page instances directly.
+     def create_all_by_url(url)
+       find_all_by_url(url).collect{|klass| klass.new url }
+     end
+
+     # Create a page instance directly.
+     def create_by_url(url)
+       find_by_url(url).try(:new,url)
+     end
+
+     # Check whether this class can handle the url.
+     def valid_url?(url)
+       valid = false
+       self.validate_url_procs.each do |p|
+
+         if p.call(url)
+           valid = true
+           break
+         end
+
+       end
+       valid
+     end
+
+     # Examples:
+     #   validate_url :domain=>"china.com",:match=>/suzhou/,:unmatch=>/beijing/
+     #   validate_url :domain=>"google.com",:match=>/baidu/,:unmatch=>/yahoo/
+     def validate_url(options={},&block)
+       options.assert_valid_keys :match,:unmatch,:example,:domain
+       p = lambda do |url|
+
+         begin
+           uri = URI(url)
+
+           domain = [options[:domain]].flatten.compact.uniq
+           unless domain.empty?
+             unless domain.any?{|d| uri.host.end_with? d }
+               raise ValidationError.new(" domain: #{uri.host} not in #{domain.inspect} ")
+             end
+           end
+
+
+           match = [options[:match]].flatten.compact.uniq
+           unmatch = [options[:unmatch]].flatten.compact.uniq
+
+           unless match.all?{ |regexp|
+               next regexp.call(url) if regexp.is_a? Proc
+               regexp = Regexp.escape regexp if regexp.is_a? String
+               regexp =~ url
+             }
+             raise ValidationError.new("#{url} not match #{match.inspect}")
+           end
+
+           if unmatch.any?{|regexp|
+               regexp = Regexp.escape regexp if regexp.is_a? String
+               regexp =~ url
+             }
+             raise ValidationError.new("#{url} match #{unmatch.inspect}")
+           end
+         rescue ValidationError=>e
+           logger.debug e.message
+           next false
+         rescue Exception=>e
+           logger.error e.message
+           logger.error e.backtrace.join("\n")
+           next false
+         end
+
+         true
+       end
+
+       if options[:example]
+         set_example_url options[:example]
+         # run a self-check against the example urls
+         [options[:example]].flatten.each do |url|
+           unless p.call(url)
+             raise ValidationError.new("#{url} is not a valid url for me.")
+           end
+         end
+       end
+
+       self.validate_url_procs += [p]
+     end
+
+   end
+
+   def self.included(base)
+     base.class_eval do
+       class_attribute :validate_url_procs
+       self.validate_url_procs = []
+
+
+       include InstanceMethods
+       extend ClassMethods
+
+     end
+   end
+
+ end
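
A short sketch of declaring URL rules with validate_url and resolving a page class from a URL. The ArticlePage class and the example.com URLs are illustrative only, and resolving via Spider::Page.find_by_url assumes the page class is registered with its Spider::Site.

    # Hypothetical page class describing which URLs it can handle.
    class ArticlePage < Spider::Page
      validate_url :domain  => "example.com",
                   :match   => /\/articles\/\d+\.html/,
                   :unmatch => /preview/,
                   :example => "http://www.example.com/articles/123.html"
    end

    ArticlePage.valid_url?("http://www.example.com/articles/123.html")          # => true
    ArticlePage.valid_url?("http://www.example.com/articles/123.html?preview")  # => false

    # Resolve the matching page class, or an instance of it, from a bare URL.
    Spider::Page.find_by_url("http://www.example.com/articles/123.html")    # => ArticlePage
    Spider::Page.create_by_url("http://www.example.com/articles/123.html")  # => ArticlePage instance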