spider2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +15 -0
  3. data/Rakefile +23 -0
  4. data/init.rb +3 -0
  5. data/install.rb +2 -0
  6. data/lib/generators/spider/spider_generator.rb +42 -0
  7. data/lib/generators/spider/templates/base_page.rb +6 -0
  8. data/lib/generators/spider/templates/base_page_spec.rb +13 -0
  9. data/lib/generators/spider/templates/index_page.rb +6 -0
  10. data/lib/generators/spider/templates/index_page_spec.rb +14 -0
  11. data/lib/generators/spider/templates/index_page_test.rb +10 -0
  12. data/lib/generators/spider/templates/list_page.rb +6 -0
  13. data/lib/generators/spider/templates/list_page_spec.rb +22 -0
  14. data/lib/generators/spider/templates/list_page_test.rb +10 -0
  15. data/lib/generators/spider/templates/show_page.rb +14 -0
  16. data/lib/generators/spider/templates/show_page_spec.rb +19 -0
  17. data/lib/generators/spider/templates/show_page_test.rb +10 -0
  18. data/lib/generators/spider/templates/site.rb +7 -0
  19. data/lib/generators/spider/templates/site_spec.rb +13 -0
  20. data/lib/generators/spider/templates/test.rb +10 -0
  21. data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
  22. data/lib/generators/spider_migration/templates/migration.rb +42 -0
  23. data/lib/spider/active_record_methods.rb +60 -0
  24. data/lib/spider/http.rb +43 -0
  25. data/lib/spider/page/filter.rb +132 -0
  26. data/lib/spider/page/label.rb +28 -0
  27. data/lib/spider/page/pagination.rb +142 -0
  28. data/lib/spider/page/proxy.rb +149 -0
  29. data/lib/spider/page/publish.rb +78 -0
  30. data/lib/spider/page/validation.rb +136 -0
  31. data/lib/spider/page.rb +759 -0
  32. data/lib/spider/site.rb +225 -0
  33. data/lib/spider/spider_page.rb +18 -0
  34. data/lib/spider/spider_page_label.rb +5 -0
  35. data/lib/spider/version.rb +3 -0
  36. data/lib/spider.rb +81 -0
  37. data/lib/tasks/spider_tasks.rake +86 -0
  38. data/test/spider_fu_test.rb +9 -0
  39. data/test/test_helper.rb +4 -0
  40. data/uninstall.rb +2 -0
  41. metadata +151 -0
@@ -0,0 +1,225 @@
1
+ # encoding: utf-8
2
+ class Spider::Site
3
+ # include Singleton
4
+ cattr_accessor :sites,:publishers
5
+ @@sites = []
6
+
7
+ # 可以在rails的initializer目录下设置发布设置
8
+ # 比如
9
+ # Spider::Site.publishers = [Article]
10
+ # 这样 任何 Page 在 调用 publish 的时候都会 调用 Article.receive_spier_page 方法
11
+ # 参数是一个 page 自身
12
+ @@publishers = []
13
+ attr_accessor :id,:labels,:blocks,:encoding,:disabled,:name,:index_url,:domains
14
+
15
+ class_attribute :attributes
16
+ self.attributes = {}
17
+
18
# Builds a site whose list of accepted domains starts out empty.
def initialize
  # @pages = []
  @domains = Array.new
end
22
+
23
# Class-level configuration helpers. Each one simply forwards to the
# singleton +instance+ of the site class it is called on.
class << self
  # Id of the singleton site instance.
  def id
    instance.id
  end

  # Kept only so older site definitions keep loading: it ignores its
  # arguments and logs a warning.
  def set_example_url(*args)
    logger.warn "Spider::Site.set_example_url will take no effect, please remove."
  end

  # Sets the url handed to the index page when it is built.
  def set_index_url(url)
    instance.index_url = url
  end

  # Sets the display name of the site.
  def set_name(name)
    instance.name = name
  end

  # Replaces the site's domain list with the given domains.
  def set_domains(*domains)
    instance.domains = domains
  end

  # Alias-style wrapper around set_domains.
  def set_domain(*args)
    set_domains(*args)
  end

  # Index page of the singleton site instance.
  def index_page
    instance.index_page
  end
end
50
+
51
# Class-level shortcut: asks the singleton instance whether +url+ belongs to
# one of its domains.
#
# url - the url to check.
#
# Returns whatever the instance-level valid_domain? returns.
def self.valid_domain?(url)
  # The url must be forwarded; calling the instance method without it
  # raised ArgumentError on every call.
  instance.valid_domain?(url)
end
54
+
55
# Appends +domain+ to the site's domain list, drops duplicates in place and
# returns the list.
def add_domain(domain)
  domains << domain
  domains.uniq!
  domains
end
60
+
61
# Tells whether +url+ points at one of this site's domains.
#
# A host matches a domain when it equals the domain or is a subdomain of it.
# Matching is done on label boundaries, so "evilexample.com" does not match
# "example.com" (a plain suffix comparison would accept it). A leading dot on
# a configured domain is ignored and the comparison is case-insensitive.
#
# url - a url string (or URI object).
#
# Returns true or false. Urls without a host and unparsable urls give false.
def valid_domain?(url)
  host = URI(url).host
  return false if host.nil?

  host = host.downcase
  domains.any? do |domain|
    name = domain.to_s.downcase.sub(/\A\./, "")
    !name.empty? && (host == name || host.end_with?(".#{name}"))
  end
rescue URI::InvalidURIError, ArgumentError
  # URI() raises InvalidURIError for malformed strings and ArgumentError for
  # non-string input such as nil; both simply mean "not a valid domain".
  false
end
69
+
70
# Collects the page classes that live in the same namespace as this site.
#
# Every constant of the site class's parent module whose name contains
# "Page" is returned, except the one named exactly "BasePage". The list is
# rebuilt on each call and also stored in @pages.
#
# NOTE(review): +parent+ is presumably ActiveSupport's Module#parent.
#
# Returns an Array of constants (page classes).
def pages
  parent = self.class.parent
  names = parent.constants.select { |name| name.to_s =~ /Page/ }
  # Use the non-bang reject: reject! returns nil when nothing was removed,
  # which made this method raise for namespaces without a BasePage.
  names = names.reject { |name| name.to_s == "BasePage" }
  @pages = names.map { |name| parent.const_get(name) }
end
75
+
76
# Returns the site's index page, building it on first use.
#
# The page class is the first entry of +pages+ whose name ends in
# "IndexPage"; it is instantiated with +index_url+ and memoized. Returns nil
# when no such class exists.
def index_page
  return @index_page if @index_page

  klass = pages.find { |candidate| candidate.name =~ /IndexPage$/ }
  @index_page = klass.new(index_url) if klass
  @index_page
end

# Page classes of the singleton site instance.
def self.pages
  instance.pages
end
89
+
90
# Crawls a single url.
#
# Documented contract (from the original comment): return a Spider::Page
# instance on success, nil otherwise. This base implementation is only a
# placeholder: it prints a notice and returns nil.
#
# url     - the url to crawl (optional).
# options - extra options (optional).
#
# Both parameters are optional so that the argument-less call made by +start+
# and the two-argument call made by Spider::Site.crawl both work; the old
# zero-argument signature made the latter raise ArgumentError.
def crawl(url = nil, options = {})
  puts "#{self.class.name}#crawl not implement"
  nil
end
96
+
97
# Offers +url+ to every registered site in turn and stops at the first site
# whose crawl returns a truthy result.
#
# url     - the url to crawl.
# options - options passed through to each site's crawl.
#
# Returns the first truthy crawl result, or nil when no site handled the url.
# (Previously the method returned nil on success and the sites array on
# failure, so the result could not be used.)
def self.crawl(url, options = {})
  sites.each do |site|
    page = site.crawl(url, options)
    return page if page
  end
  nil
end
104
+
105
# Overrides the id of the singleton site instance.
def self.set_id(id)
  instance.id = id
end

# Assignment-style spellings of the class-level setters.
class << self
  alias_method :id=, :set_id
  alias_method :name=, :set_name
end
113
+
114
+
115
# Subclass hook: turns every subclass into a Singleton, gives its instance a
# default id and registers that instance in the shared site list.
#
# The default id is the part of the tableized class name before the first
# "/", i.e. it is derived from the outermost namespace of the class name.
def self.inherited(klass)
  super
  klass.send(:include, Singleton)
  site = klass.instance
  site.id = klass.name.tableize.split("/").first
  @@sites << site
end
121
+
122
# Public instance methods a site class adds on top of Spider::Site.
def self.channels
  own_methods = public_instance_methods
  own_methods - Spider::Site.public_instance_methods
end

# Ids of all registered sites,
# e.g. Spider::Site.names #=> ['6park','sina','wenxuecity']
def self.names
  sites.map(&:id)
end

# Logger shared by the whole library.
def self.logger
  Spider.logger
end

# Instance-level view of the class's channels.
def channels
  self.class.channels
end

# Instance-level access to the class logger.
def logger
  self.class.logger
end
143
+
144
+
145
# Runs this site's crawl. Does nothing (returns nil) when the site is
# disabled, as the comment on enable? promises.
#
# page_index - accepted because Spider::Site.start passes it to every site;
#              this base implementation does not use it. Without the
#              parameter that call raised ArgumentError.
#
# Returns the result of +crawl+, or nil for a disabled site.
def start(page_index = nil)
  return if disabled?

  crawl
end

# Scope of stored pages that belong to this site (matched on the +site+
# column against this site's id).
#
# options - currently unused.
#
# The model constant is fully qualified: inside `class Spider::Site` a bare
# `SpiderPage` is not looked up in the Spider namespace.
def spider_pages(options = {})
  Spider::SpiderPage.scoped(:conditions => { :site => id })
end

# True unless the site has been disabled.
# A disabled site's +start+ takes no action.
def enable?
  !disabled
end

# Whether the site is disabled; the negation of enable?.
def disabled?
  !enable?
end

# Starts one site, or every registered site.
#
# name       - id of the site to start; nil starts all registered sites.
# page_index - passed through to each site's +start+.
#
# Returns the sites list when starting everything, the site's start result
# when a single site is found, or nil (after logging a warning) when no site
# has the given id.
def self.start(name = nil, page_index = nil)
  # load_rules is not defined in this class; call it only when something
  # else provides it instead of failing with NameError on every call.
  load_rules if respond_to?(:load_rules, true)
  return sites.each { |site| site.start(page_index) } if name.nil?

  site = self[name]
  if site.nil?
    logger.warn "Spider::Site.start: site(#{name}) not found"
    return nil
  end
  site.start(page_index)
end
173
+
174
+
175
# Finds a registered site by id.
#
#   site = Spider::Site['sina']
#   site.start if site
#
# id - String or Symbol; both the argument and each site's id are compared
#      as strings, so a site whose id was set to a symbol is still found.
#
# Returns the matching site instance, or nil when no site has that id.
def self.[](id)
  wanted = id.to_s
  sites.find { |site| site.id.to_s == wanted }
end
185
+
186
# All registered site instances.
def self.all
  sites
end

# Flattened list of every registered site's +name+.
# NOTE(review): despite the method name this collects site names, not page
# names - confirm which was intended.
def self.full_page_names
  Spider::Site.all.map(&:name).flatten
end
194
+
195
# All example urls declared by this site's pages, as one flat list.
def example_url
  pages.map(&:example_url).flatten
end

# Example urls of the singleton site instance.
def self.example_url
  instance.example_url
end
203
+
204
# Lists the pages of every registered site, optionally keeping only those
# that declare all of the requested attributes.
#
# options - Hash; :attributes may be a single attribute name or an array of
#           names (nil entries are ignored).
#
# Returns a flat Array of pages. Without :attributes every page is returned.
def self.find_pages(options = {})
  candidates = Spider::Site.all.map(&:pages).flatten
  required = [options[:attributes]].flatten.compact
  return candidates if required.empty?

  candidates.select do |page|
    required.all? { |attribute| page.attribute_names.include?(attribute) }
  end
end
224
+
225
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
# ActiveRecord model for a crawled page; urls are unique and each page owns a
# set of labels (Spider::SpiderPageLabel rows keyed by page_id).
class Spider::SpiderPage < ActiveRecord::Base
  validates_uniqueness_of :url
  has_many :labels, :class_name => "Spider::SpiderPageLabel", :foreign_key => "page_id"
  accepts_nested_attributes_for :labels

  # Returns the page's labels as a Hash of label name => label value.
  # The hash is built once per instance and memoized.
  #
  # Built with each_with_object: Object#returning, used before, is not
  # available in the Rails 3.2 this gem depends on.
  def labels_data
    @labels_data ||= labels.each_with_object({}) do |label, hash|
      hash[label.name] = label.value
    end
  end

  # True when the labels_count counter says the page has at least one label.
  # A nil counter is treated as zero instead of raising.
  def has_label?
    labels_count.to_i > 0
  end
end
@@ -0,0 +1,5 @@
1
+ # encoding: utf-8
2
# ActiveRecord model for one named label attached to a crawled page.
class Spider::SpiderPageLabel < ActiveRecord::Base
  # Owning page; keeps the page's labels_count counter cache up to date.
  belongs_to :page,
             :class_name => "Spider::SpiderPage",
             :foreign_key => "page_id",
             :counter_cache => "labels_count"

  # A label name may appear only once per page.
  validates_uniqueness_of :name, :scope => :page_id
end
@@ -0,0 +1,3 @@
1
module Spider
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/spider.rb ADDED
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ #require "hpricot"
3
module Spider
  # Lazily built logger writing to log/spider_<Rails.env>.log under
  # Rails.root, at DEBUG level. The same instance is returned on every call.
  def self.logger
    @logger ||= begin
      log = Logger.new(File.join(Rails.root.to_s, "log", "spider_#{Rails.env}.log"))
      log.level = Logger::DEBUG
      log
    end
  end

  # Re-loads every Ruby file below <Rails.root>/spiders.
  #
  # A file that fails to load does not stop the others. The failure is
  # written to the spider log instead of being dropped silently, and only
  # load errors are rescued (StandardError plus ScriptError, which covers
  # SyntaxError and LoadError), so signals and exit requests propagate.
  #
  # Returns true when every file loaded, false when at least one failed.
  def self.reload
    error_occur = false
    Dir[Rails.root.join("spiders", "**", "*.rb").to_s].each do |file|
      begin
        load file
      rescue StandardError, ScriptError => e
        error_occur = true
        logger.error "Spider.reload: failed to load #{file}: #{e.class}: #{e.message}"
      end
    end
    !error_occur
  end
end
25
+
26
module Spider
  # Namespace used by all site definitions.
  module Sites
  end
end
31
+
32
+ require "spider/http"
33
+ require "spider/site"
34
+ require "spider/page"
35
+
36
+
37
+ require "spider/page/filter"
38
+ Spider::Page.send(:include,Spider::Page::Filter)
39
+
40
+ require "spider/page/publish"
41
+ Spider::Page.send(:include,Spider::Page::Publish)
42
+
43
+ require "spider/page/validation"
44
+ Spider::Page.send(:include,Spider::Page::Validation)
45
+
46
+ require "spider/page/pagination"
47
+ Spider::Page.send(:include,Spider::Page::Pagination)
48
+
49
+ require "spider/page/proxy"
50
+ Spider::Page.send(:include,Spider::Page::Proxy)
51
+
52
+ require "spider/page/label"
53
+ Spider::Page.send(:include,Spider::Page::Label)
54
+
55
+
56
# Load the application's spider definitions from <Rails.root>/spiders.
spiders_dir = File.join(Rails.root.to_s, "spiders")
$:.push(spiders_dir) unless $:.include?(spiders_dir)

# Define one namespace module per spider directory.
# Only directories are considered (plain files such as init.rb used to
# produce stray constants), and a constant that already exists is left alone
# so a directory name can never replace an existing class or module.
Dir[File.join(spiders_dir, "*")].sort.each do |dir|
  next unless File.directory?(dir)

  const_name = File.basename(dir).classify
  next if Object.const_defined?(const_name)

  Object.const_set(const_name, Module.new)
end

# Load the optional initialisation file first.
init_file = File.join(spiders_dir, "init.rb")
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
require init_file if File.exist?(init_file)

files = Dir[File.join(spiders_dir, "**", "*.rb")].sort

# Sites first, then base pages, then everything else; require skips files
# that were already loaded in an earlier pass.
files.select { |path| path =~ /site\.rb/ }.each { |path| require path }
files.select { |path| path =~ /base_page\.rb/ }.each { |path| require path }
files.each { |path| require path }
79
+
80
+ # 包含 active record methods
81
+ require "spider/active_record_methods"
@@ -0,0 +1,86 @@
1
+ # desc "Explaining what the task does"
2
+ # task :spider_fu do
3
+ # # Task goes here
4
+ # end
5
+
6
+
7
+ desc "开始采集"
8
+ namespace :spider do
9
+
10
namespace :proxy do
  desc "test proxy PAGE=XXXX::BasePage"
  # Resolves the page class named in ENV['PAGE'], runs its proxy validation
  # and prints the valid and invalid proxies, one "a:b" line per proxy.
  task :test => :environment do
    klass = ENV['PAGE']
    begin
      klass = klass.constantize
    rescue StandardError
      # Covers an unset PAGE (nil has no constantize) and unknown class names
      # without also swallowing signals or exit requests.
      puts "unknow class `#{klass}`, please set a right spider page class to PAGE=XXXX::XxxPage"
      exit
    end
    result = klass.validate_proxies
    puts "valid proxies:"
    result[:valid].each do |proxy|
      puts proxy.join(":")
    end
    puts
    puts "invalid proxies:"
    # This section used to iterate result[:valid] a second time.
    # NOTE(review): assumes validate_proxies reports failures under :invalid -
    # confirm against Spider::Page::Proxy.
    (result[:invalid] || []).each do |proxy|
      puts proxy.join(":")
    end
  end
end
32
+
33
# Starts the site named in ENV['SITE'], or every registered site when SITE is
# blank, by running spider:site:start once per site.
task :start => :environment do
  puts "spider:start" => __LINE__
  if ENV['SITE'].blank?
    # spider:site:start reads the site id back from ENV['SITE'].
    Spider::Site.all.each do |registered|
      ENV['SITE'] = registered.id
      Rake::Task['spider:site:start'].execute
    end
  else
    Rake::Task['spider:site:start'].execute
  end
end
46
+
47
namespace :site do
  desc "开始单个网站"
  # Starts the single site named in ENV['SITE'], publishing to the
  # comma-separated list in ENV['PUBLISHERS']. Without PUBLISHERS the task
  # prints a warning and does nothing.
  task :start => :environment do
    publishers = ENV['PUBLISHERS']
    if publishers.blank?
      puts "WARNING: PUBLISHERS is blank."
      next
    end

    Spider::Page.publishers = publishers.split(",")
    puts "Set publishers to #{Spider::Page.publishers.inspect}"

    site = Spider::Site[ENV['SITE']]
    if site
      puts site.name
      puts "正在运行..."
      site.start
    else
      puts "site(#{ENV['SITE']}) not found"
    end
  end
end
70
+
71
+
72
# Test task for spider rules: runs the tests of one spider when ENV['SPIDER']
# is set, otherwise every test below test/spiders.
Rake::TestTask.new(:test => 'db:test:prepare') do |t|
  t.libs << 'test'
  t.pattern = ENV['SPIDER'] ? "test/spiders/#{ENV['SPIDER']}/*_test.rb" : 'test/spiders/**/*_test.rb'
  t.verbose = true
end

Rake::Task['spider:test'].comment = "测试spider规则"
83
+
84
+ # 按照规则 spider 名字,生成 test
85
+
86
+ end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+ require 'test_helper'
3
+
4
# Placeholder test case generated with the plugin skeleton.
class SpiderFuTest < ActiveSupport::TestCase
  # Replace this with your real tests.
  test("the truth") { assert true }
end
@@ -0,0 +1,4 @@
1
+ # encoding: utf-8
2
+ require 'rubygems'
3
+ require 'active_support'
4
+ require 'active_support/test_case'
data/uninstall.rb ADDED
@@ -0,0 +1,2 @@
1
+ # encoding: utf-8
2
+ # Uninstall hook code here
metadata ADDED
@@ -0,0 +1,151 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spider2
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - aotianlong
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-03-04 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rails
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 15
29
+ segments:
30
+ - 3
31
+ - 2
32
+ - 0
33
+ version: 3.2.0
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: htmlentities
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 59
45
+ segments:
46
+ - 4
47
+ - 1
48
+ - 0
49
+ version: 4.1.0
50
+ type: :runtime
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: sqlite3
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ type: :development
65
+ version_requirements: *id003
66
+ description: a framework to crawl web pages
67
+ email:
68
+ - aotianlong@gmail.com
69
+ executables: []
70
+
71
+ extensions: []
72
+
73
+ extra_rdoc_files: []
74
+
75
+ files:
76
+ - lib/generators/spider/spider_generator.rb
77
+ - lib/generators/spider/templates/base_page.rb
78
+ - lib/generators/spider/templates/base_page_spec.rb
79
+ - lib/generators/spider/templates/index_page.rb
80
+ - lib/generators/spider/templates/index_page_spec.rb
81
+ - lib/generators/spider/templates/index_page_test.rb
82
+ - lib/generators/spider/templates/list_page.rb
83
+ - lib/generators/spider/templates/list_page_spec.rb
84
+ - lib/generators/spider/templates/list_page_test.rb
85
+ - lib/generators/spider/templates/show_page.rb
86
+ - lib/generators/spider/templates/show_page_spec.rb
87
+ - lib/generators/spider/templates/show_page_test.rb
88
+ - lib/generators/spider/templates/site.rb
89
+ - lib/generators/spider/templates/site_spec.rb
90
+ - lib/generators/spider/templates/test.rb
91
+ - lib/generators/spider_migration/spider_migration_generator.rb
92
+ - lib/generators/spider_migration/templates/migration.rb
93
+ - lib/spider/active_record_methods.rb
94
+ - lib/spider/http.rb
95
+ - lib/spider/page/filter.rb
96
+ - lib/spider/page/label.rb
97
+ - lib/spider/page/pagination.rb
98
+ - lib/spider/page/proxy.rb
99
+ - lib/spider/page/publish.rb
100
+ - lib/spider/page/validation.rb
101
+ - lib/spider/page.rb
102
+ - lib/spider/site.rb
103
+ - lib/spider/spider_page.rb
104
+ - lib/spider/spider_page_label.rb
105
+ - lib/spider/version.rb
106
+ - lib/spider.rb
107
+ - lib/tasks/spider_tasks.rake
108
+ - MIT-LICENSE
109
+ - Rakefile
110
+ - init.rb
111
+ - install.rb
112
+ - README
113
+ - uninstall.rb
114
+ - test/spider_fu_test.rb
115
+ - test/test_helper.rb
116
+ homepage: http://www.powerapple.com
117
+ licenses: []
118
+
119
+ post_install_message:
120
+ rdoc_options: []
121
+
122
+ require_paths:
123
+ - lib
124
+ required_ruby_version: !ruby/object:Gem::Requirement
125
+ none: false
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ hash: 3
130
+ segments:
131
+ - 0
132
+ version: "0"
133
+ required_rubygems_version: !ruby/object:Gem::Requirement
134
+ none: false
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ hash: 3
139
+ segments:
140
+ - 0
141
+ version: "0"
142
+ requirements: []
143
+
144
+ rubyforge_project:
145
+ rubygems_version: 1.8.15
146
+ signing_key:
147
+ specification_version: 3
148
+ summary: spider
149
+ test_files:
150
+ - test/spider_fu_test.rb
151
+ - test/test_helper.rb