spider2 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +15 -0
  3. data/Rakefile +23 -0
  4. data/init.rb +3 -0
  5. data/install.rb +2 -0
  6. data/lib/generators/spider/spider_generator.rb +42 -0
  7. data/lib/generators/spider/templates/base_page.rb +6 -0
  8. data/lib/generators/spider/templates/base_page_spec.rb +13 -0
  9. data/lib/generators/spider/templates/index_page.rb +6 -0
  10. data/lib/generators/spider/templates/index_page_spec.rb +14 -0
  11. data/lib/generators/spider/templates/index_page_test.rb +10 -0
  12. data/lib/generators/spider/templates/list_page.rb +6 -0
  13. data/lib/generators/spider/templates/list_page_spec.rb +22 -0
  14. data/lib/generators/spider/templates/list_page_test.rb +10 -0
  15. data/lib/generators/spider/templates/show_page.rb +14 -0
  16. data/lib/generators/spider/templates/show_page_spec.rb +19 -0
  17. data/lib/generators/spider/templates/show_page_test.rb +10 -0
  18. data/lib/generators/spider/templates/site.rb +7 -0
  19. data/lib/generators/spider/templates/site_spec.rb +13 -0
  20. data/lib/generators/spider/templates/test.rb +10 -0
  21. data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
  22. data/lib/generators/spider_migration/templates/migration.rb +42 -0
  23. data/lib/spider/active_record_methods.rb +60 -0
  24. data/lib/spider/http.rb +43 -0
  25. data/lib/spider/page/filter.rb +132 -0
  26. data/lib/spider/page/label.rb +28 -0
  27. data/lib/spider/page/pagination.rb +142 -0
  28. data/lib/spider/page/proxy.rb +149 -0
  29. data/lib/spider/page/publish.rb +78 -0
  30. data/lib/spider/page/validation.rb +136 -0
  31. data/lib/spider/page.rb +759 -0
  32. data/lib/spider/site.rb +225 -0
  33. data/lib/spider/spider_page.rb +18 -0
  34. data/lib/spider/spider_page_label.rb +5 -0
  35. data/lib/spider/version.rb +3 -0
  36. data/lib/spider.rb +81 -0
  37. data/lib/tasks/spider_tasks.rake +86 -0
  38. data/test/spider_fu_test.rb +9 -0
  39. data/test/test_helper.rb +4 -0
  40. data/uninstall.rb +2 -0
  41. metadata +151 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 [name of plugin creator]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,15 @@
1
+ SpiderFu
2
+ ========
3
+
4
+ Introduction goes here.
5
+
6
+
7
+ Example
8
+ =======
9
+
10
+ LaimaimaSpider.start
11
+
12
+ Example goes here.
13
+
14
+
15
+ Copyright (c) 2009 [name of plugin creator], released under the MIT license
data/Rakefile ADDED
@@ -0,0 +1,23 @@
# Rake tasks for the spider_fu plugin: `test` (the default task) runs the unit
# tests and `rdoc` generates the API documentation.
require 'rake'
require 'rake/testtask'

# `rake/rdoctask` was removed in Rake 10; the task class now ships with RDoc
# as `RDoc::Task`. Prefer it and fall back to the legacy class so that very
# old toolchains keep working.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

desc 'Default: run unit tests.'
task :default => :test

desc 'Test the spider_fu plugin.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Generate documentation for the spider_fu plugin.'
rdoc_task_class.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'SpiderFu'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ # encoding: utf-8
2
+ # Include hook code here
3
+ require "spider"
data/install.rb ADDED
@@ -0,0 +1,2 @@
1
+ # encoding: utf-8
2
+ # Install hook code here
@@ -0,0 +1,42 @@
# encoding: utf-8
require "rails/generators/active_record"

# Generator that scaffolds a new spider named after the generator argument:
# the site/page classes under spiders/<file_name>, plus matching Test::Unit
# files under test/spiders/<file_name> and RSpec files under
# spec/spiders/<file_name>.
class SpiderGenerator < ActiveRecord::Generators::Base

  source_root File.expand_path('../templates', __FILE__)

  # Renders every template into its destination, in the order listed below.
  # Each entry is [template name, destination root, destination file name].
  #
  # NOTE(review): the gem's file list also contains a base_page_spec.rb
  # template, but nothing here generates it - confirm whether that is intended.
  def generate_files
    targets = [
      # spider classes
      ["site.rb",            "spiders",      "site.rb"],
      ["base_page.rb",       "spiders",      "base_page.rb"],
      ["show_page.rb",       "spiders",      "show_page.rb"],
      ["list_page.rb",       "spiders",      "list_page.rb"],
      ["index_page.rb",      "spiders",      "index_page.rb"],
      # test
      ["test.rb",            "test/spiders", "site_test.rb"],
      ["show_page_test.rb",  "test/spiders", "show_page_test.rb"],
      ["list_page_test.rb",  "test/spiders", "list_page_test.rb"],
      ["index_page_test.rb", "test/spiders", "index_page_test.rb"],
      # spec
      ["site_spec.rb",       "spec/spiders", "site_spec.rb"],
      ["show_page_spec.rb",  "spec/spiders", "show_page_spec.rb"],
      ["list_page_spec.rb",  "spec/spiders", "list_page_spec.rb"],
      ["index_page_spec.rb", "spec/spiders", "index_page_spec.rb"]
    ]

    targets.each do |source, root, destination|
      template source, File.join("#{root}/#{file_name}", destination)
    end
  end

end
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class BasePage < Spider::Page
4
+ encoding "utf-8"
5
+ end
6
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::BasePage do
5
+
6
+ before :each do
7
+ end
8
+
9
+ # it "will correct set encoding" do
10
+ # <%= class_name %>::BasePage.encoding.should == "gbk"
11
+ # end
12
+
13
+ end
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class IndexPage < BasePage
4
+ ignore_existing
5
+ end
6
+ end
@@ -0,0 +1,14 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::IndexPage do
5
+
6
+ before :each do
7
+ @index_page = <%= class_name %>::Site.index_page
8
+ end
9
+
10
+ # it "will correct set encoding" do
11
+ # <%= class_name %>::IndexPage.encoding.should == "gbk"
12
+ # end
13
+
14
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::IndexPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class ListPage < BasePage
4
+ ignore_existing
5
+ end
6
+ end
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::ListPage do
5
+
6
+ before :each do
7
+ @url = "" # set list page url here
8
+ @list_page = <%= class_name %>::ListPage.new @url
9
+ end
10
+
11
+ # it "will correct set encoding" do
12
+ # <%= class_name %>::ListPage.encoding.should == "gbk"
13
+ # end
14
+
15
+ it "will return all show pages" do
16
+ show_pages = @list_page.show_pages
17
+ show_pages.each do |page|
18
+ # page.url.should
19
+ end
20
+ end
21
+
22
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::ListPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,14 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class ShowPage < BasePage
4
+ # Some common operations:
5
+ # define_attributes :title
6
+ # filter :title,:filters=>:default
7
+ # before_crawl{ do some thing}
8
+ # before_fetch{ do login }
9
+ # set_proxy "host",port
10
+ # validate_url :domain=>"powerapple.com",
11
+ # :example=>'http://www.powerapple.com',
12
+ # :match=>/powerapple/
13
+ end
14
+ end
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::ShowPage do
5
+
6
+ before :each do
7
+ @url = "" # set show page url here
8
+ @show_page = <%= class_name %>::ShowPage.new @url
9
+ end
10
+
11
+ it "will parse title correct" do
12
+ @show_page.title.should == ""
13
+ end
14
+
15
+ it "will parse body correct" do
16
+ @show_page.body.should == ""
17
+ end
18
+
19
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::ShowPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class Site < Spider::Site
4
+ set_name '<%= class_name %>'
5
+ set_id '<%= file_name %>'
6
+ end
7
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::Site do
5
+
6
+ before :each do
7
+ end
8
+
9
+ # it "will correct set encoding" do
10
+ # <%= class_name %>::Site.encoding.should == "gbk"
11
+ # end
12
+
13
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::SiteTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,11 @@
# encoding: utf-8
require "rails/generators/active_record"

# Generator that installs the migration creating the spider_pages and
# spider_page_labels tables.
class SpiderMigrationGenerator < ActiveRecord::Generators::Base
  source_root File.expand_path('../templates', __FILE__)

  # Renders templates/migration.rb into db/migrate through migration_template.
  def copy_files
    migration_template(
      'migration.rb',
      "db/migrate/create_spider_pages_and_spider_page_labels"
    )
  end

end
@@ -0,0 +1,42 @@
# encoding: utf-8
# Creates the two tables used by the spider: spider_pages (one row per URL)
# and spider_page_labels (name/value rows pointing at a page via page_id).
class CreateSpiderPagesAndSpiderPageLabels < ActiveRecord::Migration

  # Builds both tables and the indexes on their lookup columns.
  def self.up
    create_table :spider_pages do |t|
      t.string   :url
      t.boolean  :done,         :default => false
      t.datetime :created_at
      t.datetime :updated_at
      t.integer  :labels_count, :default => 0
      t.string   :labels_hash,  :limit => 32
      t.boolean  :published,    :default => false
      t.boolean  :fragment,     :default => false
      t.integer  :content_length
      t.string   :site
    end

    [:url, :site].each do |column|
      add_index :spider_pages, column
    end

    create_table :spider_page_labels do |t|
      t.string   :name
      t.integer  :page_id
      t.text     :value
      t.datetime :created_at
      t.datetime :updated_at
    end

    [:name, :page_id].each do |column|
      add_index :spider_page_labels, column
    end
  end

  # Drops the tables in reverse order of creation.
  def self.down
    drop_table :spider_page_labels
    drop_table :spider_pages
  end
end
@@ -0,0 +1,60 @@
# encoding: utf-8
# Mixin that lets a model build one of its records by crawling a URL.
module Spider::ActiveRecordMethods

  def self.included(base)
    base.send(:extend, ClassMethods)
    base.send(:include, InstanceMethods)
  end

  module ClassMethods

    # Crawl +url+ and return the first object published from it to this class.
    #
    # Every page class returned by Spider::Page.find_all_by_url(url) is tried
    # in order. The first one whose published object answers true to valid?
    # wins; its spider page is saved and the object is returned.
    #
    # Per the original author's note, the receiving class must implement
    # receive_spider_page for this to work.
    #
    # url   - the address to crawl.
    # force - when true, skip the check that the URL is not already stored
    #         as a ::SpiderPage.
    #
    # Raises RuntimeError when the URL already exists (and force is false),
    # when no page class matches the URL, or when every matching page class
    # fails to produce a valid object.
    def create_from_url(url, force = false)
      if !force && ::SpiderPage.find_by_url(url)
        raise "此URL已经存在于系统中了。"
      end

      # Page classes able to handle this URL.
      pages = Spider::Page.find_all_by_url(url)
      raise "没有找到合适的规则" if pages.empty?

      # human_name is gone from newer ActiveRecord versions; fall back to
      # model_name.human there so the log line cannot abort the crawl.
      display_name = respond_to?(:human_name) ? human_name : model_name.human
      Spider::Site.logger.info "采集单个#{display_name} #{url},找到 适合的规则 #{pages.inspect}"

      object = nil
      pages.each do |page|
        page.logger.info "使用 #{page} 的规则来尝试采集"

        spider_page = nil
        begin
          spider_page = page.new(url)
          object = spider_page.publish_to(self).first
        rescue StandardError => e
          # StandardError only: rescuing Exception would also swallow
          # Interrupt, SystemExit and NoMemoryError in the middle of a crawl.
          logger.error e.message
          logger.error e.backtrace.join("\n")
          object = nil
        end

        if object.try(:valid?)
          page.logger.info "采集成功"
          # Remember the URL so it is not crawled again.
          spider_page.save
          break
        end

        page.logger.info "采集失败: #{object.try(:errors).try(:full_messages).try(:first)}"
        object = nil
      end

      unless object
        raise "采集器了找到了规则 #{pages.inspect}, 但是都失败了."
      end
      object
    end

  end

  module InstanceMethods
  end

end
@@ -0,0 +1,43 @@
# encoding: utf-8
# HTTParty-based HTTP helper. Default request options live on this module, so
# the helpers below change them temporarily and restore them afterwards.
module Spider::Http
  include HTTParty
  headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
  # debug_output Spider.logger

  # Serializes temporary changes to default_options. Created at load time so
  # two threads can never lazily create two different mutexes.
  @options_mutex = Mutex.new

  # Runs the block with +new_options+ merged into default_options, then
  # restores the previous defaults (minus any 'referer' header).
  #
  # new_options - Hash merged into default_options for the duration of the block.
  #
  # Returns the block's value. A StandardError raised by the block is logged
  # to Spider.logger and swallowed, in which case nil is returned (previously
  # the logger's return value leaked out as the result).
  def self.with_options(new_options = {})
    (@options_mutex ||= Mutex.new).synchronize do
      # deep_dup (from Active Support) matters here: a shallow copy would
      # share the nested headers hash with the options being modified.
      saved_options = default_options.deep_dup
      default_options.merge! new_options
      begin
        yield
      rescue StandardError => e
        # StandardError only, so Interrupt/SystemExit still propagate.
        Spider.logger.error e.message
        Spider.logger.error e.backtrace.join("\n")
        nil
      ensure
        saved_headers = saved_options[:headers]
        saved_headers.delete 'referer' if saved_headers
        self.default_options = saved_options
      end
    end
  end

  # Runs the block with requests routed through the proxy at ip:port.
  # The proxy settings are removed afterwards even if the block raises.
  #
  # Returns the block's value.
  def self.with_proxy(ip, port, &block)
    http_proxy ip, port
    yield
  ensure
    clear_proxy
  end

  # Removes the proxy address and port from the default options.
  def self.clear_proxy
    Spider::Http.default_options.delete :http_proxyaddr
    Spider::Http.default_options.delete :http_proxyport
  end

  # Disabled alternative implementation kept from the original source:
  #
  #   def self.get(url,options = {})
  #     open(url).read
  #   end

end
@@ -0,0 +1,132 @@
# encoding: utf-8
# Attribute filtering for pages (used to clean up HTML).
#
# Filters are registered per attribute with the class-level +filter+ macro and
# are applied when the attribute is read, in three passes: before, middle
# (the default position) and after.
module Spider::Page::Filter

  def self.included(base)
    base.class_eval do
      send(:include, InstanceMethods)
      send(:extend, ClassMethods)
      alias_method_chain :read_attribute, :filter
      class_attribute :attributes_filters
      class_attribute :attributes_before_filters
      class_attribute :attributes_after_filters
      self.attributes_filters = {}
      self.attributes_before_filters = {}
      self.attributes_after_filters = {}
    end
  end

  module ClassMethods

    # The filter applied for the :default filter name: HTML sanitizing.
    def default_filter(text)
      sanitize(text)
    end

    # Memoized sanitizer instance shared by the class.
    def sanitizer
      @sanitizer ||= HTML::WhiteListSanitizer.new
    end

    def sanitize(text, options = {})
      sanitizer.sanitize(text, options)
    end

    # Registers filters for one or more attributes.
    #
    #   class Sina::ShowPage < Sina::BasePage
    #     filter :title,  :filters => [:default]
    #     filter :title2, :filters => :method_to_call_to_filter
    #     filter :title3, :filters => lambda { |i| 'string to return ' }
    #   end
    #
    # args  - attribute names, optionally followed by an options hash:
    #         :filters  - a filter or array of filters (see filter_text).
    #         :position - :before, :after, or nil/:middle (the default).
    # block - when given, used as the :filters option.
    #
    # Raises ArgumentError for an unknown option key or position.
    def filter(*args, &block)
      options = args.extract_options!
      options[:filters] = block if block
      options.assert_valid_keys :filters, :position

      position = options[:position]
      position = nil if position.to_s == "middle"
      if position && !%w(before after).include?(position.to_s)
        raise ArgumentError, "unknown filter position: #{options[:position].inspect}"
      end
      prefix = position ? "#{position}_" : nil
      accessor = "attributes_#{prefix}filters"

      logger.debug "create filter: #{name} : position : #{prefix},options : #{options.inspect}"

      # Build a new hash and assign it through the class_attribute writer.
      # Mutating the inherited hash in place would leak this filter into the
      # parent class and every sibling that shares the same hash object.
      filter_attrs = send(accessor).dup
      args.each do |attr_name|
        filter_attrs[attr_name] = options
      end
      send("#{accessor}=", filter_attrs)

      logger.debug "#{filter_attrs.inspect}"
    end

    # All registered filters grouped by position.
    def filters
      {:before=>attributes_before_filters,:middle=>attributes_filters,:after=>attributes_after_filters}
    end

  end

  module InstanceMethods

    # Reads an attribute and runs it through its filters. The filtered value
    # is cached in @attributes; pass reload = true to recompute it. A cached
    # nil/false value is always recomputed.
    def read_attribute_with_filter(name, reload = false)
      if reload || !@attributes[name]
        raw_value = read_attribute_without_filter(name)
        @attributes[name] = filter_for_attribute(name, raw_value)
      end
      @attributes[name]
    end

    def default_filter(text)
      self.class.default_filter text
    end

    def sanitize(*args, &block)
      self.class.sanitize(*args, &block)
    end

    # Applies the before, middle and after filters registered for +name+ to
    # +text+, in that order, and returns the result.
    def filter_for_attribute(name, text)
      [attributes_before_filters, attributes_filters, attributes_after_filters].each do |registry|
        next unless registry.key?(name)
        entry = registry[name]
        logger.debug "#{name} : #{entry.inspect}"
        text = filter_text(text, entry[:filters])
        logger.debug "#{name}(Filtered):#{text}"
      end
      text
    end

    # Runs +text+ through each filter in turn and returns the result.
    #
    # A filter may be:
    #   :default - the class-level default filter (sanitize);
    #   a Symbol - an instance method called with the text (skipped when the
    #              instance does not respond to it);
    #   a Proc   - evaluated in the context of this instance with the text.
    # Anything else leaves the text unchanged.
    def filter_text(text, filters = [])
      [filters].flatten.each do |filter|
        text = case filter
               when :default
                 logger.debug "default filter"
                 default_filter(text)
               when Symbol
                 logger.debug "symbol filter"
                 respond_to?(filter) ? send(filter, text) : text
               when Proc
                 logger.debug "proc filter"
                 # instance_exec is core Ruby; Proc#bind was an ActiveSupport
                 # extension that has since been removed.
                 instance_exec(text, &filter)
               else
                 logger.debug "unknow filter"
                 text
               end
      end
      text
    end

  end

  # Instance-level shortcut to the class's registered filters.
  def filters
    self.class.filters
  end

end
@@ -0,0 +1,28 @@
# encoding: utf-8
# Labels hold the useful pieces of information parsed out of a page.
module Spider::Page::Label
  extend ActiveSupport::Concern

  module InstanceMethods
  end

  module ClassMethods
    # Declares a label: an attribute whose value is computed by +block+ in
    # the context of the page instance and then memoized per instance.
    #
    # name    - the label name; also the name of the generated reader.
    # options - accepted but not used by this method.
    # block   - computes the value; evaluated with self set to the instance.
    #
    # The generated reader accepts :reload => true to discard the memoized
    # value first. A nil/false result is not memoized, so it is recomputed on
    # the next call. Calling a reader whose label was declared without a
    # block raises ArgumentError.
    def label(name, options = {}, &block)
      name = name.to_sym
      define_attribute name
      cache_ivar = "@__#{name}__"

      define_method(name) do |*args|
        opts = args.extract_options!
        instance_variable_set(cache_ivar, nil) if opts[:reload]

        value = instance_variable_get(cache_ivar)
        unless value
          unless block
            raise ArgumentError, "label #{name.inspect} was declared without a block"
          end
          # instance_exec is core Ruby; Proc#bind was an ActiveSupport
          # extension that has since been removed.
          value = instance_exec(&block)
          instance_variable_set(cache_ivar, value)
        end
        value
      end
    end
  end
end