spider2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. data/MIT-LICENSE +20 -0
  2. data/README +15 -0
  3. data/Rakefile +23 -0
  4. data/init.rb +3 -0
  5. data/install.rb +2 -0
  6. data/lib/generators/spider/spider_generator.rb +42 -0
  7. data/lib/generators/spider/templates/base_page.rb +6 -0
  8. data/lib/generators/spider/templates/base_page_spec.rb +13 -0
  9. data/lib/generators/spider/templates/index_page.rb +6 -0
  10. data/lib/generators/spider/templates/index_page_spec.rb +14 -0
  11. data/lib/generators/spider/templates/index_page_test.rb +10 -0
  12. data/lib/generators/spider/templates/list_page.rb +6 -0
  13. data/lib/generators/spider/templates/list_page_spec.rb +22 -0
  14. data/lib/generators/spider/templates/list_page_test.rb +10 -0
  15. data/lib/generators/spider/templates/show_page.rb +14 -0
  16. data/lib/generators/spider/templates/show_page_spec.rb +19 -0
  17. data/lib/generators/spider/templates/show_page_test.rb +10 -0
  18. data/lib/generators/spider/templates/site.rb +7 -0
  19. data/lib/generators/spider/templates/site_spec.rb +13 -0
  20. data/lib/generators/spider/templates/test.rb +10 -0
  21. data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
  22. data/lib/generators/spider_migration/templates/migration.rb +42 -0
  23. data/lib/spider/active_record_methods.rb +60 -0
  24. data/lib/spider/http.rb +43 -0
  25. data/lib/spider/page/filter.rb +132 -0
  26. data/lib/spider/page/label.rb +28 -0
  27. data/lib/spider/page/pagination.rb +142 -0
  28. data/lib/spider/page/proxy.rb +149 -0
  29. data/lib/spider/page/publish.rb +78 -0
  30. data/lib/spider/page/validation.rb +136 -0
  31. data/lib/spider/page.rb +759 -0
  32. data/lib/spider/site.rb +225 -0
  33. data/lib/spider/spider_page.rb +18 -0
  34. data/lib/spider/spider_page_label.rb +5 -0
  35. data/lib/spider/version.rb +3 -0
  36. data/lib/spider.rb +81 -0
  37. data/lib/tasks/spider_tasks.rake +86 -0
  38. data/test/spider_fu_test.rb +9 -0
  39. data/test/test_helper.rb +4 -0
  40. data/uninstall.rb +2 -0
  41. metadata +151 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 [name of plugin creator]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,15 @@
1
+ SpiderFu
2
+ ========
3
+
4
+ Introduction goes here.
5
+
6
+
7
+ Example
8
+ =======
9
+
10
+ LaimaimaSpider.start
11
+
12
+ Example goes here.
13
+
14
+
15
+ Copyright (c) 2009 [name of plugin creator], released under the MIT license
data/Rakefile ADDED
@@ -0,0 +1,23 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+
5
+ desc 'Default: run unit tests.'
6
+ task :default => :test
7
+
8
+ desc 'Test the spider_fu plugin.'
9
+ Rake::TestTask.new(:test) do |t|
10
+ t.libs << 'lib'
11
+ t.libs << 'test'
12
+ t.pattern = 'test/**/*_test.rb'
13
+ t.verbose = true
14
+ end
15
+
16
+ desc 'Generate documentation for the spider_fu plugin.'
17
+ Rake::RDocTask.new(:rdoc) do |rdoc|
18
+ rdoc.rdoc_dir = 'rdoc'
19
+ rdoc.title = 'SpiderFu'
20
+ rdoc.options << '--line-numbers' << '--inline-source'
21
+ rdoc.rdoc_files.include('README')
22
+ rdoc.rdoc_files.include('lib/**/*.rb')
23
+ end
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ # encoding: utf-8
2
+ # Include hook code here
3
+ require "spider"
data/install.rb ADDED
@@ -0,0 +1,2 @@
1
+ # encoding: utf-8
2
+ # Install hook code here
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ require "rails/generators/active_record"
3
+ class SpiderGenerator < ActiveRecord::Generators::Base # renders the per-site spider, test and spec scaffolds
4
+
5
+ source_root File.expand_path('../templates', __FILE__) # templates directory sits next to this file
6
+
7
+ def generate_files # one template call per generated file; file_name picks the target sub-directory
8
+ template "site.rb",
9
+ File.join("spiders/#{file_name}","site.rb")
10
+ template "base_page.rb",
11
+ File.join("spiders/#{file_name}","base_page.rb")
12
+ template "show_page.rb",
13
+ File.join("spiders/#{file_name}","show_page.rb")
14
+ template "list_page.rb",
15
+ File.join("spiders/#{file_name}","list_page.rb")
16
+ template "index_page.rb",
17
+ File.join("spiders/#{file_name}","index_page.rb")
18
+
19
+ # Test scaffolds, written below test/spiders/<file_name>; note test.rb is rendered as site_test.rb
20
+
21
+ template "test.rb",
22
+ File.join("test/spiders/#{file_name}","site_test.rb")
23
+ template "show_page_test.rb",
24
+ File.join("test/spiders/#{file_name}","show_page_test.rb")
25
+ template "list_page_test.rb",
26
+ File.join("test/spiders/#{file_name}","list_page_test.rb")
27
+ template "index_page_test.rb",
28
+ File.join("test/spiders/#{file_name}","index_page_test.rb")
29
+
30
+ # Spec scaffolds, written below spec/spiders/<file_name>. NOTE(review): no base_page_spec.rb is rendered here - confirm that is intended
31
+ template "site_spec.rb",
32
+ File.join("spec/spiders/#{file_name}","site_spec.rb")
33
+ template "show_page_spec.rb",
34
+ File.join("spec/spiders/#{file_name}","show_page_spec.rb")
35
+ template "list_page_spec.rb",
36
+ File.join("spec/spiders/#{file_name}","list_page_spec.rb")
37
+ template "index_page_spec.rb",
38
+ File.join("spec/spiders/#{file_name}","index_page_spec.rb")
39
+
40
+ end
41
+
42
+ end
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class BasePage < Spider::Page
4
+ encoding "utf-8"
5
+ end
6
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::BasePage do
5
+
6
+ before :each do
7
+ end
8
+
9
+ # it "will correct set encoding" do
10
+ # <%= class_name %>::BasePage.encoding.should == "gbk"
11
+ # end
12
+
13
+ end
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class IndexPage < BasePage
4
+ ignore_existing
5
+ end
6
+ end
@@ -0,0 +1,14 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::IndexPage do
5
+
6
+ before :each do
7
+ @index_page = <%= class_name %>::Site.index_page
8
+ end
9
+
10
+ # it "will correct set encoding" do
11
+ # <%= class_name %>::IndexPage.encoding.should == "gbk"
12
+ # end
13
+
14
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::IndexPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,6 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class ListPage < BasePage
4
+ ignore_existing
5
+ end
6
+ end
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::ListPage do
5
+
6
+ before :each do
7
+ @url = "" # set list page url here
8
+ @list_page = <%= class_name %>::ListPage.new @url
9
+ end
10
+
11
+ # it "will correct set encoding" do
12
+ # <%= class_name %>::ListPage.encoding.should == "gbk"
13
+ # end
14
+
15
+ it "will return all show pages" do
16
+ show_pages = @list_page.show_pages
17
+ show_pages.each do |page|
18
+ # page.url.should
19
+ end
20
+ end
21
+
22
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::ListPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,14 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class ShowPage < BasePage
4
+ # Some common operations:
5
+ # define_attributes :title
6
+ # filter :title,:filters=>:default
7
+ # before_crawl{ do some thing}
8
+ # before_fetch{ do login }
9
+ # set_proxy "host",port
10
+ # validate_url :domain=>"powerapple.com",
11
+ # :example=>'http://www.powerapple.com',
12
+ # :match=>/powerapple/
13
+ end
14
+ end
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::ShowPage do
5
+
6
+ before :each do
7
+ @url = "" # set show page url here
8
+ @show_page = <%= class_name %>::ShowPage.new @url
9
+ end
10
+
11
+ it "will parse title correct" do
12
+ @show_page.title.should == ""
13
+ end
14
+
15
+ it "will parse body correct" do
16
+ @show_page.body.should == ""
17
+ end
18
+
19
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::ShowPageTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+ module <%= class_name %>
3
+ class Site < Spider::Site
4
+ set_name '<%= class_name %>'
5
+ set_id '<%= file_name %>'
6
+ end
7
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe <%= class_name %>::Site do
5
+
6
+ before :each do
7
+ end
8
+
9
+ # it "will correct set encoding" do
10
+ # <%= class_name %>::Site.encoding.should == "gbk"
11
+ # end
12
+
13
+ end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ require "test_helper"
3
+ class <%= class_name %>::SiteTest < ActiveSupport::TestCase
4
+
5
+ test "true" do
6
+ assert true
7
+ end
8
+
9
+ end
10
+
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+ require "rails/generators/active_record"
3
+ class SpiderMigrationGenerator < ActiveRecord::Generators::Base # generator that emits the spider tables migration
4
+ source_root File.expand_path('../templates', __FILE__) # templates directory sits next to this file
5
+
6
+ def copy_files # renders templates/migration.rb as a migration under db/migrate
7
+ migration_template 'migration.rb',"db/migrate/create_spider_pages_and_spider_page_labels"
8
+ end
9
+
10
+ end
11
+
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ class CreateSpiderPagesAndSpiderPageLabels < ActiveRecord::Migration # creates spider_pages and spider_page_labels
3
+
4
+ def self.up # create both tables plus their lookup indexes
5
+
6
+ create_table "spider_pages" do |t|
7
+ t.string "url"
8
+ t.boolean "done", :default => false
9
+ t.datetime "created_at"
10
+ t.datetime "updated_at"
11
+ t.integer "labels_count", :default => 0
12
+ t.string "labels_hash", :limit => 32 # NOTE(review): 32 chars would fit an MD5 hex digest - confirm
13
+ t.boolean "published", :default => false
14
+ t.boolean "fragment", :default => false
15
+ t.integer "content_length"
16
+ t.string "site"
17
+ end
18
+
19
+ add_index :spider_pages,:url
20
+ add_index :spider_pages,:site
21
+
22
+
23
+ create_table "spider_page_labels" do |t|
24
+ t.string "name"
25
+ t.integer "page_id" # presumably references spider_pages.id - TODO confirm
26
+ t.text "value"
27
+ t.datetime "created_at"
28
+ t.datetime "updated_at"
29
+ end
30
+
31
+ add_index :spider_page_labels,:name
32
+ add_index :spider_page_labels,:page_id
33
+
34
+
35
+ end
36
+
37
+
38
+ def self.down # drop the label table first, then the page table
39
+ drop_table :spider_page_labels
40
+ drop_table :spider_pages
41
+ end
42
+ end
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
+ module Spider::ActiveRecordMethods # mixin: extends the including class with ClassMethods and includes InstanceMethods
3
+
4
+ def self.included(base)
5
+ base.send(:extend,ClassMethods)
6
+ base.send(:include,InstanceMethods)
7
+ end
8
+
9
+ module ClassMethods
10
+
11
+ # Create a record from +url+: each Spider::Page class matching the url is tried in turn until one publishes a valid object.
12
+ # The receiver is passed to publish_to; per the original note it must implement receive_spider_page itself.
13
+ # Raises if the url is already stored (unless +force+), if no page class matches, or if every matching class fails.
14
+ + def create_from_url(url,force=false)
15
+ if !force && ::SpiderPage.find_by_url(url)
16
+ raise "此URL已经存在于系统中了。"
17
+ end
18
+ # Find the page classes able to handle title and body
19
+ pages = Spider::Page.find_all_by_url(url)
20
+
21
+ object = nil
22
+ raise "没有找到合适的规则" if pages.empty?
23
+
24
+ Spider::Site.logger.info "采集单个#{human_name} #{url},找到 适合的规则 #{pages.inspect}"
25
+ pages.each do |page|
26
+
27
+ page.logger.info "使用 #{page} 的规则来尝试采集"
28
+ begin
29
+ spider_page = page.new(url)
30
+ results = spider_page.publish_to(self)
31
+ object = results.first
32
+ rescue StandardError=>e # was `rescue Exception`, which also swallowed Interrupt/SystemExit
33
+ logger.error e.message
34
+ logger.error e.backtrace.join("\n")
35
+ object = nil
36
+ end
37
+ if object.try(:valid?)
38
+ page.logger.info "采集成功"
39
+ # Save the url
40
+ spider_page.save
41
+ break
42
+ else
43
+ page.logger.info "采集失败: #{object.try(:errors).try(:full_messages).try(:first)}"
44
+ object = nil
45
+ end
46
+ end
47
+
48
+ unless object
49
+ raise "采集器了找到了规则 #{pages.inspect}, 但是都失败了."
50
+ end
51
+ object
52
+
53
+ end
54
+
55
+ end
56
+
57
+ module InstanceMethods
58
+ end
59
+
60
+ end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ module Spider::Http # HTTParty-based HTTP helper with block-scoped option and proxy overrides
3
+ include HTTParty
4
+ headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
5
+ # debug_output Spider.logger
6
+
7
+ def self.with_options(new_options = {}) # merge new_options into default_options for the block, then restore the saved copy
8
+ @options_mutex ||= Mutex.new
9
+ @options_mutex.synchronize do
10
+ original_default_options = self.default_options.deep_dup # deep_dup is essential here; it is provided by ActiveSupport
11
+ self.default_options.merge! new_options
12
+ begin
13
+ yield
14
+ rescue StandardError => e # was `rescue Exception`, which also swallowed Interrupt/SystemExit; errors are logged, not re-raised
15
+ Spider.logger.error e.message
16
+ Spider.logger.error e.backtrace.join("\n")
17
+ ensure
18
+ original_default_options[:headers].delete 'referer' # drop any 'referer' header from the restored defaults
19
+ self.default_options = original_default_options
20
+ end
21
+ end
22
+ end
23
+
24
+ def self.with_proxy(ip,port,&block) # run the block with the given proxy set; returns the block's value
25
+ http_proxy ip,port
26
+ yield
27
+ ensure
28
+ clear_proxy # previously skipped when the block raised, leaving the proxy configured
29
+ end
30
+
31
+ def self.clear_proxy # remove the proxy address and port from the default options
32
+ Spider::Http.default_options.delete :http_proxyaddr
33
+ Spider::Http.default_options.delete :http_proxyport
34
+ end
35
+
36
+ =begin
37
+ def self.get(url,options = {})
38
+ open(url).read
39
+ end
40
+ =end
41
+
42
+ end
43
+
@@ -0,0 +1,132 @@
1
+ # encoding: utf-8
2
+ # 过滤html用的
3
+ module Spider::Page::Filter
4
+
5
+ def self.included(base)
6
+ base.class_eval do
7
+ send(:include,InstanceMethods)
8
+ send(:extend,ClassMethods)
9
+ alias_method_chain :read_attribute,:filter
10
+ class_attribute :attributes_filters
11
+ class_attribute :attributes_before_filters
12
+ class_attribute :attributes_after_filters
13
+ self.attributes_filters = {}
14
+ self.attributes_before_filters = {}
15
+ self.attributes_after_filters = {}
16
+ end
17
+ end
18
+
19
+ module ClassMethods
20
+
21
+ def default_filter(text)
22
+ sanitize(text)
23
+ end
24
+
25
+ def sanitizer
26
+ @sanitizer ||= HTML::WhiteListSanitizer.new
27
+ end
28
+
29
+ def sanitize(text,options={})
30
+ sanitizer.sanitize(text,options)
31
+ end
32
+
33
+ # class Sina::ShowPage < Sina::BasePage
34
+ # filter :title,:filters=>[:default]
35
+ # filter :title2,:filters=>:method_to_call_to_filter
36
+ # filter :title3,:filters=>lambda{|i| 'string to return '}
37
+ # end
38
+ def filter(*args,&block)
39
+ options = args.extract_options!
40
+ options[:filters] = block if block
41
+ options.assert_valid_keys :filters,:position
42
+ position = options[:position]
43
+ position = position.to_s + "_" if position
44
+ filter_attrs = send("attributes_#{position}filters")
45
+ logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
46
+ args.each do |attr_name|
47
+ filter_attrs[attr_name] ||= {}
48
+ # current_options[:filters] = options
49
+ filter_attrs[attr_name] = options #current_options
50
+ end
51
+ logger.debug "#{filter_attrs.inspect}"
52
+ end
53
+
54
+
55
+ def filters
56
+ {:before=>attributes_before_filters,:middle=>attributes_filters,:after=>attributes_after_filters}
57
+ end
58
+
59
+ end
60
+
61
+ module InstanceMethods
62
+
63
+
64
+ def read_attribute_with_filter(name,reload = false)
65
+ # 会经过过滤
66
+ unless !reload && @attributes[name]
67
+ value = read_attribute_without_filter(name)
68
+ value = filter_for_attribute(name,value)
69
+ @attributes[name] = value
70
+ end
71
+ @attributes[name]
72
+ end
73
+
74
+ def default_filter(text)
75
+ self.class.default_filter text
76
+ end
77
+
78
+ def sanitize(*args,&block)
79
+ self.class.sanitize(*args,&block)
80
+ end
81
+
82
+
83
+ def filter_for_attribute(name,text)
84
+ [self.attributes_before_filters,self.attributes_filters,self.attributes_after_filters].each do |filter_attrs|
85
+ filter_attrs.each_pair do |attr_name,filter|
86
+ logger.debug "#{attr_name} : #{filter.inspect}"
87
+ if attr_name == name
88
+ text = filter_text(text,filter[:filters])
89
+ logger.debug "#{name}(Filtered):#{text}"
90
+ end
91
+ end
92
+ end
93
+ text
94
+ end
95
+
96
+ # 执行过滤操作
97
+ def filter_text(text,filters=[])
98
+ [filters].flatten.each do |filter|
99
+ # 可以定义多个内建的过滤器
100
+ text = case filter
101
+ when :default
102
+ logger.debug "default filter"
103
+ default_filter(text)
104
+ when Symbol
105
+ logger.debug "symbol filter"
106
+ if respond_to?(filter)
107
+ send(filter,text)
108
+ else
109
+ text
110
+ end
111
+ when Proc
112
+ logger.debug "proc filter"
113
+ filter.bind(self).call(text)
114
+ else
115
+ logger.debug "unknow filter"
116
+ text
117
+ end
118
+ end
119
+ text
120
+ end
121
+
122
+
123
+
124
+ end
125
+
126
+
127
+ def filters
128
+ self.class.filters
129
+ end
130
+
131
+
132
+ end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ # Stores the useful information parsed out of a page
3
+ module Spider::Page::Label
4
+ extend ActiveSupport::Concern
5
+
6
+ module InstanceMethods
7
+ end
8
+
9
+ module ClassMethods
10
+ def label(name,options = {},&block) # defines attribute +name+ plus a reader that lazily runs +block+ on the instance
11
+ name = name.to_sym
12
+ define_attribute name
13
+ define_method(name) do |*args|
14
+ opts = args.extract_options!
15
+ instance_variable_name = "@__#{name}__"
16
+ if opts[:reload] # :reload => true discards the cached value
17
+ instance_variable_set(instance_variable_name,nil)
18
+ end
19
+ value = instance_variable_get(instance_variable_name)
20
+ unless value # nil/false results are not cached, so the block runs again on the next call
21
+ value = block.bind(self).call
22
+ instance_variable_set(instance_variable_name,value)
23
+ end
24
+ value
25
+ end
26
+ end
27
+ end
28
+ end