spider2 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +15 -0
- data/Rakefile +23 -0
- data/init.rb +3 -0
- data/install.rb +2 -0
- data/lib/generators/spider/spider_generator.rb +42 -0
- data/lib/generators/spider/templates/base_page.rb +6 -0
- data/lib/generators/spider/templates/base_page_spec.rb +13 -0
- data/lib/generators/spider/templates/index_page.rb +6 -0
- data/lib/generators/spider/templates/index_page_spec.rb +14 -0
- data/lib/generators/spider/templates/index_page_test.rb +10 -0
- data/lib/generators/spider/templates/list_page.rb +6 -0
- data/lib/generators/spider/templates/list_page_spec.rb +22 -0
- data/lib/generators/spider/templates/list_page_test.rb +10 -0
- data/lib/generators/spider/templates/show_page.rb +14 -0
- data/lib/generators/spider/templates/show_page_spec.rb +19 -0
- data/lib/generators/spider/templates/show_page_test.rb +10 -0
- data/lib/generators/spider/templates/site.rb +7 -0
- data/lib/generators/spider/templates/site_spec.rb +13 -0
- data/lib/generators/spider/templates/test.rb +10 -0
- data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
- data/lib/generators/spider_migration/templates/migration.rb +42 -0
- data/lib/spider/active_record_methods.rb +60 -0
- data/lib/spider/http.rb +43 -0
- data/lib/spider/page/filter.rb +132 -0
- data/lib/spider/page/label.rb +28 -0
- data/lib/spider/page/pagination.rb +142 -0
- data/lib/spider/page/proxy.rb +149 -0
- data/lib/spider/page/publish.rb +78 -0
- data/lib/spider/page/validation.rb +136 -0
- data/lib/spider/page.rb +759 -0
- data/lib/spider/site.rb +225 -0
- data/lib/spider/spider_page.rb +18 -0
- data/lib/spider/spider_page_label.rb +5 -0
- data/lib/spider/version.rb +3 -0
- data/lib/spider.rb +81 -0
- data/lib/tasks/spider_tasks.rake +86 -0
- data/test/spider_fu_test.rb +9 -0
- data/test/test_helper.rb +4 -0
- data/uninstall.rb +2 -0
- metadata +151 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 [name of plugin creator]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Rake tasks for the spider_fu plugin: a unit-test task (also the default
# task) and an RDoc generation task.
require 'rake'
require 'rake/testtask'
require 'rake/rdoctask'

desc 'Default: run unit tests.'
task(:default => :test)

desc 'Test the spider_fu plugin.'
Rake::TestTask.new(:test) do |test_task|
  # Both the library and the test directory go on the load path.
  test_task.libs.push('lib', 'test')
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = true
end

desc 'Generate documentation for the spider_fu plugin.'
Rake::RDocTask.new(:rdoc) do |doc_task|
  doc_task.rdoc_dir = 'rdoc'
  doc_task.title = 'SpiderFu'
  doc_task.options.push('--line-numbers', '--inline-source')
  doc_task.rdoc_files.include('README', 'lib/**/*.rb')
end
|
data/init.rb
ADDED
data/install.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "rails/generators/active_record"
|
3
|
+
class SpiderGenerator < ActiveRecord::Generators::Base
|
4
|
+
|
5
|
+
source_root File.expand_path('../templates', __FILE__)
|
6
|
+
|
7
|
+
# Renders every spider template for the generator's +file_name+.
#
# Page classes go to spiders/<file_name>/, the test skeletons to
# test/spiders/<file_name>/ and the spec skeletons to
# spec/spiders/<file_name>/. Templates are rendered in that order.
#
# Returns whatever the final +template+ call returns.
def generate_files
  spider_dir = "spiders/#{file_name}"
  test_dir   = "test/spiders/#{file_name}"
  spec_dir   = "spec/spiders/#{file_name}"

  # Each entry is [template source, destination directory, destination file].
  renders = []

  %w[site base_page show_page list_page index_page].each do |base|
    renders << ["#{base}.rb", spider_dir, "#{base}.rb"]
  end

  # test: the generic test.rb template is written out as site_test.rb.
  renders << ["test.rb", test_dir, "site_test.rb"]
  %w[show_page list_page index_page].each do |base|
    renders << ["#{base}_test.rb", test_dir, "#{base}_test.rb"]
  end

  # spec
  %w[site show_page list_page index_page].each do |base|
    renders << ["#{base}_spec.rb", spec_dir, "#{base}_spec.rb"]
  end

  renders.map { |source, dir, target| template source, File.join(dir, target) }.last
end
|
41
|
+
|
42
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::IndexPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@index_page = <%= class_name %>::Site.index_page
|
8
|
+
end
|
9
|
+
|
10
|
+
# it "will correct set encoding" do
|
11
|
+
# <%= class_name %>::IndexPage.encoding.should == "gbk"
|
12
|
+
# end
|
13
|
+
|
14
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::ListPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@url = "" # set list page url here
|
8
|
+
@list_page = <%= class_name %>::ListPage.new @url
|
9
|
+
end
|
10
|
+
|
11
|
+
# it "will correct set encoding" do
|
12
|
+
# <%= class_name %>::ListPage.encoding.should == "gbk"
|
13
|
+
# end
|
14
|
+
|
15
|
+
it "will return all show pages" do
|
16
|
+
show_pages = @list_page.show_pages
|
17
|
+
show_pages.each do |page|
|
18
|
+
# page.url.should
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module <%= class_name %>
|
3
|
+
class ShowPage < BasePage
|
4
|
+
# 一些常用操作:
|
5
|
+
# define_attributes :title
|
6
|
+
# filter :title,:filters=>:default
|
7
|
+
# before_crawl{ do some thing}
|
8
|
+
# before_fetch{ do login }
|
9
|
+
# set_proxy "host",port
|
10
|
+
# validate_url :domain=>"powerapple.com",
|
11
|
+
# :example=>'http://www.powerapple.com',
|
12
|
+
# :match=>/powerapple/
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::ShowPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@url = "" # set show page url here
|
8
|
+
@show_page = <%= class_name %>::ShowPage.new @url
|
9
|
+
end
|
10
|
+
|
11
|
+
it "will parse title correct" do
|
12
|
+
@show_page.title.should == ""
|
13
|
+
end
|
14
|
+
|
15
|
+
it "will parse body correct" do
|
16
|
+
@show_page.body.should == ""
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "rails/generators/active_record"
|
3
|
+
class SpiderMigrationGenerator < ActiveRecord::Generators::Base
|
4
|
+
source_root File.expand_path('../templates', __FILE__)
|
5
|
+
|
6
|
+
# Renders the bundled migration.rb template into the application's
# db/migrate directory via +migration_template+.
# NOTE(review): the numeric prefix and file extension are presumably added by
# migration_template itself — confirm against the Rails version in use.
def copy_files
  destination = File.join("db", "migrate", "create_spider_pages_and_spider_page_labels")
  migration_template("migration.rb", destination)
end
|
9
|
+
|
10
|
+
end
|
11
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
class CreateSpiderPagesAndSpiderPageLabels < ActiveRecord::Migration
|
3
|
+
|
4
|
+
# Creates the spider_pages and spider_page_labels tables and indexes the
# columns used for lookups (url/site on pages, name/page_id on labels).
def self.up
  create_table("spider_pages") do |pages|
    pages.string("url")
    pages.boolean("done", :default => false)
    pages.datetime("created_at")
    pages.datetime("updated_at")
    pages.integer("labels_count", :default => 0)
    pages.string("labels_hash", :limit => 32)
    pages.boolean("published", :default => false)
    pages.boolean("fragment", :default => false)
    pages.integer("content_length")
    pages.string("site")
  end

  add_index(:spider_pages, :url)
  add_index(:spider_pages, :site)

  create_table("spider_page_labels") do |labels|
    labels.string("name")
    labels.integer("page_id")
    labels.text("value")
    labels.datetime("created_at")
    labels.datetime("updated_at")
  end

  add_index(:spider_page_labels, :name)
  add_index(:spider_page_labels, :page_id)
end
|
36
|
+
|
37
|
+
|
38
|
+
# Drops both spider tables, the labels table first.
def self.down
  drop_table(:spider_page_labels)
  drop_table(:spider_pages)
end
|
42
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spider::ActiveRecordMethods
|
3
|
+
|
4
|
+
# Mixin hook: extends the including class with ClassMethods and includes
# InstanceMethods into it.
def self.included(base)
  base.extend(ClassMethods)
  base.send(:include, InstanceMethods)
end
|
8
|
+
|
9
|
+
module ClassMethods
|
10
|
+
|
11
|
+
# Creates a record of this model by crawling +url+.
#
# Per the original author's note, the receiving class must implement
# +receive_spider_page+ for this to work.
#
# Every page class returned by Spider::Page.find_all_by_url is tried in
# order. The first one whose published object is valid wins: its spider page
# is saved and that object is returned.
#
# url   - the address to crawl.
# force - when true, skip the check for an existing ::SpiderPage with this
#         URL (default: false).
#
# Returns the first valid object produced by +publish_to+.
# Raises RuntimeError when the URL is already known (and +force+ is false),
# when no page class matches the URL, or when every matching class fails.
def create_from_url(url, force = false)
  if !force && ::SpiderPage.find_by_url(url)
    raise "此URL已经存在于系统中了。"
  end

  # Find the page classes able to handle this URL.
  pages = Spider::Page.find_all_by_url(url)
  raise "没有找到合适的规则" if pages.empty?

  Spider::Site.logger.info "采集单个#{human_name} #{url},找到 适合的规则 #{pages.inspect}"

  object = nil
  pages.each do |page|
    page.logger.info "使用 #{page} 的规则来尝试采集"
    spider_page = nil
    begin
      spider_page = page.new(url)
      results = spider_page.publish_to(self)
      object = results.first
    rescue StandardError => e
      # One failing rule set must not abort the attempt: log it and move on
      # to the next candidate. StandardError (rather than Exception) lets
      # interrupts and exit requests propagate instead of being swallowed.
      logger.error e.message
      logger.error e.backtrace.join("\n")
      object = nil
    end

    if object.try(:valid?)
      page.logger.info "采集成功"
      # Persist the crawled URL.
      spider_page.save
      break
    else
      page.logger.info "采集失败: #{object.try(:errors).try(:full_messages).try(:first)}"
      object = nil
    end
  end

  unless object
    raise "采集器了找到了规则 #{pages.inspect}, 但是都失败了."
  end
  object
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
module InstanceMethods
|
58
|
+
end
|
59
|
+
|
60
|
+
end
|
data/lib/spider/http.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spider::Http
|
3
|
+
include HTTParty
|
4
|
+
headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
|
5
|
+
# debug_output Spider.logger
|
6
|
+
|
7
|
+
# Runs the given block with +new_options+ merged into the default options,
# then restores the previous defaults.
#
# Calls are serialized through a mutex. A StandardError raised by the block
# is logged to Spider.logger and swallowed (best effort); anything outside
# StandardError (e.g. Interrupt) propagates. The defaults are restored in
# both cases.
#
# new_options - Hash merged over the current defaults while the block runs.
#
# Returns the block's value, or the result of the last logger call when the
# block raised a StandardError.
def self.with_options(new_options = {})
  @options_mutex ||= Mutex.new
  @options_mutex.synchronize do
    # deep_dup (from ActiveSupport) is essential according to the original
    # author — presumably so in-place changes to nested values such as
    # :headers made during the block do not reach the saved copy.
    saved_options = default_options.deep_dup
    default_options.merge! new_options
    begin
      yield
    rescue StandardError => e
      Spider.logger.error e.message
      Spider.logger.error e.backtrace.join("\n")
    ensure
      # Drop any referer from the restored defaults. Guard against defaults
      # without :headers so the cleanup itself can never raise.
      saved_headers = saved_options[:headers]
      saved_headers.delete 'referer' if saved_headers
      self.default_options = saved_options
    end
  end
end
|
23
|
+
|
24
|
+
# Runs the given block with the HTTP proxy set to +ip+:+port+ and removes the
# proxy settings afterwards, even when the block raises.
#
# ip   - proxy host passed to +http_proxy+.
# port - proxy port passed to +http_proxy+.
#
# Returns the block's value.
def self.with_proxy(ip, port, &block)
  http_proxy ip, port
  begin
    yield
  ensure
    # Without ensure a failing request would leave the proxy configured for
    # every later request.
    clear_proxy
  end
end
|
30
|
+
|
31
|
+
# Removes the proxy address and port from Spider::Http's default options.
#
# Returns the removed port value (nil when none was set).
def self.clear_proxy
  defaults = Spider::Http.default_options
  defaults.delete(:http_proxyaddr)
  defaults.delete(:http_proxyport)
end
|
35
|
+
|
36
|
+
=begin
|
37
|
+
def self.get(url,options = {})
|
38
|
+
open(url).read
|
39
|
+
end
|
40
|
+
=end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# 过滤html用的
|
3
|
+
module Spider::Page::Filter
|
4
|
+
|
5
|
+
# Mixin hook: wires the filter support into the including class.
#
# Includes InstanceMethods, extends ClassMethods, chains +read_attribute+
# through the filter-aware version and declares the three class attributes
# holding the before / normal / after filter tables, each starting empty.
def self.included(base)
  base.class_eval do
    # InstanceMethods must be mixed in before the alias chain is set up,
    # because it provides read_attribute_with_filter.
    include InstanceMethods
    extend ClassMethods
    alias_method_chain :read_attribute, :filter

    class_attribute :attributes_filters,
                    :attributes_before_filters,
                    :attributes_after_filters
    self.attributes_filters = {}
    self.attributes_before_filters = {}
    self.attributes_after_filters = {}
  end
end
|
18
|
+
|
19
|
+
# Class-level API for declaring and inspecting attribute filters.
module ClassMethods

  # The built-in :default filter: sanitizes +text+.
  def default_filter(text)
    sanitize(text)
  end

  # Memoized HTML::WhiteListSanitizer used by +sanitize+.
  def sanitizer
    @sanitizer ||= HTML::WhiteListSanitizer.new
  end

  # Sanitizes +text+ with the white-list sanitizer, forwarding +options+.
  def sanitize(text, options = {})
    sanitizer.sanitize(text, options)
  end

  # Declares a filter for one or more attributes.
  #
  #   class Sina::ShowPage < Sina::BasePage
  #     filter :title,  :filters => [:default]
  #     filter :title2, :filters => :method_to_call_to_filter
  #     filter :title3, :filters => lambda { |i| 'string to return ' }
  #   end
  #
  # args  - attribute names, optionally followed by an options Hash:
  #         :filters  - a filter or Array of filters (:default, a method
  #                     name, or a Proc).
  #         :position - optional prefix selecting the filter table; the
  #                     tables declared by this module are "before" and
  #                     "after". Without it the normal table is used.
  # block - when given, used as :filters.
  #
  # The table is replaced through its writer instead of being mutated in
  # place, so a declaration affects only this class (and classes inheriting
  # from it) and does not leak into every class sharing the same hash.
  #
  # Raises ArgumentError (from assert_valid_keys) on unknown option keys.
  def filter(*args, &block)
    options = args.extract_options!
    options[:filters] = block if block
    options.assert_valid_keys :filters, :position
    position = options[:position]
    position = position.to_s + "_" if position
    table_name = "attributes_#{position}filters"
    logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"

    # Copy-on-write: the table is a class_attribute, and mutating the hash
    # in place would also change it for every class still sharing it.
    filter_attrs = send(table_name).dup
    args.each do |attr_name|
      filter_attrs[attr_name] = options
    end
    send("#{table_name}=", filter_attrs)

    logger.debug "#{filter_attrs.inspect}"
  end

  # Returns the three filter tables keyed by :before, :middle and :after.
  def filters
    {:before=>attributes_before_filters,:middle=>attributes_filters,:after=>attributes_after_filters}
  end

end
|
60
|
+
|
61
|
+
# Instance-level half of the filter support: filtered attribute reads and
# the machinery that applies the declared filters.
module InstanceMethods

  # Reads attribute +name+ with its filters applied, caching the filtered
  # value in @attributes.
  #
  # name   - attribute name.
  # reload - when true, re-read and re-filter even if a value is cached.
  #
  # A cached value that is nil or false is treated as missing and recomputed.
  def read_attribute_with_filter(name, reload = false)
    if reload || !@attributes[name]
      raw = read_attribute_without_filter(name)
      @attributes[name] = filter_for_attribute(name, raw)
    end
    @attributes[name]
  end

  # Delegates to the class-level default filter.
  def default_filter(text)
    self.class.default_filter(text)
  end

  # Delegates to the class-level sanitize.
  def sanitize(*args, &block)
    self.class.sanitize(*args, &block)
  end

  # Runs +text+ through every filter declared for attribute +name+, taking
  # the before table first, then the normal table, then the after table.
  #
  # Returns the filtered text.
  def filter_for_attribute(name, text)
    tables = [self.attributes_before_filters, self.attributes_filters, self.attributes_after_filters]
    tables.each do |table|
      table.each_pair do |attr_name, settings|
        logger.debug "#{attr_name} : #{settings.inspect}"
        next unless attr_name == name

        text = filter_text(text, settings[:filters])
        logger.debug "#{name}(Filtered):#{text}"
      end
    end
    text
  end

  # Applies the filtering itself.
  #
  # filters - a single filter or an Array of filters, applied in order:
  #           :default     -> default_filter
  #           other Symbol -> the method of that name, when the object
  #                           responds to it (otherwise the text is kept)
  #           Proc         -> bound to this object and called with the text
  #           anything else leaves the text unchanged.
  #
  # Returns the text after all filters.
  def filter_text(text, filters = [])
    [filters].flatten.inject(text) do |current, filter|
      # Several built-in filters can be defined here.
      if :default == filter
        logger.debug "default filter"
        default_filter(current)
      elsif filter.is_a?(Symbol)
        logger.debug "symbol filter"
        respond_to?(filter) ? send(filter, current) : current
      elsif filter.is_a?(Proc)
        logger.debug "proc filter"
        filter.bind(self).call(current)
      else
        logger.debug "unknow filter"
        current
      end
    end
  end

end
|
125
|
+
|
126
|
+
|
127
|
+
# Instance-level shortcut returning the filter tables of this object's class.
def filters
  owner = self.class
  owner.filters
end
|
130
|
+
|
131
|
+
|
132
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# 用来存储页面中分析到的有用信息
|
3
|
+
module Spider::Page::Label
|
4
|
+
extend ActiveSupport::Concern
|
5
|
+
|
6
|
+
module InstanceMethods
|
7
|
+
end
|
8
|
+
|
9
|
+
module ClassMethods
|
10
|
+
# Declares a lazily computed, memoized label on the page class.
#
# Registers +name+ via +define_attribute+ and defines an instance method of
# the same name. That method evaluates +block+ bound to the instance and
# caches the result in the instance variable @__<name>__.
#
# name    - label name (converted to a Symbol).
# options - accepted but not used by this method.
# block   - computes the label's value; run with the instance as self.
#
# The generated method takes an optional options Hash; :reload => true clears
# the cached value first. A cached value that is nil or false is not treated
# as cached, so the block runs again on the next call.
def label(name, options = {}, &block)
  name = name.to_sym
  define_attribute name
  cache_ivar = "@__#{name}__"

  define_method(name) do |*args|
    instance_variable_set(cache_ivar, nil) if args.extract_options![:reload]

    cached = instance_variable_get(cache_ivar)
    cached || instance_variable_set(cache_ivar, block.bind(self).call)
  end
end
|
27
|
+
end
|
28
|
+
end
|