spider2 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +15 -0
- data/Rakefile +23 -0
- data/init.rb +3 -0
- data/install.rb +2 -0
- data/lib/generators/spider/spider_generator.rb +42 -0
- data/lib/generators/spider/templates/base_page.rb +6 -0
- data/lib/generators/spider/templates/base_page_spec.rb +13 -0
- data/lib/generators/spider/templates/index_page.rb +6 -0
- data/lib/generators/spider/templates/index_page_spec.rb +14 -0
- data/lib/generators/spider/templates/index_page_test.rb +10 -0
- data/lib/generators/spider/templates/list_page.rb +6 -0
- data/lib/generators/spider/templates/list_page_spec.rb +22 -0
- data/lib/generators/spider/templates/list_page_test.rb +10 -0
- data/lib/generators/spider/templates/show_page.rb +14 -0
- data/lib/generators/spider/templates/show_page_spec.rb +19 -0
- data/lib/generators/spider/templates/show_page_test.rb +10 -0
- data/lib/generators/spider/templates/site.rb +7 -0
- data/lib/generators/spider/templates/site_spec.rb +13 -0
- data/lib/generators/spider/templates/test.rb +10 -0
- data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
- data/lib/generators/spider_migration/templates/migration.rb +42 -0
- data/lib/spider/active_record_methods.rb +60 -0
- data/lib/spider/http.rb +43 -0
- data/lib/spider/page/filter.rb +132 -0
- data/lib/spider/page/label.rb +28 -0
- data/lib/spider/page/pagination.rb +142 -0
- data/lib/spider/page/proxy.rb +149 -0
- data/lib/spider/page/publish.rb +78 -0
- data/lib/spider/page/validation.rb +136 -0
- data/lib/spider/page.rb +759 -0
- data/lib/spider/site.rb +225 -0
- data/lib/spider/spider_page.rb +18 -0
- data/lib/spider/spider_page_label.rb +5 -0
- data/lib/spider/version.rb +3 -0
- data/lib/spider.rb +81 -0
- data/lib/tasks/spider_tasks.rake +86 -0
- data/test/spider_fu_test.rb +9 -0
- data/test/test_helper.rb +4 -0
- data/uninstall.rb +2 -0
- metadata +151 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 [name of plugin creator]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Rake tasks for the spider_fu plugin: the default task runs the unit tests,
# and :rdoc generates the API documentation.
require 'rake'
require 'rake/testtask'
# rake/rdoctask (Rake::RDocTask) is no longer shipped with Rake; RDoc provides
# the replacement task class as RDoc::Task in rdoc/task.
require 'rdoc/task'

desc 'Default: run unit tests.'
task :default => :test

desc 'Test the spider_fu plugin.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Generate documentation for the spider_fu plugin.'
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'SpiderFu'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/init.rb
ADDED
data/install.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "rails/generators/active_record"
|
3
|
+
# Generator that renders the spider templates (page classes plus the matching
# test and spec skeletons) into directories named after the generator argument.
class SpiderGenerator < ActiveRecord::Generators::Base

  source_root File.expand_path('../templates', __FILE__)

  # Renders every template listed below, in order, to
  # <root>/<file_name>/<target>.
  def generate_files
    # [destination root, template name, destination file name]
    plan = [
      ["spiders",      "site.rb",            "site.rb"],
      ["spiders",      "base_page.rb",       "base_page.rb"],
      ["spiders",      "show_page.rb",       "show_page.rb"],
      ["spiders",      "list_page.rb",       "list_page.rb"],
      ["spiders",      "index_page.rb",      "index_page.rb"],

      # test
      ["test/spiders", "test.rb",            "site_test.rb"],
      ["test/spiders", "show_page_test.rb",  "show_page_test.rb"],
      ["test/spiders", "list_page_test.rb",  "list_page_test.rb"],
      ["test/spiders", "index_page_test.rb", "index_page_test.rb"],

      # spec
      ["spec/spiders", "site_spec.rb",       "site_spec.rb"],
      ["spec/spiders", "show_page_spec.rb",  "show_page_spec.rb"],
      ["spec/spiders", "list_page_spec.rb",  "list_page_spec.rb"],
      ["spec/spiders", "index_page_spec.rb", "index_page_spec.rb"]
    ]

    plan.each do |root, source, target|
      template source, File.join("#{root}/#{file_name}", target)
    end
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::IndexPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@index_page = <%= class_name %>::Site.index_page
|
8
|
+
end
|
9
|
+
|
10
|
+
# it "will correct set encoding" do
|
11
|
+
# <%= class_name %>::IndexPage.encoding.should == "gbk"
|
12
|
+
# end
|
13
|
+
|
14
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::ListPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@url = "" # set list page url here
|
8
|
+
@list_page = <%= class_name %>::ListPage.new @url
|
9
|
+
end
|
10
|
+
|
11
|
+
# it "will correct set encoding" do
|
12
|
+
# <%= class_name %>::IndexPage.encoding.should == "gbk"
|
13
|
+
# end
|
14
|
+
|
15
|
+
it "will return all show pages" do
|
16
|
+
show_pages = @list_page.show_pages
|
17
|
+
show_pages.each do |page|
|
18
|
+
# page.url.shuold
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module <%= class_name %>
|
3
|
+
class ShowPage < BasePage
|
4
|
+
# Some commonly used operations:
|
5
|
+
# define_attributes :title
|
6
|
+
# filter :title,:filters=>:default
|
7
|
+
# before_crawl{ do some thing}
|
8
|
+
# before_fetch{ do login }
|
9
|
+
# set_proxy "host",port
|
10
|
+
# validate_url :domain=>"powerapple.com",
|
11
|
+
# :example=>'http://www.powerapple.com',
|
12
|
+
# :match=>/powerapple/
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe <%= class_name %>::ShowPage do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@url = "" # set show page url here
|
8
|
+
@show_page = <%= class_name %>::ShowPage.new @url
|
9
|
+
end
|
10
|
+
|
11
|
+
it "will parse title correct" do
|
12
|
+
@show_page.title.should == ""
|
13
|
+
end
|
14
|
+
|
15
|
+
it "will parse body correct" do
|
16
|
+
@show_page.body.should == ""
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "rails/generators/active_record"
|
3
|
+
# Generator that installs the migration for the spider tables.
class SpiderMigrationGenerator < ActiveRecord::Generators::Base
  source_root File.expand_path('templates', File.dirname(__FILE__))

  # Passes the bundled migration.rb template and its db/migrate destination
  # to migration_template.
  def copy_files
    destination = "db/migrate/create_spider_pages_and_spider_page_labels"
    migration_template 'migration.rb', destination
  end

end
|
11
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Creates the spider_pages and spider_page_labels tables together with their
# lookup indexes; the down migration drops both tables.
class CreateSpiderPagesAndSpiderPageLabels < ActiveRecord::Migration

  def self.up
    create_table :spider_pages do |t|
      t.string   :url
      t.boolean  :done,          :default => false
      t.datetime :created_at
      t.datetime :updated_at
      t.integer  :labels_count,  :default => 0
      t.string   :labels_hash,   :limit => 32
      t.boolean  :published,     :default => false
      t.boolean  :fragment,      :default => false
      t.integer  :content_length
      t.string   :site
    end

    [:url, :site].each { |column| add_index :spider_pages, column }

    create_table :spider_page_labels do |t|
      t.string   :name
      t.integer  :page_id
      t.text     :value
      t.datetime :created_at
      t.datetime :updated_at
    end

    [:name, :page_id].each { |column| add_index :spider_page_labels, column }
  end

  def self.down
    # Labels are dropped first, then the pages table.
    [:spider_page_labels, :spider_pages].each { |table| drop_table table }
  end

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Mixin for model classes that can be populated by crawling a single URL.
# Including it extends the class with ClassMethods and includes the (empty)
# InstanceMethods module.
module Spider::ActiveRecordMethods

  def self.included(base)
    base.send(:extend, ClassMethods)
    base.send(:include, InstanceMethods)
  end

  module ClassMethods

    # Creates a record from +url+ by trying every Spider::Page class that
    # matches the URL until one of them publishes a valid object.
    #
    # The including class has to implement receive_spider_page for this to
    # work correctly (per the original author's note).
    #
    # url   - the address to crawl.
    # force - when true, skip the "URL already stored" check.
    #
    # Returns the first valid published object.
    # Raises RuntimeError when the URL is already stored (and +force+ is
    # false), when no page class matches, or when every matching page class
    # fails to produce a valid object.
    def create_from_url(url, force = false)
      if !force && ::SpiderPage.find_by_url(url)
        raise "此URL已经存在于系统中了。"
      end

      # Find the page classes able to handle this URL.
      pages = Spider::Page.find_all_by_url(url)
      raise "没有找到合适的规则" if pages.empty?

      Spider::Site.logger.info "采集单个#{human_name} #{url},找到 适合的规则 #{pages.inspect}"

      object = nil
      pages.each do |page|
        page.logger.info "使用 #{page} 的规则来尝试采集"

        spider_page = nil
        begin
          spider_page = page.new(url)
          object = spider_page.publish_to(self).first
        rescue StandardError => e
          # Only ordinary errors are treated as "this rule failed"; signals
          # and SystemExit (Interrupt, exit, ...) propagate to the caller.
          logger.error e.message
          logger.error e.backtrace.join("\n")
          object = nil
        end

        if object.try(:valid?)
          page.logger.info "采集成功"
          # Persist the crawled page (and with it the URL).
          spider_page.save
          break
        end

        page.logger.info "采集失败: #{object.try(:errors).try(:full_messages).try(:first)}"
        object = nil
      end

      raise "采集器了找到了规则 #{pages.inspect}, 但是都失败了." unless object
      object
    end

  end

  module InstanceMethods
  end

end
|
data/lib/spider/http.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# HTTParty-based HTTP helper for the spiders. Sets a browser-like User-Agent
# and offers helpers that change the shared default options (extra options,
# proxy) only for the duration of a block.
module Spider::Http
  include HTTParty
  headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
  # debug_output Spider.logger

  # Runs the block with +new_options+ merged into the default options and
  # restores the previous defaults afterwards, even when the block fails.
  # Calls are serialized through a mutex.
  #
  # Ordinary errors (StandardError) raised by the block are logged and
  # swallowed; the method then returns the logger call's result instead of
  # the block's value. Signals and SystemExit are not swallowed.
  def self.with_options(new_options = {})
    @options_mutex ||= Mutex.new
    @options_mutex.synchronize do
      # deep_dup (provided by ActiveSupport) is essential here according to
      # the original author — presumably so nested hashes such as :headers
      # are not shared with the live options.
      original_default_options = default_options.deep_dup
      default_options.merge! new_options
      begin
        yield
      rescue StandardError => e
        Spider.logger.error e.message
        Spider.logger.error e.backtrace.join("\n")
      ensure
        # The restored defaults never carry a 'referer' header.
        original_default_options[:headers].delete 'referer'
        self.default_options = original_default_options
      end
    end
  end

  # Runs the block with the given HTTP proxy configured and returns the
  # block's value. The proxy is always cleared afterwards, including when the
  # block raises, so a failed request cannot leave it set on the shared
  # default options.
  def self.with_proxy(ip, port, &block)
    http_proxy ip, port
    yield
  ensure
    clear_proxy
  end

  # Removes the proxy address and port from the default options.
  def self.clear_proxy
    Spider::Http.default_options.delete :http_proxyaddr
    Spider::Http.default_options.delete :http_proxyport
  end

  # Disabled alternative implementation kept from the original source:
  #
  #   def self.get(url,options = {})
  #     open(url).read
  #   end

end
|
43
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Used for filtering HTML
|
3
|
+
# Attribute filtering for page classes. Including this module wraps
# read_attribute so values pass through the filters registered with the
# +filter+ class macro, in three stages: before, middle (default) and after.
module Spider::Page::Filter

  def self.included(base)
    base.class_eval do
      # InstanceMethods must be included before alias_method_chain, which
      # needs read_attribute_with_filter to exist.
      send(:include, InstanceMethods)
      send(:extend, ClassMethods)
      alias_method_chain :read_attribute, :filter
      class_attribute :attributes_filters
      class_attribute :attributes_before_filters
      class_attribute :attributes_after_filters
      self.attributes_filters = {}
      self.attributes_before_filters = {}
      self.attributes_after_filters = {}
    end
  end

  module ClassMethods

    # The built-in :default filter: sanitizes the text.
    def default_filter(text)
      sanitize(text)
    end

    # Memoized HTML::WhiteListSanitizer instance for this class.
    def sanitizer
      @sanitizer ||= HTML::WhiteListSanitizer.new
    end

    def sanitize(text, options = {})
      sanitizer.sanitize(text, options)
    end

    # Registers filters for one or more attributes.
    #
    #   class Sina::ShowPage < Sina::BasePage
    #     filter :title,:filters=>[:default]
    #     filter :title2,:filters=>:method_to_call_to_filter
    #     filter :title3,:filters=>lambda{|i| 'string to return '}
    #   end
    #
    # args  - attribute names, optionally followed by an options hash with
    #         :filters (symbol, proc, or array of them) and :position
    #         (:before or :after; omitted means the middle stage).
    # block - when given, it is used as the :filters value.
    #
    # The registry is replaced through the class_attribute writer instead of
    # being mutated in place, so a filter declared on one class is inherited
    # by its subclasses but no longer leaks into its parent or sibling
    # classes that share the inherited hash.
    def filter(*args, &block)
      options = args.extract_options!
      options[:filters] = block if block
      options.assert_valid_keys :filters, :position
      position = options[:position]
      position = position.to_s + "_" if position
      accessor = "attributes_#{position}filters"
      logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
      filter_attrs = send(accessor).dup
      args.each { |attr_name| filter_attrs[attr_name] = options }
      send("#{accessor}=", filter_attrs)
      logger.debug "#{filter_attrs.inspect}"
    end

    # All registered filters, keyed by stage.
    def filters
      {:before=>attributes_before_filters,:middle=>attributes_filters,:after=>attributes_after_filters}
    end

  end

  module InstanceMethods

    # Filtered replacement for read_attribute. The filtered value is stored in
    # @attributes and reused on later reads; pass reload = true to read and
    # filter again. A nil/false stored value is always recomputed.
    def read_attribute_with_filter(name, reload = false)
      if reload || !@attributes[name]
        raw = read_attribute_without_filter(name)
        @attributes[name] = filter_for_attribute(name, raw)
      end
      @attributes[name]
    end

    def default_filter(text)
      self.class.default_filter text
    end

    def sanitize(*args, &block)
      self.class.sanitize(*args, &block)
    end

    # Applies the filters registered for +name+ to +text+, stage by stage
    # (before, middle, after), and returns the result.
    def filter_for_attribute(name, text)
      [self.attributes_before_filters,self.attributes_filters,self.attributes_after_filters].each do |filter_attrs|
        filter_attrs.each_pair do |attr_name, filter|
          logger.debug "#{attr_name} : #{filter.inspect}"
          if attr_name == name
            text = filter_text(text, filter[:filters])
            logger.debug "#{name}(Filtered):#{text}"
          end
        end
      end
      text
    end

    # Performs the filtering: runs +text+ through each entry of +filters+
    # (a single filter or an array of them) and returns the final text.
    def filter_text(text, filters = [])
      [filters].flatten.each do |filter|
        # Several built-in filters can be defined here.
        text = case filter
               when :default
                 logger.debug "default filter"
                 default_filter(text)
               when Symbol
                 logger.debug "symbol filter"
                 # Unknown method names leave the text unchanged.
                 if respond_to?(filter)
                   send(filter, text)
                 else
                   text
                 end
               when Proc
                 logger.debug "proc filter"
                 # Proc#bind is not core Ruby (presumably the ActiveSupport
                 # extension); it runs the proc with this page as self.
                 filter.bind(self).call(text)
               else
                 logger.debug "unknow filter"
                 text
               end
      end
      text
    end

  end

  # Instance-level shortcut to the class's filter registry.
  def filters
    self.class.filters
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Used to store the useful information parsed out of a page
|
3
|
+
# Adds the +label+ class macro, which declares an attribute whose value is
# computed by a block and cached per instance.
module Spider::Page::Label
  extend ActiveSupport::Concern

  module InstanceMethods
  end

  module ClassMethods
    # Declares a label named +name+.
    #
    # Calls define_attribute for the name and defines a reader method that
    # evaluates +block+ (bound to the instance) the first time it is called
    # and stores the result in an instance variable. A nil/false result is
    # not cached, so the block runs again on the next call. Passing
    # :reload => true to the reader discards the stored value first.
    #
    # options is accepted but not used by this method.
    def label(name, options = {}, &block)
      name = name.to_sym
      define_attribute name
      cache_ivar = "@__#{name}__"

      define_method(name) do |*args|
        opts = args.extract_options!
        instance_variable_set(cache_ivar, nil) if opts[:reload]
        instance_variable_get(cache_ivar) ||
          instance_variable_set(cache_ivar, block.bind(self).call)
      end
    end
  end
end
|