spider2 0.0.1 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/spider.rb +19 -22
- data/lib/spider/engine.rb +31 -0
- data/lib/spider/http.rb +5 -2
- data/lib/spider/httparty_patch.rb +37 -0
- data/lib/spider/page.rb +36 -26
- data/lib/spider/page/cache.rb +52 -0
- data/lib/spider/page/filter.rb +1 -0
- data/lib/spider/page/label.rb +0 -3
- data/lib/spider/page/proxy.rb +154 -64
- data/lib/spider/page/publish.rb +42 -44
- data/lib/spider/version.rb +1 -1
- data/lib/spider2.rb +1 -0
- data/lib/tasks/spider_tasks.rake +81 -16
- metadata +14 -53
data/lib/spider.rb
CHANGED
@@ -1,5 +1,21 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#require "hpricot"
|
3
|
+
#
|
4
|
+
|
5
|
+
# copy from active support
|
6
|
+
require "httparty"
|
7
|
+
# Backport of Hash#deep_dup (copied from ActiveSupport); it is only installed
# when Hash does not already respond to deep_dup.
Hash.class_eval do
  # Returns a copy of the receiver in which nested Hash values are duplicated
  # recursively. Values that are not Hashes (arrays, strings, ...) are NOT
  # copied: the duplicate shares them with the receiver.
  def deep_dup
    # Iterate over the receiver and write into the copy, so the hash being
    # walked is never the one being modified.
    each_with_object(dup) do |(key, value), duplicate|
      duplicate[key] = value.is_a?(Hash) ? value.deep_dup : value
    end
  end
end unless Hash.new.respond_to? :deep_dup
|
17
|
+
|
18
|
+
|
3
19
|
module Spider
|
4
20
|
def self.logger
|
5
21
|
unless @logger
|
@@ -52,30 +68,11 @@ Spider::Page.send(:include,Spider::Page::Proxy)
|
|
52
68
|
require "spider/page/label"
|
53
69
|
Spider::Page.send(:include,Spider::Page::Label)
|
54
70
|
|
71
|
+
require "spider/page/cache"
|
72
|
+
Spider::Page.send(:include,Spider::Page::Cache)
|
55
73
|
|
56
|
-
spiders_dir = File.join(Rails.root,"spiders")
|
57
|
-
$:.push(spiders_dir)
|
58
|
-
|
59
|
-
# define constants
|
60
|
-
Dir[File.join(spiders_dir,"*")].each do |dir|
|
61
|
-
dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
|
62
|
-
Object.const_set(dir_name.classify,Module.new)
|
63
|
-
end
|
64
|
-
|
65
|
-
# 先包含初始化文件
|
66
|
-
init_file = File.join(spiders_dir,"init.rb")
|
67
|
-
require init_file if File.exists? init_file
|
68
|
-
|
69
|
-
file_patten = File.join(spiders_dir,"**","*.rb")
|
70
|
-
files = Dir[file_patten]
|
71
|
-
|
72
|
-
site_files = files.find_all{|i| i =~ /site\.rb/}
|
73
|
-
site_files.each{|i| require i}
|
74
|
-
|
75
|
-
base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
|
76
|
-
base_page_files.each{|i| require i}
|
77
74
|
|
78
|
-
files.each{|i| require i }
|
79
75
|
|
80
76
|
# 包含 active record methods
|
81
77
|
require "spider/active_record_methods"
|
78
|
+
require "spider/engine"
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Spider
  # Rails engine whose 'spider' initializer loads the spider definitions that
  # live under <Rails.root>/spiders.
  class Engine < Rails::Engine
    initializer 'spider' do
      spiders_dir = File.join(Rails.root,"spiders")
      $:.push(spiders_dir)

      # Define one top-level namespace module per sub-directory of spiders/.
      # Plain files (init.rb, ...) are skipped: classifying a file name would
      # define a meaningless constant such as ::Rb.
      Dir[File.join(spiders_dir,"*")].each do |dir|
        next unless File.directory?(dir)
        Object.const_set(File.basename(dir).classify,Module.new)
      end

      # Load the optional initialization file first.
      init_file = File.join(spiders_dir,"init.rb")
      require init_file if File.exist? init_file

      files = Dir[File.join(spiders_dir,"**","*.rb")]

      # Load order: site files, then base page files, then everything else.
      # `require` ignores files that were already loaded, so the final pass
      # only picks up the remaining ones.
      files.find_all{|i| i =~ /site\.rb/}.each{|i| require i}
      files.find_all{|i| i =~ /base_page\.rb/}.each{|i| require i}
      files.each{|i| require i }
    end
  end
end
|
data/lib/spider/http.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require "spider/httparty_patch"
|
2
3
|
module Spider::Http
|
3
4
|
include HTTParty
|
4
5
|
headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
|
@@ -21,8 +22,8 @@ module Spider::Http
|
|
21
22
|
end
|
22
23
|
end
|
23
24
|
|
24
|
-
def self.with_proxy(
|
25
|
-
http_proxy
|
25
|
+
def self.with_proxy(proxy,&block)
|
26
|
+
http_proxy proxy.host,proxy.port,proxy.user,proxy.password
|
26
27
|
result = yield
|
27
28
|
clear_proxy
|
28
29
|
result
|
@@ -31,6 +32,8 @@ module Spider::Http
|
|
31
32
|
def self.clear_proxy
|
32
33
|
Spider::Http.default_options.delete :http_proxyaddr
|
33
34
|
Spider::Http.default_options.delete :http_proxyport
|
35
|
+
Spider::Http.default_options.delete :http_proxyuser
|
36
|
+
Spider::Http.default_options.delete :http_proxypassword
|
34
37
|
end
|
35
38
|
|
36
39
|
=begin
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Patch for HTTParty that adds proxy credentials (user / password) to the
# proxy settings and passes them on to Net::HTTP.
module HTTParty
  module ClassMethods
    # Stores the proxy settings in default_options.
    #
    # addr     - proxy host (nil for none)
    # port     - proxy port
    # user     - proxy user name (optional)
    # password - proxy password (optional)
    def http_proxy(addr=nil, port = nil, user=nil, password=nil)
      default_options[:http_proxyaddr] = addr
      default_options[:http_proxyport] = port
      default_options[:http_proxyuser] = user
      default_options[:http_proxypassword] = password
    end
  end

  class Request
    # Builds the Net::HTTP object used for this request, configured with the
    # proxy address, port and credentials found in options.
    def http
      http = Net::HTTP.new(
        uri.host,
        uri.port,
        options[:http_proxyaddr],
        options[:http_proxyport],
        options[:http_proxyuser],
        options[:http_proxypassword]
      )
      http.use_ssl = ssl_implied?

      if options[:timeout] && (options[:timeout].is_a?(Integer) || options[:timeout].is_a?(Float))
        http.open_timeout = options[:timeout]
        # Bound reads as well; otherwise a proxy that accepts the connection
        # and then stalls is not limited by the :timeout option at all.
        http.read_timeout = options[:timeout]
      end

      attach_ssl_certificates(http)

      if options[:debug_output]
        http.set_debug_output(options[:debug_output])
      end

      http
    end
  end
end
|
37
|
+
|
data/lib/spider/page.rb
CHANGED
@@ -4,6 +4,8 @@ class Spider::PageExistsAndDoneException < Exception; end
|
|
4
4
|
require "iconv"
|
5
5
|
require "digest/md5"
|
6
6
|
require "htmlentities"
|
7
|
+
require "spider/spider_page"
|
8
|
+
require "v8"
|
7
9
|
# 从本质上讲,所有的WEB页面都是一个页面(Page)
|
8
10
|
# 每个页面拥有一些属性,比如(encoding,title,url)
|
9
11
|
# 每个页面有我们感兴趣的信息,我们需要提取出来
|
@@ -37,16 +39,17 @@ class Spider::Page
|
|
37
39
|
@coder ||= HTMLEntities.new
|
38
40
|
end
|
39
41
|
|
40
|
-
def
|
41
|
-
|
42
|
+
def self.parse_query(string)
|
43
|
+
Rack::Utils.parse_query(string)
|
42
44
|
end
|
43
45
|
|
44
|
-
def
|
45
|
-
|
46
|
-
# 现在还是得使用class_inheritable_accessor
|
47
|
-
class_inheritable_accessor *args # 目前还没找到更好的方式
|
46
|
+
def parse_query(string)
|
47
|
+
self.class.parse_query string
|
48
48
|
end
|
49
49
|
|
50
|
+
def coder
|
51
|
+
self.class.coder
|
52
|
+
end
|
50
53
|
|
51
54
|
extend ActiveModel::Callbacks
|
52
55
|
|
@@ -61,9 +64,17 @@ class Spider::Page
|
|
61
64
|
self.options = {}
|
62
65
|
self.options[:example_url] ||= []
|
63
66
|
|
64
|
-
|
65
67
|
SEPARATOR = "<!-- PAGINATE SEPARATOR -->"
|
66
68
|
|
69
|
+
def self.inherited(subclass)
|
70
|
+
subclass.options = options.dup
|
71
|
+
super
|
72
|
+
end
|
73
|
+
|
74
|
+
def options
|
75
|
+
self.class.options
|
76
|
+
end
|
77
|
+
|
67
78
|
@@paginate_symbol = "--NEXTPAGE--"
|
68
79
|
cattr_accessor :paginate_symbol
|
69
80
|
|
@@ -115,7 +126,7 @@ class Spider::Page
|
|
115
126
|
end
|
116
127
|
|
117
128
|
def self.define_attribute(attribute)
|
118
|
-
self.attribute_names
|
129
|
+
self.attribute_names += [attribute]
|
119
130
|
self.attribute_names.uniq!
|
120
131
|
self.attribute_names.compact!
|
121
132
|
attribute
|
@@ -140,6 +151,18 @@ class Spider::Page
|
|
140
151
|
hash
|
141
152
|
end
|
142
153
|
|
154
|
+
def v8
|
155
|
+
@v8 ||= V8::Context.new
|
156
|
+
end
|
157
|
+
|
158
|
+
# eval_js "info = {name:'name'}"
|
159
|
+
# v8['info']['name'] => "name"
|
160
|
+
# v8 is a V8::Context instance
|
161
|
+
# for details about V8 , refer to therubyracer gem
|
162
|
+
def eval_js(js)
|
163
|
+
v8.eval js
|
164
|
+
end
|
165
|
+
|
143
166
|
# 对 <base href="xxxx" /> 的标记进行快捷获取
|
144
167
|
def base_href
|
145
168
|
doc.at("base").try(:attributes).try(:[],"href")
|
@@ -148,12 +171,11 @@ class Spider::Page
|
|
148
171
|
|
149
172
|
# 从url的query string中分析得到params
|
150
173
|
def params
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
r[key.to_sym] = value
|
155
|
-
end
|
174
|
+
h = {}
|
175
|
+
Rack::Utils.parse_query(uri.query).each_pair do |key,value|
|
176
|
+
h[key.to_sym] = value
|
156
177
|
end
|
178
|
+
h
|
157
179
|
end
|
158
180
|
|
159
181
|
# 提供对 attributes 的快捷访问
|
@@ -559,7 +581,7 @@ class Spider::Page
|
|
559
581
|
if exists?
|
560
582
|
page = spider_page
|
561
583
|
else
|
562
|
-
page = Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
|
584
|
+
page = ::Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
|
563
585
|
end
|
564
586
|
page.content_length = content_length
|
565
587
|
#page.labels_hash = Digest::MD5.hexdigest(labels.to_yaml)
|
@@ -744,16 +766,4 @@ class Spider::Page
|
|
744
766
|
end
|
745
767
|
|
746
768
|
|
747
|
-
def fetch_content_from_url_with_cache(options={})
|
748
|
-
key = Digest::MD5.hexdigest(options.to_json.to_s) + "/" + Digest::MD5.hexdigest(url)
|
749
|
-
@content ||= Rails.cache.fetch key do
|
750
|
-
fetch_content_from_url_without_cache(options)
|
751
|
-
end
|
752
|
-
@content_length = @content.length
|
753
|
-
@content
|
754
|
-
end
|
755
|
-
|
756
|
-
# alias_method_chain :fetch_content_from_url,:cache
|
757
|
-
|
758
|
-
|
759
769
|
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "fileutils"
require "digest/md5"

# Optional on-disk copy of fetched page content.
#
# Including this module wraps fetch_content_from_url: when the class-level
# cache_enabled flag is true, the content is written to a file whose name is
# derived from the MD5 of the page url. The flag defaults to false.
module Spider::Page::Cache
  extend ActiveSupport::Concern
  included do
    class_attribute :cache_enabled
    self.cache_enabled = false
    alias_method_chain :fetch_content_from_url,:cache
  end

  # Fetches the content through the original implementation and, when the
  # cache is enabled, writes @content to the cache file.
  # Returns @content.
  def fetch_content_from_url_with_cache(*args)
    fetch_content_from_url_without_cache(*args)
    if self.cache_enabled
      f = cache_file("content")
      FileUtils.mkdir_p File.dirname(f)
      File.open f,"w+" do |file|
        file.write @content
      end
    end
    @content
  end

  # Returns the content stored in the cache file, or nil when the cache is
  # disabled or nothing has been written for this url yet.
  def cached_content
    return unless self.cache_enabled
    f = cache_file("content")
    File.read f if File.exist? f
  end

  protected
  # Builds the cache file path for this page: "<md5 of url><name>", placed in
  # a sub-directory named after the first three characters of that file name,
  # under Rails.root/tmp when Rails is defined and under /tmp otherwise.
  def cache_file(name = '')
    md5 = Digest::MD5.hexdigest url
    file = md5 + name
    file = "#{file[0,3]}/#{file}"
    if defined? Rails
      Rails.root.join("tmp",file).to_s
    else
      "/tmp/#{file}"
    end
  end

  module ClassMethods

    # Turns the content cache on for this class.
    def enable_cache
      self.cache_enabled = true
    end

    # Turns the content cache off for this class.
    def disable_cache
      self.cache_enabled = false
    end

  end
end
|
data/lib/spider/page/filter.rb
CHANGED
@@ -41,6 +41,7 @@ module Spider::Page::Filter
|
|
41
41
|
options.assert_valid_keys :filters,:position
|
42
42
|
position = options[:position]
|
43
43
|
position = position.to_s + "_" if position
|
44
|
+
send("attributes_#{position}filters=",{})
|
44
45
|
filter_attrs = send("attributes_#{position}filters")
|
45
46
|
logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
|
46
47
|
args.each do |attr_name|
|
data/lib/spider/page/label.rb
CHANGED
data/lib/spider/page/proxy.rb
CHANGED
@@ -1,22 +1,97 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Spider::Page::Proxy
|
3
|
+
require "timeout"

# Describes one HTTP proxy endpoint: host, port and optional credentials.
class HttpProxy
  attr_accessor :user,:password,:host,:port

  # Parses a proxy specification of the form "[user[:password]@]host[:port]".
  #
  #   parse("aotianlong:password@192.168.1.1:8000")
  #   # => {:user => "aotianlong", :password => "password",
  #   #     :host => "192.168.1.1", :port => "8000"}
  #
  # An explicit port is returned exactly as written (a String); a missing or
  # blank port becomes the Integer 80. :user / :password are only present
  # when the string carries them. nil or "" yields a nil :host.
  def self.parse(str)
    # Split on the LAST "@" so that a password may itself contain "@".
    userinfo, at, addrinfo = str.to_s.rpartition("@")

    user = password = nil
    # Split on the FIRST ":" only, so that a password may contain ":".
    user, password = userinfo.split(":", 2) unless at.empty?

    host, port = addrinfo.split(":")
    port = 80 if port.to_s.strip.empty?

    hash = { :host => host, :port => port }
    hash[:user] = user if user
    hash[:password] = password if password
    hash
  end

  # "user:password@host:port" when both credentials are set,
  # otherwise "host:port".
  def to_s
    if user && password
      "#{user}:#{password}@#{host}:#{port}"
    else
      "#{host}:#{port}"
    end
  end

  def inspect
    to_s
  end

  # host    - proxy specification string understood by HttpProxy.parse
  #           (may be nil, which gives a proxy without a host)
  # options - Hash whose :host, :port, :user and :password entries override
  #           the parsed values
  def initialize(host,options = {})
    hash = self.class.parse host
    hash.merge! options
    @host = hash[:host]
    @port = hash[:port]
    @user = hash[:user]
    @password = hash[:password]
  end

  # Checks the proxy by requesting a url through it.
  #
  # options:
  #   :url     - url to request           (default "http://www.google.com")
  #   :code    - expected response code   (default 200)
  #   :timeout - seconds before giving up (default 10)
  #   :match   - optional pattern the response body must match
  #
  # Returns a truthy value when the response has the expected code (and
  # matches :match when given); false on timeout or any request error.
  def valid?(options = {})
    # Work on a copy so the caller's hash is left untouched.
    options = options.dup
    options[:url] ||= "http://www.google.com"
    options[:code] ||= 200
    options[:timeout] ||= 10
    # options[:match] ||= //
    Spider::Http.with_proxy self do
      begin
        Timeout.timeout options[:timeout] do
          response = Spider::Http.get options[:url]
          r = response.code == options[:code]
          if options[:match]
            r && (response.to_s =~ options[:match])
          else
            r
          end
        end
      # Do not rescue Exception: that would also swallow Interrupt and
      # SystemExit, making long proxy test runs impossible to abort.
      rescue StandardError, Timeout::Error
        false
      end
    end
  end

end
|
74
|
+
|
3
75
|
def self.included(base)
|
4
76
|
base.send(:include,InstanceMethods)
|
5
77
|
base.send(:extend,ClassMethods)
|
6
78
|
base.class_eval do
|
7
|
-
class_attribute :
|
79
|
+
class_attribute :proxy_items
|
8
80
|
class_attribute :disabled_proxies
|
9
|
-
|
81
|
+
class_attribute :current_proxies
|
82
|
+
class_attribute :proxies_filename
|
83
|
+
self.current_proxies = []
|
10
84
|
self.disabled_proxies = []
|
85
|
+
self.proxies_filename = nil
|
86
|
+
self.proxy_items = []
|
11
87
|
|
12
88
|
before_fetch do |page|
|
13
|
-
proxies.
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
Spider::Http.http_proxy host,port
|
89
|
+
self.current_proxies = proxies.shuffle.first(5) if self.current_proxies.empty?
|
90
|
+
logger.debug "current_proxies: #{current_proxies.inspect}"
|
91
|
+
proxy = current_proxies.shuffle.first
|
92
|
+
if proxy.try(:host)
|
93
|
+
logger.debug "set proxy: #{proxy.inspect}"
|
94
|
+
Spider::Http.http_proxy proxy.host,proxy.port,proxy.user,proxy.password
|
20
95
|
else
|
21
96
|
Spider::Http.clear_proxy
|
22
97
|
end
|
@@ -25,15 +100,17 @@ module Spider::Page::Proxy
|
|
25
100
|
after_fetch do |page|
|
26
101
|
logger.debug "reset proxy"
|
27
102
|
# Spider::Http.http_proxy old_host,old_port
|
28
|
-
if page.content.blank?
|
103
|
+
if page.content.blank? #|| page.code == 502 # bad gateway
|
29
104
|
# retry, and set proxy to disabled
|
30
105
|
# proxies
|
31
|
-
puts "proxies before:#{self.
|
32
|
-
disabled_proxy =
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
106
|
+
puts "proxies before:#{self.current_proxies.inspect}"
|
107
|
+
disabled_proxy = current_proxies.find{|proxy| proxy.host == Spider::Http.default_options[:http_proxyaddr] && proxy.port == Spider::Http.default_options[:http_proxyport] }
|
108
|
+
if disabled_proxy
|
109
|
+
current_proxies.delete disabled_proxy
|
110
|
+
self.disabled_proxies += [disabled_proxy]
|
111
|
+
puts "proxies after:#{self.current_proxies.inspect}"
|
112
|
+
end
|
113
|
+
unless current_proxies.empty?
|
37
114
|
puts 'retry'
|
38
115
|
page.request
|
39
116
|
next
|
@@ -42,7 +119,7 @@ module Spider::Page::Proxy
|
|
42
119
|
# no proxies available
|
43
120
|
# recover proxies
|
44
121
|
# 以便下次仍然使用(防止一次意外失败,而永久排除)
|
45
|
-
self.
|
122
|
+
self.current_proxies = []
|
46
123
|
self.disabled_proxies = []
|
47
124
|
# 不用代理服务器使用自身来获取
|
48
125
|
end
|
@@ -57,83 +134,93 @@ module Spider::Page::Proxy
|
|
57
134
|
module ClassMethods
|
58
135
|
|
59
136
|
def disable_proxy
|
60
|
-
proxy(nil
|
137
|
+
proxy(nil)
|
61
138
|
end
|
62
139
|
|
63
140
|
def validate_proxies
|
64
141
|
valid_proxies = proxies.find_all do |proxy|
|
65
|
-
valid_proxy?(
|
142
|
+
valid_proxy?(proxy)
|
66
143
|
end
|
67
144
|
invalid_proxies = proxies - valid_proxies
|
68
145
|
{:valid => valid_proxies,:invalid => invalid_proxies}
|
69
146
|
end
|
70
147
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
config_root = File.join(Rails.root,"config","spiders")
|
75
|
-
if file =~ /^\//
|
76
|
-
# absolute path
|
77
|
-
content = File.read file
|
148
|
+
def proxies
|
149
|
+
if proxies_filename
|
150
|
+
parse_proxy_file proxies_filename
|
78
151
|
else
|
79
|
-
|
152
|
+
proxy_items
|
80
153
|
end
|
154
|
+
end
|
155
|
+
|
156
|
+
def parse_proxies(content)
|
81
157
|
proxies = []
|
82
158
|
content.each_line do |line|
|
83
159
|
line = line.strip
|
84
160
|
if line =~ /^\s*#/
|
85
161
|
# 注释
|
86
162
|
else
|
87
|
-
|
88
|
-
|
89
|
-
port ||= 80
|
90
|
-
proxies += [[ip,port]]
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
self.proxy do |the_proxies|
|
95
|
-
proxies.each do |p|
|
96
|
-
the_proxies += [p]
|
163
|
+
# proxy line,options
|
164
|
+
proxies << Spider::Page::Proxy::HttpProxy.new(line)
|
97
165
|
end
|
98
166
|
end
|
167
|
+
proxies
|
99
168
|
end
|
100
169
|
|
101
|
-
def
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
timeout options[:timeout] do
|
109
|
-
response = Spider::Http.get options[:url]
|
110
|
-
r = response.code == options[:code]
|
111
|
-
if options[:match]
|
112
|
-
r && (response.to_s =~ options[:match])
|
113
|
-
else
|
114
|
-
r
|
115
|
-
end
|
116
|
-
end
|
117
|
-
rescue Exception => e
|
118
|
-
false
|
119
|
-
end
|
170
|
+
def parse_proxy_file(file)
|
171
|
+
config_root = File.join(Rails.root,"config","spiders")
|
172
|
+
if file =~ /^\//
|
173
|
+
# absolute path
|
174
|
+
content = File.read file
|
175
|
+
else
|
176
|
+
content = File.read(File.join(config_root,file))
|
120
177
|
end
|
178
|
+
parse_proxies content
|
179
|
+
end
|
180
|
+
|
181
|
+
# 指定一个 file 作为 proxy 来源
|
182
|
+
# # ip:port
|
183
|
+
def proxy_file(file,options = {})
|
184
|
+
self.proxies_filename = file
|
185
|
+
# parse_proxy_file(file).each do |proxy|
|
186
|
+
# self.proxy proxy
|
187
|
+
# end
|
188
|
+
end
|
189
|
+
|
190
|
+
def valid_proxy?(proxy)
|
191
|
+
proxy.valid?
|
121
192
|
end
|
122
193
|
|
123
194
|
# 直接设置 proxies
|
124
195
|
def proxies=(arr)
|
125
|
-
proxy
|
126
|
-
|
127
|
-
|
196
|
+
proxy arr
|
197
|
+
end
|
198
|
+
|
199
|
+
def clear_proxies
|
200
|
+
self.proxy_items = []
|
201
|
+
self.proxies_filename = nil
|
202
|
+
end
|
203
|
+
|
204
|
+
def proxy(host = nil,options = {})
|
205
|
+
|
206
|
+
|
207
|
+
if host.is_a? Array
|
208
|
+
host.each do |h|
|
209
|
+
proxy h,options
|
128
210
|
end
|
211
|
+
return
|
129
212
|
end
|
130
|
-
end
|
131
213
|
|
132
|
-
|
133
|
-
|
214
|
+
if host.is_a? Spider::Page::Proxy::HttpProxy
|
215
|
+
http_proxy = host
|
216
|
+
else
|
217
|
+
http_proxy = Spider::Page::Proxy::HttpProxy.new(host,options || {})
|
218
|
+
end
|
219
|
+
|
220
|
+
self.proxy_items += [http_proxy]
|
134
221
|
|
135
222
|
if block_given?
|
136
|
-
yield
|
223
|
+
yield([]) # for old syntax
|
137
224
|
end
|
138
225
|
|
139
226
|
end
|
@@ -143,7 +230,10 @@ module Spider::Page::Proxy
|
|
143
230
|
|
144
231
|
|
145
232
|
module InstanceMethods
|
146
|
-
|
233
|
+
def proxies
|
234
|
+
self.class.proxies
|
235
|
+
end
|
147
236
|
end
|
148
237
|
|
238
|
+
|
149
239
|
end
|
data/lib/spider/page/publish.rb
CHANGED
@@ -5,12 +5,12 @@ module Spider::Page::Publish
|
|
5
5
|
|
6
6
|
included do
|
7
7
|
|
8
|
-
|
8
|
+
define_model_callbacks :publish
|
9
|
+
|
10
|
+
cattr_accessor :publishers
|
11
|
+
self.publishers = []
|
12
|
+
after_crawl :publish
|
9
13
|
|
10
|
-
cattr_accessor :publishers
|
11
|
-
self.publishers = []
|
12
|
-
after_crawl :publish
|
13
|
-
|
14
14
|
end
|
15
15
|
|
16
16
|
module ClassMethods
|
@@ -25,54 +25,52 @@ module Spider::Page::Publish
|
|
25
25
|
|
26
26
|
end
|
27
27
|
|
28
|
-
module InstanceMethods
|
29
28
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
29
|
+
def publish_to(*publishers)
|
30
|
+
run_callbacks :publish do
|
31
|
+
logger.debug "publish to #{publishers}"
|
32
|
+
results = []
|
33
|
+
[publishers].flatten.each do |publisher|
|
34
|
+
logger.info "send self to #{publisher}"
|
35
|
+
logger.debug "class:#{publisher.class.name}"
|
36
|
+
publisher = case publisher
|
37
|
+
when String,Symbol
|
38
|
+
publisher.to_s.classify.constantize
|
39
|
+
else
|
40
|
+
# puts "default: #{publisher}"
|
41
|
+
publisher
|
42
|
+
end
|
43
|
+
logger.debug "publisher: #{publisher}"
|
44
|
+
result = nil
|
45
|
+
begin
|
47
46
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
end
|
55
|
-
rescue Exception=>e
|
56
|
-
logger.error e.message
|
57
|
-
logger.error e.backtrace.join("\n")
|
47
|
+
if publisher.respond_to?(:receive_spider_page)
|
48
|
+
logger.debug "#{publisher} receive spider page #{self}"
|
49
|
+
result = publisher.receive_spider_page self
|
50
|
+
logger.debug "#{publisher} return #{result}"
|
51
|
+
else
|
52
|
+
logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
|
58
53
|
end
|
59
|
-
|
54
|
+
rescue Exception=>e
|
55
|
+
logger.error e.message
|
56
|
+
logger.error e.backtrace.join("\n")
|
60
57
|
end
|
61
|
-
results
|
58
|
+
results << result
|
62
59
|
end
|
60
|
+
results
|
63
61
|
end
|
62
|
+
end
|
64
63
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
end
|
64
|
+
def publish
|
65
|
+
publishers = self.publishers.uniq
|
66
|
+
if [:title,:body].all?{|name| attribute_names.include?(name) }
|
67
|
+
logger.debug "[#{self} publish to #{publishers}"
|
68
|
+
publish_to(publishers)
|
69
|
+
else
|
70
|
+
logger.debug "attribute names not include :title, :body,so publish canceled."
|
73
71
|
end
|
72
|
+
end
|
74
73
|
|
75
74
|
|
76
|
-
end
|
77
75
|
|
78
76
|
end
|
data/lib/spider/version.rb
CHANGED
data/lib/spider2.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "spider"
|
data/lib/tasks/spider_tasks.rake
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
# desc "Explaining what the task does"
|
2
3
|
# task :spider_fu do
|
3
4
|
# # Task goes here
|
@@ -8,25 +9,89 @@ desc "开始采集"
|
|
8
9
|
namespace :spider do
|
9
10
|
|
10
11
|
namespace :proxy do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
|
13
|
+
|
14
|
+
desc "test proxy"
|
15
|
+
task :test2 => :environment do
|
16
|
+
times = ENV['TIMES'] && ENV['TIMES'].to_i || 50
|
17
|
+
verbose = ENV['VERBOSE']
|
18
|
+
proxies = Spider::Page.proxies
|
19
|
+
proxies_count = proxies.size
|
20
|
+
good_proxies = []
|
21
|
+
|
22
|
+
result = {}
|
23
|
+
|
24
|
+
times.times do |i|
|
25
|
+
puts "round #{i + 1}/#{times}" if verbose
|
26
|
+
proxies.each_with_index do |proxy,index|
|
27
|
+
print "(#{index + 1}/#{proxies_count} #{proxy.inspect}): " if verbose
|
28
|
+
result[proxy] ||= []
|
29
|
+
r = {}
|
30
|
+
time = Benchmark.ms do
|
31
|
+
r[:success ] = proxy.valid?
|
32
|
+
end / 1000
|
33
|
+
if r[:success]
|
34
|
+
print "OK" if verbose
|
35
|
+
else
|
36
|
+
print "FAILED" if verbose
|
37
|
+
end
|
38
|
+
print " " if verbose
|
39
|
+
print "#{time}s" if verbose
|
40
|
+
print "\n" if verbose
|
41
|
+
r[:time] = time
|
42
|
+
result[proxy] << r
|
43
|
+
end
|
19
44
|
end
|
20
|
-
|
21
|
-
|
22
|
-
result
|
23
|
-
|
45
|
+
|
46
|
+
good_proxies = []
|
47
|
+
result.each_pair do |proxy,data|
|
48
|
+
success_times = data.find_all{|d| d[:success] }.size
|
49
|
+
success_ratio = success_times.to_f / times
|
50
|
+
average_time = data.map{|d| d[:time] }.sum / data.size
|
51
|
+
good_proxies << proxy if success_ratio > 0.95
|
52
|
+
puts "=> #{proxy}" if verbose
|
53
|
+
puts data.collect{|d| d[:success] ? "*" : "!"}.join("") if verbose
|
54
|
+
puts " => success times: #{success_times}(#{success_ratio * 100}%)" if verbose
|
55
|
+
puts " => average time per request: #{average_time} seconds." if verbose
|
56
|
+
puts if verbose
|
24
57
|
end
|
25
|
-
puts
|
26
|
-
puts
|
27
|
-
|
28
|
-
|
58
|
+
puts " =============================== " if verbose
|
59
|
+
puts 'following proxies are 95% complete all test request:' if verbose
|
60
|
+
puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
|
61
|
+
end
|
62
|
+
|
63
|
+
desc "test proxy PAGE=XXXX::BasePage"
|
64
|
+
task :test => :environment do
|
65
|
+
times = ENV['TIMES'] && ENV['TIMES'].to_i || 10
|
66
|
+
verbose = ENV['VERBOSE']
|
67
|
+
proxies = Spider::Page.proxies
|
68
|
+
proxies_count = proxies.size
|
69
|
+
good_proxies = []
|
70
|
+
proxies.each_with_index do |proxy,index|
|
71
|
+
puts "(#{index + 1}/#{proxies_count}) => #{proxy.to_s}" if verbose
|
72
|
+
total_time = 0.0
|
73
|
+
success_times = 0
|
74
|
+
times.times do
|
75
|
+
total_time += Benchmark.ms do
|
76
|
+
if proxy.valid?
|
77
|
+
success_times += 1
|
78
|
+
print "*" if verbose
|
79
|
+
else
|
80
|
+
print "!" if verbose
|
81
|
+
end
|
82
|
+
STDOUT.flush
|
83
|
+
end / 1000
|
84
|
+
end
|
85
|
+
good_proxies << proxy if success_times == times
|
86
|
+
print "\n"
|
87
|
+
puts " => success times: #{success_times}" if verbose
|
88
|
+
puts " => total time:#{total_time/60} minutes" if verbose
|
89
|
+
puts " => average time per request: #{total_time / times} seconds." if verbose
|
90
|
+
puts if verbose
|
29
91
|
end
|
92
|
+
puts " =============================== " if verbose
|
93
|
+
puts 'following proxies are 100% complete all test request:' if verbose
|
94
|
+
puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
|
30
95
|
end
|
31
96
|
end
|
32
97
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 13
|
5
|
+
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 9
|
10
|
+
version: 0.0.9
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- aotianlong
|
@@ -15,54 +15,10 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - ~>
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 15
|
29
|
-
segments:
|
30
|
-
- 3
|
31
|
-
- 2
|
32
|
-
- 0
|
33
|
-
version: 3.2.0
|
34
|
-
type: :runtime
|
35
|
-
version_requirements: *id001
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: htmlentities
|
38
|
-
prerelease: false
|
39
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
-
none: false
|
41
|
-
requirements:
|
42
|
-
- - ~>
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
hash: 59
|
45
|
-
segments:
|
46
|
-
- 4
|
47
|
-
- 1
|
48
|
-
- 0
|
49
|
-
version: 4.1.0
|
50
|
-
type: :runtime
|
51
|
-
version_requirements: *id002
|
52
|
-
- !ruby/object:Gem::Dependency
|
53
|
-
name: sqlite3
|
54
|
-
prerelease: false
|
55
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
-
none: false
|
57
|
-
requirements:
|
58
|
-
- - ">="
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
hash: 3
|
61
|
-
segments:
|
62
|
-
- 0
|
63
|
-
version: "0"
|
64
|
-
type: :development
|
65
|
-
version_requirements: *id003
|
18
|
+
date: 2012-09-17 00:00:00 +08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
66
22
|
description: a framework to crawl web pages
|
67
23
|
email:
|
68
24
|
- aotianlong@gmail.com
|
@@ -91,7 +47,10 @@ files:
|
|
91
47
|
- lib/generators/spider_migration/spider_migration_generator.rb
|
92
48
|
- lib/generators/spider_migration/templates/migration.rb
|
93
49
|
- lib/spider/active_record_methods.rb
|
50
|
+
- lib/spider/engine.rb
|
94
51
|
- lib/spider/http.rb
|
52
|
+
- lib/spider/httparty_patch.rb
|
53
|
+
- lib/spider/page/cache.rb
|
95
54
|
- lib/spider/page/filter.rb
|
96
55
|
- lib/spider/page/label.rb
|
97
56
|
- lib/spider/page/pagination.rb
|
@@ -104,6 +63,7 @@ files:
|
|
104
63
|
- lib/spider/spider_page_label.rb
|
105
64
|
- lib/spider/version.rb
|
106
65
|
- lib/spider.rb
|
66
|
+
- lib/spider2.rb
|
107
67
|
- lib/tasks/spider_tasks.rake
|
108
68
|
- MIT-LICENSE
|
109
69
|
- Rakefile
|
@@ -113,6 +73,7 @@ files:
|
|
113
73
|
- uninstall.rb
|
114
74
|
- test/spider_fu_test.rb
|
115
75
|
- test/test_helper.rb
|
76
|
+
has_rdoc: true
|
116
77
|
homepage: http://www.powerapple.com
|
117
78
|
licenses: []
|
118
79
|
|
@@ -142,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
142
103
|
requirements: []
|
143
104
|
|
144
105
|
rubyforge_project:
|
145
|
-
rubygems_version: 1.
|
106
|
+
rubygems_version: 1.3.7
|
146
107
|
signing_key:
|
147
108
|
specification_version: 3
|
148
109
|
summary: spider
|