spider2 0.0.1 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/spider.rb +19 -22
- data/lib/spider/engine.rb +31 -0
- data/lib/spider/http.rb +5 -2
- data/lib/spider/httparty_patch.rb +37 -0
- data/lib/spider/page.rb +36 -26
- data/lib/spider/page/cache.rb +52 -0
- data/lib/spider/page/filter.rb +1 -0
- data/lib/spider/page/label.rb +0 -3
- data/lib/spider/page/proxy.rb +154 -64
- data/lib/spider/page/publish.rb +42 -44
- data/lib/spider/version.rb +1 -1
- data/lib/spider2.rb +1 -0
- data/lib/tasks/spider_tasks.rake +81 -16
- metadata +14 -53
data/lib/spider.rb
CHANGED
@@ -1,5 +1,21 @@
 # encoding: utf-8
 #require "hpricot"
+#
+
+# copy from active support
+require "httparty"
+Hash.class_eval do
+  def deep_dup
+    duplicate = self.dup
+    duplicate.each_pair do |k,v|
+      tv = duplicate[k]
+      duplicate[k] = tv.is_a?(Hash) && v.is_a?(Hash) ? tv.deep_dup : v
+    end
+    duplicate
+  end
+end unless Hash.new.respond_to? :deep_dup
+
+
 module Spider
   def self.logger
     unless @logger
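Note: a minimal usage sketch of the deep_dup backport above (the hash values are hypothetical). Mutating the copy's nested hash no longer leaks into the original, which a plain dup would allow:

    config = { :http => { :timeout => 10 } }
    copy = config.deep_dup
    copy[:http][:timeout] = 99
    config[:http][:timeout]  # => 10; with a shallow dup this would be 99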
@@ -52,30 +68,11 @@ Spider::Page.send(:include,Spider::Page::Proxy)
 require "spider/page/label"
 Spider::Page.send(:include,Spider::Page::Label)
 
+require "spider/page/cache"
+Spider::Page.send(:include,Spider::Page::Cache)
 
-spiders_dir = File.join(Rails.root,"spiders")
-$:.push(spiders_dir)
-
-# define constants
-Dir[File.join(spiders_dir,"*")].each do |dir|
-  dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
-  Object.const_set(dir_name.classify,Module.new)
-end
-
-# require the init file first
-init_file = File.join(spiders_dir,"init.rb")
-require init_file if File.exists? init_file
-
-file_patten = File.join(spiders_dir,"**","*.rb")
-files = Dir[file_patten]
-
-site_files = files.find_all{|i| i =~ /site\.rb/}
-site_files.each{|i| require i}
-
-base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
-base_page_files.each{|i| require i}
 
-files.each{|i| require i }
 
 # include active record methods
 require "spider/active_record_methods"
+require "spider/engine"
data/lib/spider/engine.rb
ADDED
@@ -0,0 +1,31 @@
+module Spider
+  class Engine < Rails::Engine
+    initializer 'spider' do
+      spiders_dir = File.join(Rails.root,"spiders")
+      $:.push(spiders_dir)
+
+      # define constants
+      Dir[File.join(spiders_dir,"*")].each do |dir|
+        dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
+        Object.const_set(dir_name.classify,Module.new)
+      end
+
+      # require the init file first
+      init_file = File.join(spiders_dir,"init.rb")
+      require init_file if File.exists? init_file
+
+      file_patten = File.join(spiders_dir,"**","*.rb")
+      files = Dir[file_patten]
+
+      site_files = files.find_all{|i| i =~ /site\.rb/}
+      site_files.each{|i| require i}
+
+      base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
+      base_page_files.each{|i| require i}
+
+      files.each{|i| require i }
+
+    end
+
+  end
+end
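Note: the spider-loading code moved from spider.rb into this Rails::Engine initializer. It maps each directory under Rails.root/spiders to a top-level module via classify, then loads site.rb and base_page.rb files ahead of everything else. A sketch of the layout it assumes (directory and file names hypothetical):

    spiders/
      init.rb              # required first, if present
      example_site/        # => Object.const_set("ExampleSite", Module.new)
        site.rb            # required before base_page.rb
        base_page.rb       # required before the remaining *.rb files
        list_page.rb       # picked up by the final files.each require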
data/lib/spider/http.rb
CHANGED
@@ -1,4 +1,5 @@
 # encoding: utf-8
+require "spider/httparty_patch"
 module Spider::Http
   include HTTParty
   headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
@@ -21,8 +22,8 @@ module Spider::Http
     end
   end
 
-  def self.with_proxy(
-    http_proxy
+  def self.with_proxy(proxy,&block)
+    http_proxy proxy.host,proxy.port,proxy.user,proxy.password
     result = yield
     clear_proxy
     result
@@ -31,6 +32,8 @@ module Spider::Http
   def self.clear_proxy
     Spider::Http.default_options.delete :http_proxyaddr
     Spider::Http.default_options.delete :http_proxyport
+    Spider::Http.default_options.delete :http_proxyuser
+    Spider::Http.default_options.delete :http_proxypassword
   end
 
 =begin
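Note: with_proxy now takes a proxy object responding to host/port/user/password (such as the HttpProxy class added to proxy.rb below), installs it as HTTParty's default, yields, then clears the proxy again. A minimal sketch (the address and URL are hypothetical):

    proxy = Spider::Page::Proxy::HttpProxy.new("user:secret@127.0.0.1:8080")
    body = Spider::Http.with_proxy(proxy) do
      Spider::Http.get("http://example.com").to_s
    end
    # after the block, default_options carries no :http_proxy* keys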
data/lib/spider/httparty_patch.rb
ADDED
@@ -0,0 +1,37 @@
+module HTTParty
+  module ClassMethods
+    def http_proxy(addr=nil, port = nil, user=nil, password=nil)
+      default_options[:http_proxyaddr] = addr
+      default_options[:http_proxyport] = port
+      default_options[:http_proxyuser] = user
+      default_options[:http_proxypassword] = password
+    end
+  end
+
+  class Request
+    def http
+      http = Net::HTTP.new(
+        uri.host,
+        uri.port,
+        options[:http_proxyaddr],
+        options[:http_proxyport],
+        options[:http_proxyuser],
+        options[:http_proxypassword]
+      )
+      http.use_ssl = ssl_implied?
+
+      if options[:timeout] && (options[:timeout].is_a?(Integer) || options[:timeout].is_a?(Float))
+        http.open_timeout = options[:timeout]
+      end
+
+      attach_ssl_certificates(http)
+
+      if options[:debug_output]
+        http.set_debug_output(options[:debug_output])
+      end
+
+      http
+    end
+  end
+end
+
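Note: the patch threads the two new options through to Net::HTTP.new, whose third to sixth positional arguments are proxy address, port, user, and password. A usage sketch against the patched module (values hypothetical):

    Spider::Http.http_proxy "127.0.0.1", 8080, "user", "secret"
    Spider::Http.get "http://example.com"  # request now goes through the proxy
    Spider::Http.clear_proxy               # removes all four :http_proxy* options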
data/lib/spider/page.rb
CHANGED
@@ -4,6 +4,8 @@ class Spider::PageExistsAndDoneException < Exception; end
 require "iconv"
 require "digest/md5"
 require "htmlentities"
+require "spider/spider_page"
+require "v8"
 # essentially, every web page is a Page
 # each page has some attributes (e.g. encoding, title, url)
 # each page carries information we are interested in and need to extract
@@ -37,16 +39,17 @@ class Spider::Page
     @coder ||= HTMLEntities.new
   end
 
-  def
-
+  def self.parse_query(string)
+    Rack::Utils.parse_query(string)
   end
 
-  def
-
-    # still have to use class_inheritable_accessor for now
-    class_inheritable_accessor *args # no better way found yet
+  def parse_query(string)
+    self.class.parse_query string
   end
 
+  def coder
+    self.class.coder
+  end
 
   extend ActiveModel::Callbacks
 
@@ -61,9 +64,17 @@ class Spider::Page
   self.options = {}
   self.options[:example_url] ||= []
 
-
   SEPARATOR = "<!-- PAGINATE SEPARATOR -->"
 
+  def self.inherited(subclass)
+    subclass.options = options.dup
+    super
+  end
+
+  def options
+    self.class.options
+  end
+
   @@paginate_symbol = "--NEXTPAGE--"
   cattr_accessor :paginate_symbol
 
@@ -115,7 +126,7 @@ class Spider::Page
   end
 
   def self.define_attribute(attribute)
-    self.attribute_names
+    self.attribute_names += [attribute]
     self.attribute_names.uniq!
     self.attribute_names.compact!
     attribute
@@ -140,6 +151,18 @@ class Spider::Page
     hash
   end
 
+  def v8
+    @v8 ||= V8::Context.new
+  end
+
+  # eval_js "info = {name:'name'}"
+  # v8['info']['name'] => "name"
+  # v8 is a V8::Context instance
+  # for details about V8 , refer to therubyracer gem
+  def eval_js(js)
+    v8.eval js
+  end
+
   # quick access to the <base href="xxxx" /> tag
   def base_href
     doc.at("base").try(:attributes).try(:[],"href")
@@ -148,12 +171,11 @@ class Spider::Page
 
   # parse params out of the url's query string
   def params
-
-
-
-      r[key.to_sym] = value
-    end
+    h = {}
+    Rack::Utils.parse_query(uri.query).each_pair do |key,value|
+      h[key.to_sym] = value
     end
+    h
   end
 
   # quick access to attributes
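Note: the rewritten params simply symbolizes the keys of Rack's query parsing; values stay strings. For a hypothetical URL:

    # url = "http://example.com/list?page=2&sort=hot"
    page.params  # => { :page => "2", :sort => "hot" }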
@@ -559,7 +581,7 @@ class Spider::Page
     if exists?
       page = spider_page
     else
-      page = Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
+      page = ::Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
     end
     page.content_length = content_length
     #page.labels_hash = Digest::MD5.hexdigest(labels.to_yaml)
@@ -744,16 +766,4 @@ class Spider::Page
   end
 
 
-  def fetch_content_from_url_with_cache(options={})
-    key = Digest::MD5.hexdigest(options.to_json.to_s) + "/" + Digest::MD5.hexdigest(url)
-    @content ||= Rails.cache.fetch key do
-      fetch_content_from_url_without_cache(options)
-    end
-    @content_length = @content.length
-    @content
-  end
-
-  # alias_method_chain :fetch_content_from_url,:cache
-
-
 end
data/lib/spider/page/cache.rb
ADDED
@@ -0,0 +1,52 @@
+# encoding: utf-8
+module Spider::Page::Cache
+  extend ActiveSupport::Concern
+  included do
+    class_attribute :cache_enabled
+    self.cache_enabled = false
+    alias_method_chain :fetch_content_from_url,:cache
+  end
+
+  def fetch_content_from_url_with_cache(*args)
+    fetch_content_from_url_without_cache *args
+    if self.cache_enabled
+      f = cache_file("content")
+      FileUtils.mkdir_p File.dirname(f)
+      File.open f,"w+" do |file|
+        file.write @content
+      end
+    end
+    @content
+  end
+
+  def cached_content
+    if self.cache_enabled
+      f = cache_file("content")
+      File.read f
+    end
+  end
+
+  protected
+  def cache_file(name = '')
+    md5 = Digest::MD5.hexdigest url
+    file = md5 + name
+    file = "#{file[0,3]}/#{file}"
+    if defined? Rails
+      Rails.root.join("tmp",file).to_s
+    else
+      "/tmp/#{file}"
+    end
+  end
+
+  module ClassMethods
+
+    def enable_cache
+      self.cache_enabled = true
+    end
+
+    def disable_cache
+      self.cache_enabled = false
+    end
+
+  end
+end
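Note: caching is opt-in per page class. When enabled, each fetch mirrors the raw content into an MD5-sharded file (tmp/<first 3 hex chars>/<md5>content under the Rails root, or /tmp otherwise). A minimal sketch with a hypothetical page class:

    class ExamplePage < Spider::Page
      enable_cache        # sets cache_enabled = true for this class
    end
    # after a fetch, the body is also on disk and can be re-read later
    # via page.cached_content without hitting the network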
data/lib/spider/page/filter.rb
CHANGED
@@ -41,6 +41,7 @@ module Spider::Page::Filter
     options.assert_valid_keys :filters,:position
     position = options[:position]
     position = position.to_s + "_" if position
+    send("attributes_#{position}filters=",{})
     filter_attrs = send("attributes_#{position}filters")
     logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
     args.each do |attr_name|
data/lib/spider/page/label.rb
CHANGED
data/lib/spider/page/proxy.rb
CHANGED
@@ -1,22 +1,97 @@
 # encoding: utf-8
 module Spider::Page::Proxy
+  class HttpProxy
+    attr_accessor :user,:password,:host,:port
+    # aotianlong:password@192.168.1.1:8000
+    # => :user => "aotianlong",
+    #    :password => "password",
+    #    :host => "192.168.1.1",
+    #    :port => 8000
+    def self.parse(str)
+      {}.tap do |hash|
+        user = password = host = port = nil
+        if str =~ /@/
+          userinfo,addrinfo = str.split("@")
+          addr_hash = parse addrinfo
+          port,host = addr_hash[:port],addr_hash[:host]
+          user_hash = parse userinfo
+          user,password = user_hash[:host],user_hash[:port]
+        else
+          host,port = str.split(":")
+          port = 80 if port.blank?
+        end
+        hash[:host] = host
+        hash[:port] = port
+        hash[:user] = user if user
+        hash[:password] = password if password
+      end
+    end
+
+    def to_s
+      if user && password
+        "#{user}:#{password}@#{host}:#{port}"
+      else
+        "#{host}:#{port}"
+      end
+    end
+
+    def inspect
+      to_s
+    end
+
+    def initialize(host,options = {})
+      hash = self.class.parse host
+      hash.merge! options
+      @host = hash[:host]
+      @port = hash[:port]
+      @user = hash[:user]
+      @password = hash[:password]
+    end
+
+    def valid?(options = {})
+      options[:url] ||= "http://www.google.com"
+      options[:code] ||= 200
+      options[:timeout] ||= 10
+      # options[:match] ||= //
+      Spider::Http.with_proxy self do
+        begin
+          timeout options[:timeout] do
+            response = Spider::Http.get options[:url]
+            r = response.code == options[:code]
+            if options[:match]
+              r && (response.to_s =~ options[:match])
+            else
+              r
+            end
+          end
+        rescue Exception => e
+          false
+        end
+      end
+    end
+
+  end
+
   def self.included(base)
     base.send(:include,InstanceMethods)
     base.send(:extend,ClassMethods)
     base.class_eval do
-      class_attribute :
+      class_attribute :proxy_items
       class_attribute :disabled_proxies
-
+      class_attribute :current_proxies
+      class_attribute :proxies_filename
+      self.current_proxies = []
       self.disabled_proxies = []
+      self.proxies_filename = nil
+      self.proxy_items = []
 
       before_fetch do |page|
-        proxies.
-
-
-
-
-
-        Spider::Http.http_proxy host,port
+        self.current_proxies = proxies.shuffle.first(5) if self.current_proxies.empty?
+        logger.debug "current_proxies: #{current_proxies.inspect}"
+        proxy = current_proxies.shuffle.first
+        if proxy.try(:host)
+          logger.debug "set proxy: #{proxy.inspect}"
+          Spider::Http.http_proxy proxy.host,proxy.port,proxy.user,proxy.password
         else
          Spider::Http.clear_proxy
         end
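Note: HttpProxy.parse handles both bare host:port strings and user:password@host:port by recursing on the two halves around the @ (the user:password half reuses the host/port slots). Hypothetical values:

    Spider::Page::Proxy::HttpProxy.parse("127.0.0.1:8000")
    # => { :host => "127.0.0.1", :port => "8000" }
    Spider::Page::Proxy::HttpProxy.parse("aotianlong:password@192.168.1.1:8000")
    # => { :host => "192.168.1.1", :port => "8000",
    #      :user => "aotianlong", :password => "password" }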
@@ -25,15 +100,17 @@ module Spider::Page::Proxy
       after_fetch do |page|
         logger.debug "reset proxy"
         # Spider::Http.http_proxy old_host,old_port
-        if page.content.blank?
+        if page.content.blank? #|| page.code == 502 # bad gateway
           # retry, and set proxy to disabled
           # proxies
-          puts "proxies before:#{self.
-          disabled_proxy =
-
-
-
-
+          puts "proxies before:#{self.current_proxies.inspect}"
+          disabled_proxy = current_proxies.find{|proxy| proxy.host == Spider::Http.default_options[:http_proxyaddr] && proxy.port == Spider::Http.default_options[:http_proxyport] }
+          if disabled_proxy
+            current_proxies.delete disabled_proxy
+            self.disabled_proxies += [disabled_proxy]
+            puts "proxies after:#{self.current_proxies.inspect}"
+          end
+          unless current_proxies.empty?
             puts 'retry'
             page.request
             next
@@ -42,7 +119,7 @@ module Spider::Page::Proxy
           # no proxies available
           # recover proxies
           # so they can still be used next time (keeps one accidental failure from excluding a proxy forever)
-          self.
+          self.current_proxies = []
           self.disabled_proxies = []
           # fetch without a proxy, using this host itself
         end
@@ -57,83 +134,93 @@ module Spider::Page::Proxy
   module ClassMethods
 
     def disable_proxy
-      proxy(nil
+      proxy(nil)
     end
 
     def validate_proxies
       valid_proxies = proxies.find_all do |proxy|
-        valid_proxy?(
+        valid_proxy?(proxy)
       end
       invalid_proxies = proxies - valid_proxies
       {:valid => valid_proxies,:invalid => invalid_proxies}
     end
 
-
-
-
-      config_root = File.join(Rails.root,"config","spiders")
-      if file =~ /^\//
-        # absolute path
-        content = File.read file
+    def proxies
+      if proxies_filename
+        parse_proxy_file proxies_filename
       else
-
+        proxy_items
       end
+    end
+
+    def parse_proxies(content)
       proxies = []
       content.each_line do |line|
         line = line.strip
         if line =~ /^\s*#/
           # comment
         else
-
-
-          port ||= 80
-          proxies += [[ip,port]]
-        end
-      end
-    end
-    self.proxy do |the_proxies|
-      proxies.each do |p|
-        the_proxies += [p]
+          # proxy line,options
+          proxies << Spider::Page::Proxy::HttpProxy.new(line)
         end
       end
+      proxies
     end
 
-    def
-
-
-
-
-
-
-      timeout options[:timeout] do
-        response = Spider::Http.get options[:url]
-        r = response.code == options[:code]
-        if options[:match]
-          r && (response.to_s =~ options[:match])
-        else
-          r
-        end
-      end
-      rescue Exception => e
-        false
-      end
+    def parse_proxy_file(file)
+      config_root = File.join(Rails.root,"config","spiders")
+      if file =~ /^\//
+        # absolute path
+        content = File.read file
+      else
+        content = File.read(File.join(config_root,file))
       end
+      parse_proxies content
+    end
+
+    # use a file as the proxy source
+    # # ip:port
+    def proxy_file(file,options = {})
+      self.proxies_filename = file
+      # parse_proxy_file(file).each do |proxy|
+      #   self.proxy proxy
+      # end
+    end
+
+    def valid_proxy?(proxy)
+      proxy.valid?
     end
 
     # set proxies directly
     def proxies=(arr)
-      proxy
-
-
+      proxy arr
+    end
+
+    def clear_proxies
+      self.proxy_items = []
+      self.proxies_filename = nil
+    end
+
+    def proxy(host = nil,options = {})
+
+
+      if host.is_a? Array
+        host.each do |h|
+          proxy h,options
         end
+        return
       end
-    end
 
-
-
+      if host.is_a? Spider::Page::Proxy::HttpProxy
+        http_proxy = host
+      else
+        http_proxy = Spider::Page::Proxy::HttpProxy.new(host,options || {})
+      end
+
+      self.proxy_items += [http_proxy]
 
       if block_given?
-        yield
+        yield([]) # for old syntax
       end
 
     end
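Note: proxies can now be declared inline or sourced from a file under config/spiders, which is re-parsed on every proxies call and takes precedence over inline declarations. A configuration sketch (class and file names hypothetical):

    class ExamplePage < Spider::Page
      proxy "127.0.0.1:8080"                    # appended to proxy_items
      proxy ["10.0.0.1:3128", "10.0.0.2:3128"]  # arrays are expanded one by one
      # or instead read config/spiders/proxies.txt, one ip:port per line:
      # proxy_file "proxies.txt"
    end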
@@ -143,7 +230,10 @@ module Spider::Page::Proxy
 
 
   module InstanceMethods
-
+    def proxies
+      self.class.proxies
+    end
   end
 
+
 end
data/lib/spider/page/publish.rb
CHANGED
@@ -5,12 +5,12 @@ module Spider::Page::Publish
 
   included do
 
-
+    define_model_callbacks :publish
+
+    cattr_accessor :publishers
+    self.publishers = []
+    after_crawl :publish
 
-    cattr_accessor :publishers
-    self.publishers = []
-    after_crawl :publish
-
   end
 
   module ClassMethods
@@ -25,54 +25,52 @@ module Spider::Page::Publish
 
   end
 
-  module InstanceMethods
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  def publish_to(*publishers)
+    run_callbacks :publish do
+      logger.debug "publish to #{publishers}"
+      results = []
+      [publishers].flatten.each do |publisher|
+        logger.info "send self to #{publisher}"
+        logger.debug "class:#{publisher.class.name}"
+        publisher = case publisher
+                    when String,Symbol
+                      publisher.to_s.classify.constantize
+                    else
+                      # puts "default: #{publisher}"
+                      publisher
+                    end
+        logger.debug "publisher: #{publisher}"
+        result = nil
+        begin
 
-
-
-
-
-
-
-          end
-        rescue Exception=>e
-          logger.error e.message
-          logger.error e.backtrace.join("\n")
+          if publisher.respond_to?(:receive_spider_page)
+            logger.debug "#{publisher} receive spider page #{self}"
+            result = publisher.receive_spider_page self
+            logger.debug "#{publisher} return #{result}"
+          else
+            logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
           end
-
+        rescue Exception=>e
+          logger.error e.message
+          logger.error e.backtrace.join("\n")
         end
-        results
+        results << result
       end
+      results
     end
+  end
 
-
-
-
-
-
-
-
-
-  end
+  def publish
+    publishers = self.publishers.uniq
+    if [:title,:body].all?{|name| attribute_names.include?(name) }
+      logger.debug "[#{self} publish to #{publishers}"
+      publish_to(publishers)
+    else
+      logger.debug "attribute names not include :title, :body,so publish canceled."
     end
+  end
 
 
-  end
 
 end
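Note: a publisher is anything that responds to receive_spider_page; strings and symbols are run through classify/constantize first. A minimal publisher sketch (the class is hypothetical):

    class ConsolePublisher
      def self.receive_spider_page(page)
        puts "received #{page.url}"  # auto-publish also requires :title and :body attributes
      end
    end

    Spider::Page.publishers << ConsolePublisher
    # or on a single page: page.publish_to :console_publisher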
data/lib/spider/version.rb
CHANGED
data/lib/spider2.rb
ADDED
@@ -0,0 +1 @@
+require "spider"
data/lib/tasks/spider_tasks.rake
CHANGED
@@ -1,3 +1,4 @@
+# encoding: utf-8
 # desc "Explaining what the task does"
 # task :spider_fu do
 #   # Task goes here
@@ -8,25 +9,89 @@ desc "开始采集"
 namespace :spider do
 
   namespace :proxy do
-
-
-
-
-
-
-
-
+
+
+    desc "test proxy"
+    task :test2 => :environment do
+      times = ENV['TIMES'] && ENV['TIMES'].to_i || 50
+      verbose = ENV['VERBOSE']
+      proxies = Spider::Page.proxies
+      proxies_count = proxies.size
+      good_proxies = []
+
+      result = {}
+
+      times.times do |i|
+        puts "round #{i + 1}/#{times}" if verbose
+        proxies.each_with_index do |proxy,index|
+          print "(#{index + 1}/#{proxies_count} #{proxy.inspect}): " if verbose
+          result[proxy] ||= []
+          r = {}
+          time = Benchmark.ms do
+            r[:success ] = proxy.valid?
+          end / 1000
+          if r[:success]
+            print "OK" if verbose
+          else
+            print "FAILED" if verbose
+          end
+          print " " if verbose
+          print "#{time}s" if verbose
+          print "\n" if verbose
+          r[:time] = time
+          result[proxy] << r
+        end
       end
-
-
-      result
-
+
+      good_proxies = []
+      result.each_pair do |proxy,data|
+        success_times = data.find_all{|d| d[:success] }.size
+        success_ratio = success_times.to_f / times
+        average_time = data.map{|d| d[:time] }.sum / data.size
+        good_proxies << proxy if success_ratio > 0.95
+        puts "=> #{proxy}" if verbose
+        puts data.collect{|d| d[:success] ? "*" : "!"}.join("") if verbose
+        puts " => success times: #{success_times}(#{success_ratio * 100}%)" if verbose
+        puts " => average time per request: #{average_time} seconds." if verbose
+        puts if verbose
       end
-      puts
-      puts
-
-
+      puts " =============================== " if verbose
+      puts 'following proxies are 95% complete all test request:' if verbose
+      puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
+    end
+
+    desc "test proxy PAGE=XXXX::BasePage"
+    task :test => :environment do
+      times = ENV['TIMES'] && ENV['TIMES'].to_i || 10
+      verbose = ENV['VERBOSE']
+      proxies = Spider::Page.proxies
+      proxies_count = proxies.size
+      good_proxies = []
+      proxies.each_with_index do |proxy,index|
+        puts "(#{index + 1}/#{proxies_count}) => #{proxy.to_s}" if verbose
+        total_time = 0.0
+        success_times = 0
+        times.times do
+          total_time += Benchmark.ms do
+            if proxy.valid?
+              success_times += 1
+              print "*" if verbose
+            else
+              print "!" if verbose
+            end
+            STDOUT.flush
+          end / 1000
+        end
+        good_proxies << proxy if success_times == times
+        print "\n"
+        puts " => success times: #{success_times}" if verbose
+        puts " => total time:#{total_time/60} minutes" if verbose
+        puts " => average time per request: #{total_time / times} seconds." if verbose
+        puts if verbose
       end
+      puts " =============================== " if verbose
+      puts 'following proxies are 100% complete all test request:' if verbose
+      puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
     end
   end
 
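Note: both tasks read their settings from the environment; invocation sketches:

    rake spider:proxy:test TIMES=20 VERBOSE=1   # 20 rounds per proxy, with progress output
    rake spider:proxy:test2                     # 50 rounds, prints only the good proxies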
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: spider2
 version: !ruby/object:Gem::Version
-  hash:
-  prerelease:
+  hash: 13
+  prerelease: false
   segments:
   - 0
   - 0
-  -
-  version: 0.0.
+  - 9
+  version: 0.0.9
 platform: ruby
 authors:
 - aotianlong
@@ -15,54 +15,10 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2012-
-
-
-
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-    - !ruby/object:Gem::Version
-      hash: 15
-      segments:
-      - 3
-      - 2
-      - 0
-      version: 3.2.0
-  type: :runtime
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
-  name: htmlentities
-  prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-    - !ruby/object:Gem::Version
-      hash: 59
-      segments:
-      - 4
-      - 1
-      - 0
-      version: 4.1.0
-  type: :runtime
-  version_requirements: *id002
-- !ruby/object:Gem::Dependency
-  name: sqlite3
-  prerelease: false
-  requirement: &id003 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-  type: :development
-  version_requirements: *id003
+date: 2012-09-17 00:00:00 +08:00
+default_executable:
+dependencies: []
+
 description: a framework to crawl web pages
 email:
 - aotianlong@gmail.com
@@ -91,7 +47,10 @@ files:
 - lib/generators/spider_migration/spider_migration_generator.rb
 - lib/generators/spider_migration/templates/migration.rb
 - lib/spider/active_record_methods.rb
+- lib/spider/engine.rb
 - lib/spider/http.rb
+- lib/spider/httparty_patch.rb
+- lib/spider/page/cache.rb
 - lib/spider/page/filter.rb
 - lib/spider/page/label.rb
 - lib/spider/page/pagination.rb
@@ -104,6 +63,7 @@ files:
 - lib/spider/spider_page_label.rb
 - lib/spider/version.rb
 - lib/spider.rb
+- lib/spider2.rb
 - lib/tasks/spider_tasks.rake
 - MIT-LICENSE
 - Rakefile
@@ -113,6 +73,7 @@ files:
 - uninstall.rb
 - test/spider_fu_test.rb
 - test/test_helper.rb
+has_rdoc: true
 homepage: http://www.powerapple.com
 licenses: []
 
|
|
142
103
|
requirements: []
|
143
104
|
|
144
105
|
rubyforge_project:
|
145
|
-
rubygems_version: 1.
|
106
|
+
rubygems_version: 1.3.7
|
146
107
|
signing_key:
|
147
108
|
specification_version: 3
|
148
109
|
summary: spider
|