spider2 0.0.1 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,21 @@
1
1
  # encoding: utf-8
2
2
  #require "hpricot"
3
+ #
4
+
5
+ # copy from active support
6
+ require "httparty"
7
# Backport of Hash#deep_dup, installed only when Hash does not already
# provide one (e.g. an ActiveSupport version that ships it is loaded).
Hash.class_eval do
  # Returns a copy of the receiver in which every nested Hash value, at any
  # depth, is copied as well. Values that are not Hashes (arrays, strings,
  # numbers, ...) are shared with the original, exactly as with #dup.
  def deep_dup
    # Iterate the receiver and write into the copy, so the hash being
    # traversed is never the one being modified.
    each_with_object(dup) do |(key, value), copy|
      copy[key] = value.deep_dup if value.is_a?(Hash)
    end
  end
end unless Hash.method_defined?(:deep_dup)
17
+
18
+
3
19
  module Spider
4
20
  def self.logger
5
21
  unless @logger
@@ -52,30 +68,11 @@ Spider::Page.send(:include,Spider::Page::Proxy)
52
68
  require "spider/page/label"
53
69
  Spider::Page.send(:include,Spider::Page::Label)
54
70
 
71
+ require "spider/page/cache"
72
+ Spider::Page.send(:include,Spider::Page::Cache)
55
73
 
56
- spiders_dir = File.join(Rails.root,"spiders")
57
- $:.push(spiders_dir)
58
-
59
- # define constants
60
- Dir[File.join(spiders_dir,"*")].each do |dir|
61
- dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
62
- Object.const_set(dir_name.classify,Module.new)
63
- end
64
-
65
- # 先包含初始化文件
66
- init_file = File.join(spiders_dir,"init.rb")
67
- require init_file if File.exists? init_file
68
-
69
- file_patten = File.join(spiders_dir,"**","*.rb")
70
- files = Dir[file_patten]
71
-
72
- site_files = files.find_all{|i| i =~ /site\.rb/}
73
- site_files.each{|i| require i}
74
-
75
- base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
76
- base_page_files.each{|i| require i}
77
74
 
78
- files.each{|i| require i }
79
75
 
80
76
  # 包含 active record methods
81
77
  require "spider/active_record_methods"
78
+ require "spider/engine"
@@ -0,0 +1,31 @@
1
module Spider
  # Rails engine that loads the host application's spider definitions from
  # <Rails.root>/spiders while the application boots.
  class Engine < Rails::Engine
    initializer 'spider' do
      spiders_dir = File.join(Rails.root, "spiders")
      $:.push(spiders_dir)

      # Define one empty top-level module per sub-directory of spiders/,
      # named after the classified directory name, so files inside can
      # reference it as a namespace. Plain files (init.rb, ...) are skipped,
      # and a constant that already exists is never overwritten.
      Dir[File.join(spiders_dir, "*")].sort.each do |dir|
        next unless File.directory?(dir)
        const_name = File.basename(dir).classify
        Object.const_set(const_name, Module.new) unless Object.const_defined?(const_name)
      end

      # Load the optional initialization file first.
      init_file = File.join(spiders_dir, "init.rb")
      require init_file if File.exist?(init_file)

      # Sorted so the load order does not depend on the filesystem.
      files = Dir[File.join(spiders_dir, "**", "*.rb")].sort

      # Site definitions first, then base pages, then everything else;
      # `require` ignores files that were already loaded.
      files.grep(/site\.rb/).each { |file| require file }
      files.grep(/base_page\.rb/).each { |file| require file }
      files.each { |file| require file }
    end
  end
end
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ require "spider/httparty_patch"
2
3
  module Spider::Http
3
4
  include HTTParty
4
5
  headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
@@ -21,8 +22,8 @@ module Spider::Http
21
22
  end
22
23
  end
23
24
 
24
- def self.with_proxy(ip,port,&block)
25
- http_proxy ip,port
25
+ def self.with_proxy(proxy,&block)
26
+ http_proxy proxy.host,proxy.port,proxy.user,proxy.password
26
27
  result = yield
27
28
  clear_proxy
28
29
  result
@@ -31,6 +32,8 @@ module Spider::Http
31
32
# Removes every proxy-related entry (address, port, user, password) from
# Spider::Http's default options. Returns whatever the last delete returns.
def self.clear_proxy
  proxy_keys = [:http_proxyaddr, :http_proxyport, :http_proxyuser, :http_proxypassword]
  proxy_keys.map { |key| Spider::Http.default_options.delete(key) }.last
end
35
38
 
36
39
  =begin
@@ -0,0 +1,37 @@
1
# Patch for HTTParty that adds proxy user/password support: the class-level
# `http_proxy` setter records credentials, and `Request#http` passes them on
# to Net::HTTP.
module HTTParty
  module ClassMethods
    # Stores the proxy settings in `default_options`.
    #
    # addr     - proxy host (nil for none)
    # port     - proxy port
    # user     - proxy user name (optional)
    # password - proxy password (optional)
    def http_proxy(addr = nil, port = nil, user = nil, password = nil)
      default_options[:http_proxyaddr] = addr
      default_options[:http_proxyport] = port
      default_options[:http_proxyuser] = user
      default_options[:http_proxypassword] = password
    end
  end

  class Request
    # Builds the Net::HTTP connection object for this request, routed
    # through the configured proxy (with credentials) when one is set.
    def http
      http = Net::HTTP.new(
        uri.host,
        uri.port,
        options[:http_proxyaddr],
        options[:http_proxyport],
        options[:http_proxyuser],
        options[:http_proxypassword]
      )
      http.use_ssl = ssl_implied?

      timeout = options[:timeout]
      if timeout.is_a?(Integer) || timeout.is_a?(Float)
        http.open_timeout = timeout
        # Bound the wait for a response as well; otherwise a proxy that
        # accepts the connection but never answers is not covered by :timeout.
        http.read_timeout = timeout
      end

      attach_ssl_certificates(http)

      http.set_debug_output(options[:debug_output]) if options[:debug_output]

      http
    end
  end
end
37
+
@@ -4,6 +4,8 @@ class Spider::PageExistsAndDoneException < Exception; end
4
4
  require "iconv"
5
5
  require "digest/md5"
6
6
  require "htmlentities"
7
+ require "spider/spider_page"
8
+ require "v8"
7
9
  # 从本质上讲,所有的WEB页面都是一个页面(Page)
8
10
  # 每个页面拥有一些属性,比如(encoding,title,url)
9
11
  # 每个页面有我们感兴趣的信息,我们需要提取出来
@@ -37,16 +39,17 @@ class Spider::Page
37
39
  @coder ||= HTMLEntities.new
38
40
  end
39
41
 
40
- def coder
41
- self.class.coder
42
+ def self.parse_query(string)
43
+ Rack::Utils.parse_query(string)
42
44
  end
43
45
 
44
- def self.class_attribute(*args)
45
- # class_attribute跟class_inheritable_accessor对待Array,Hash的方式还存在差异
46
- # 现在还是得使用class_inheritable_accessor
47
- class_inheritable_accessor *args # 目前还没找到更好的方式
46
+ def parse_query(string)
47
+ self.class.parse_query string
48
48
  end
49
49
 
50
+ def coder
51
+ self.class.coder
52
+ end
50
53
 
51
54
  extend ActiveModel::Callbacks
52
55
 
@@ -61,9 +64,17 @@ class Spider::Page
61
64
  self.options = {}
62
65
  self.options[:example_url] ||= []
63
66
 
64
-
65
67
  SEPARATOR = "<!-- PAGINATE SEPARATOR -->"
66
68
 
69
# Gives every subclass its own copy of the parent's options so that changes
# made in a subclass do not leak back into the parent or into siblings.
# Array and Hash values are copied one level deep as well, because a plain
# `dup` would leave containers such as options[:example_url] shared.
def self.inherited(subclass)
  copy = options.dup
  options.each do |key, value|
    copy[key] = value.dup if value.is_a?(Array) || value.is_a?(Hash)
  end
  subclass.options = copy
  super
end

# Instance-level shortcut to the class-level options.
def options
  self.class.options
end
77
+
67
78
  @@paginate_symbol = "--NEXTPAGE--"
68
79
  cattr_accessor :paginate_symbol
69
80
 
@@ -115,7 +126,7 @@ class Spider::Page
115
126
  end
116
127
 
117
128
  def self.define_attribute(attribute)
118
- self.attribute_names << attribute
129
+ self.attribute_names += [attribute]
119
130
  self.attribute_names.uniq!
120
131
  self.attribute_names.compact!
121
132
  attribute
@@ -140,6 +151,18 @@ class Spider::Page
140
151
  hash
141
152
  end
142
153
 
154
# Returns this page's V8::Context, creating it on first use.
# For details about V8, refer to therubyracer gem.
def v8
  return @v8 if @v8
  @v8 = V8::Context.new
end

# Evaluates a JavaScript snippet in this page's V8 context and returns the
# result of the evaluation. Example:
#
#   eval_js "info = {name:'name'}"
#   v8['info']['name'] # => "name"
#
# NOTE(review): if `js` is taken from fetched pages it is untrusted input;
# confirm the callers before relying on this for arbitrary content.
def eval_js(js)
  context = v8
  context.eval(js)
end
165
+
143
166
  # 对 <base href="xxxx" /> 的标记进行快捷获取
144
167
  def base_href
145
168
  doc.at("base").try(:attributes).try(:[],"href")
@@ -148,12 +171,11 @@ class Spider::Page
148
171
 
149
172
  # 从url的query string中分析得到params
150
173
  def params
151
- Rack::Utils.parse_query(uri.query).tap do |r|
152
- # r.symbolize_keys!
153
- r.each_pair do |key,value|
154
- r[key.to_sym] = value
155
- end
174
+ h = {}
175
+ Rack::Utils.parse_query(uri.query).each_pair do |key,value|
176
+ h[key.to_sym] = value
156
177
  end
178
+ h
157
179
  end
158
180
 
159
181
  # 提供对 attributes 的快捷访问
@@ -559,7 +581,7 @@ class Spider::Page
559
581
  if exists?
560
582
  page = spider_page
561
583
  else
562
- page = Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
584
+ page = ::Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
563
585
  end
564
586
  page.content_length = content_length
565
587
  #page.labels_hash = Digest::MD5.hexdigest(labels.to_yaml)
@@ -744,16 +766,4 @@ class Spider::Page
744
766
  end
745
767
 
746
768
 
747
- def fetch_content_from_url_with_cache(options={})
748
- key = Digest::MD5.hexdigest(options.to_json.to_s) + "/" + Digest::MD5.hexdigest(url)
749
- @content ||= Rails.cache.fetch key do
750
- fetch_content_from_url_without_cache(options)
751
- end
752
- @content_length = @content.length
753
- @content
754
- end
755
-
756
- # alias_method_chain :fetch_content_from_url,:cache
757
-
758
-
759
769
  end
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
# Optional on-disk cache of fetched page content. When included it wraps
# `fetch_content_from_url` so that, if caching is enabled for the class, the
# fetched content is also written to a file keyed by the MD5 of the page URL.
module Spider::Page::Cache
  extend ActiveSupport::Concern

  included do
    class_attribute :cache_enabled
    self.cache_enabled = false
    alias_method_chain :fetch_content_from_url, :cache
  end

  # Fetches the content through the original implementation and, when the
  # cache is enabled and content was actually obtained, stores it on disk.
  # Returns @content.
  def fetch_content_from_url_with_cache(*args)
    fetch_content_from_url_without_cache(*args)
    # Skip the write when nothing was fetched, so a failed fetch does not
    # leave an empty cache file behind.
    if cache_enabled && @content
      path = cache_file("content")
      FileUtils.mkdir_p File.dirname(path)
      File.open(path, "w+") { |file| file.write @content }
    end
    @content
  end

  # Returns the content previously stored for this page's URL, or nil when
  # the cache is disabled or nothing has been stored yet.
  def cached_content
    return unless cache_enabled
    path = cache_file("content")
    File.read(path) if File.file?(path)
  end

  protected

  # Builds the cache path for this page: "<md5 of url><name>", placed in a
  # sub-directory named after the first three characters, under
  # Rails.root/tmp when Rails is defined and under /tmp otherwise.
  def cache_file(name = '')
    file = Digest::MD5.hexdigest(url) + name
    file = "#{file[0,3]}/#{file}"
    if defined? Rails
      Rails.root.join("tmp", file).to_s
    else
      "/tmp/#{file}"
    end
  end

  module ClassMethods
    # Turns the on-disk cache on for this class.
    def enable_cache
      self.cache_enabled = true
    end

    # Turns the on-disk cache off for this class.
    def disable_cache
      self.cache_enabled = false
    end
  end
end
@@ -41,6 +41,7 @@ module Spider::Page::Filter
41
41
  options.assert_valid_keys :filters,:position
42
42
  position = options[:position]
43
43
  position = position.to_s + "_" if position
44
+ send("attributes_#{position}filters=",{})
44
45
  filter_attrs = send("attributes_#{position}filters")
45
46
  logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
46
47
  args.each do |attr_name|
@@ -3,9 +3,6 @@
3
3
  module Spider::Page::Label
4
4
  extend ActiveSupport::Concern
5
5
 
6
- module InstanceMethods
7
- end
8
-
9
6
  module ClassMethods
10
7
  def label(name,options = {},&block)
11
8
  name = name.to_sym
@@ -1,22 +1,97 @@
1
1
  # encoding: utf-8
2
2
  module Spider::Page::Proxy
3
# One HTTP proxy endpoint, optionally with user/password credentials.
class HttpProxy
  # Port used when the proxy string does not name one.
  DEFAULT_PORT = 80

  attr_accessor :user, :password, :host, :port

  # Parses "[user[:password]@]host[:port]" into a Hash.
  #
  #   parse("aotianlong:password@192.168.1.1:8000")
  #   # => { :user => "aotianlong", :password => "password",
  #   #      :host => "192.168.1.1", :port => 8000 }
  #
  # :port is always an Integer (DEFAULT_PORT when omitted); :user and
  # :password are present only when given. nil or empty input yields a nil
  # :host instead of raising.
  def self.parse(str)
    # Split on the LAST "@" so that credentials may themselves contain "@".
    userinfo, at, addrinfo = str.to_s.strip.rpartition("@")
    host, port = addrinfo.split(":", 2)
    port = port.to_s.strip

    hash = { :host => host, :port => (port.empty? ? DEFAULT_PORT : port.to_i) }
    unless at.empty?
      # Split on the FIRST ":" so that the password may contain ":".
      user, password = userinfo.split(":", 2)
      hash[:user] = user if user && !user.empty?
      hash[:password] = password if password && !password.empty?
    end
    hash
  end

  # "user:password@host:port" when credentials are set, else "host:port".
  def to_s
    if user && password
      "#{user}:#{password}@#{host}:#{port}"
    else
      "#{host}:#{port}"
    end
  end

  def inspect
    to_s
  end

  # host    - proxy string understood by HttpProxy.parse
  # options - Hash whose :host/:port/:user/:password entries override the
  #           parsed values
  def initialize(host, options = {})
    hash = self.class.parse host
    hash.merge! options
    @host = hash[:host]
    @port = hash[:port]
    @user = hash[:user]
    @password = hash[:password]
  end

  # Checks the proxy by requesting a URL through it.
  #
  # options - :url     URL to request (default "http://www.google.com")
  #           :code    expected response code (default 200)
  #           :timeout seconds allowed for the whole check (default 10)
  #           :match   optional pattern the response body must match
  #
  # Returns true when the response has the expected code (and matches
  # :match when given); false otherwise, including on timeouts and errors.
  # The options hash passed in is not modified.
  def valid?(options = {})
    url     = options[:url] || "http://www.google.com"
    code    = options[:code] || 200
    seconds = options[:timeout] || 10
    pattern = options[:match]

    Spider::Http.with_proxy self do
      begin
        ::Timeout.timeout(seconds) do
          response = Spider::Http.get url
          ok = response.code == code
          ok &&= !!(response.to_s =~ pattern) if pattern
          ok
        end
      rescue StandardError, ::Timeout::Error
        # A failing proxy is reported as invalid; interrupts and exit
        # requests are deliberately not swallowed.
        false
      end
    end
  end
end
74
+
3
75
  def self.included(base)
4
76
  base.send(:include,InstanceMethods)
5
77
  base.send(:extend,ClassMethods)
6
78
  base.class_eval do
7
- class_attribute :proxies
79
+ class_attribute :proxy_items
8
80
  class_attribute :disabled_proxies
9
- self.proxies = []
81
+ class_attribute :current_proxies
82
+ class_attribute :proxies_filename
83
+ self.current_proxies = []
10
84
  self.disabled_proxies = []
85
+ self.proxies_filename = nil
86
+ self.proxy_items = []
11
87
 
12
88
  before_fetch do |page|
13
- proxies.compact!
14
- proxies.uniq!
15
- host,port = proxies.shuffle.first
16
- port ||= 80
17
- if host
18
- logger.debug "set proxy: #{host}:#{port}"
19
- Spider::Http.http_proxy host,port
89
+ self.current_proxies = proxies.shuffle.first(5) if self.current_proxies.empty?
90
+ logger.debug "current_proxies: #{current_proxies.inspect}"
91
+ proxy = current_proxies.shuffle.first
92
+ if proxy.try(:host)
93
+ logger.debug "set proxy: #{proxy.inspect}"
94
+ Spider::Http.http_proxy proxy.host,proxy.port,proxy.user,proxy.password
20
95
  else
21
96
  Spider::Http.clear_proxy
22
97
  end
@@ -25,15 +100,17 @@ module Spider::Page::Proxy
25
100
  after_fetch do |page|
26
101
  logger.debug "reset proxy"
27
102
  # Spider::Http.http_proxy old_host,old_port
28
- if page.content.blank?
103
+ if page.content.blank? #|| page.code == 502 # bad gateway
29
104
  # retry, and set proxy to disabled
30
105
  # proxies
31
- puts "proxies before:#{self.proxies.inspect}"
32
- disabled_proxy = proxies.find{|proxy| proxy.first == Spider::Http.default_options[:http_proxyaddr] }
33
- proxies.delete disabled_proxy
34
- self.disabled_proxies += [disabled_proxy]
35
- puts "proxies after:#{self.proxies.inspect}"
36
- unless proxies.empty?
106
+ puts "proxies before:#{self.current_proxies.inspect}"
107
+ disabled_proxy = current_proxies.find{|proxy| proxy.host == Spider::Http.default_options[:http_proxyaddr] && proxy.port == Spider::Http.default_options[:http_proxyport] }
108
+ if disabled_proxy
109
+ current_proxies.delete disabled_proxy
110
+ self.disabled_proxies += [disabled_proxy]
111
+ puts "proxies after:#{self.current_proxies.inspect}"
112
+ end
113
+ unless current_proxies.empty?
37
114
  puts 'retry'
38
115
  page.request
39
116
  next
@@ -42,7 +119,7 @@ module Spider::Page::Proxy
42
119
  # no proxies available
43
120
  # recover proxies
44
121
  # 以便下次仍然使用(防止一次意外失败,而永久排除)
45
- self.proxies += self.disabled_proxies
122
+ self.current_proxies = []
46
123
  self.disabled_proxies = []
47
124
  # 不用代理服务器使用自身来获取
48
125
  end
@@ -57,83 +134,93 @@ module Spider::Page::Proxy
57
134
  module ClassMethods
58
135
 
59
136
  def disable_proxy
60
- proxy(nil,nil)
137
+ proxy(nil)
61
138
  end
62
139
 
63
140
  def validate_proxies
64
141
  valid_proxies = proxies.find_all do |proxy|
65
- valid_proxy?(*proxy)
142
+ valid_proxy?(proxy)
66
143
  end
67
144
  invalid_proxies = proxies - valid_proxies
68
145
  {:valid => valid_proxies,:invalid => invalid_proxies}
69
146
  end
70
147
 
71
- # 指定一个 file 作为 proxy 来源
72
- # # ip:port
73
- def proxy_file(file)
74
- config_root = File.join(Rails.root,"config","spiders")
75
- if file =~ /^\//
76
- # absolute path
77
- content = File.read file
148
+ def proxies
149
+ if proxies_filename
150
+ parse_proxy_file proxies_filename
78
151
  else
79
- content = File.read(File.join(config_root,file))
152
+ proxy_items
80
153
  end
154
+ end
155
+
156
+ def parse_proxies(content)
81
157
  proxies = []
82
158
  content.each_line do |line|
83
159
  line = line.strip
84
160
  if line =~ /^\s*#/
85
161
  # 注释
86
162
  else
87
- if line =~ /\d+?\.\d+?\.\d+?\.\d+?/
88
- ip,port = line.split(":")
89
- port ||= 80
90
- proxies += [[ip,port]]
91
- end
92
- end
93
- end
94
- self.proxy do |the_proxies|
95
- proxies.each do |p|
96
- the_proxies += [p]
163
+ # proxy line,options
164
+ proxies << Spider::Page::Proxy::HttpProxy.new(line)
97
165
  end
98
166
  end
167
+ proxies
99
168
  end
100
169
 
101
- def valid_proxy?(ip,port = 80,options = {})
102
- options[:url] ||= "http://www.google.com"
103
- options[:code] ||= 200
104
- options[:timeout] ||= 10
105
- # options[:match] ||= //
106
- Spider::Http.with_proxy ip,port do
107
- begin
108
- timeout options[:timeout] do
109
- response = Spider::Http.get options[:url]
110
- r = response.code == options[:code]
111
- if options[:match]
112
- r && (response.to_s =~ options[:match])
113
- else
114
- r
115
- end
116
- end
117
- rescue Exception => e
118
- false
119
- end
170
# Reads a proxy list file and returns the proxies parsed from it by
# `parse_proxies`. A name starting with "/" is read as given; any other
# name is looked up under <Rails.root>/config/spiders.
def parse_proxy_file(file)
  config_root = File.join(Rails.root,"config","spiders")
  path = file =~ /^\// ? file : File.join(config_root,file)
  parse_proxies File.read(path)
end
180
+
181
# Names a file as the source of proxies (one "ip:port" entry per line).
# Only the file name is recorded here; nothing is read or parsed by this
# method. `options` is accepted but not used.
def proxy_file(file,options = {})
  self.proxies_filename = file
end
189
+
190
# Tells whether a proxy is usable.
#
# proxy   - an object responding to `valid?` (an HttpProxy), or a proxy
#           string such as "1.2.3.4" / "1.2.3.4:8080" (the older call form)
# port    - optional port, used only together with a proxy string
# options - options forwarded to the proxy's `valid?` check
#
# Returns the result of the proxy's own `valid?` check.
def valid_proxy?(proxy, port = nil, options = {})
  unless proxy.respond_to?(:valid?)
    overrides = port ? { :port => port } : {}
    proxy = Spider::Page::Proxy::HttpProxy.new(proxy.to_s, overrides)
  end
  proxy.valid?(options)
end
122
193
 
123
194
  # 直接设置 proxies
124
195
  def proxies=(arr)
125
- proxy do |ps|
126
- arr.each do |a|
127
- ps += [a]
196
+ proxy arr
197
+ end
198
+
199
# Forgets every configured proxy: the explicit list, the proxy file name,
# and the working sets built from them. The working sets are reset too
# because proxies already picked into current_proxies would otherwise keep
# being used after the configuration was cleared.
def clear_proxies
  self.proxy_items = []
  self.current_proxies = []
  self.disabled_proxies = []
  self.proxies_filename = nil
end
203
+
204
+ def proxy(host = nil,options = {})
205
+
206
+
207
+ if host.is_a? Array
208
+ host.each do |h|
209
+ proxy h,options
128
210
  end
211
+ return
129
212
  end
130
- end
131
213
 
132
- def proxy(host = nil,port = 80,&block)
133
- self.proxies += [[host,port]] if host
214
+ if host.is_a? Spider::Page::Proxy::HttpProxy
215
+ http_proxy = host
216
+ else
217
+ http_proxy = Spider::Page::Proxy::HttpProxy.new(host,options || {})
218
+ end
219
+
220
+ self.proxy_items += [http_proxy]
134
221
 
135
222
  if block_given?
136
- yield self.proxies
223
+ yield([]) # for old syntax
137
224
  end
138
225
 
139
226
  end
@@ -143,7 +230,10 @@ module Spider::Page::Proxy
143
230
 
144
231
 
145
232
  module InstanceMethods
146
-
233
+ def proxies
234
+ self.class.proxies
235
+ end
147
236
  end
148
237
 
238
+
149
239
  end
@@ -5,12 +5,12 @@ module Spider::Page::Publish
5
5
 
6
6
  included do
7
7
 
8
- define_model_callbacks :publish
8
+ define_model_callbacks :publish
9
+
10
+ cattr_accessor :publishers
11
+ self.publishers = []
12
+ after_crawl :publish
9
13
 
10
- cattr_accessor :publishers
11
- self.publishers = []
12
- after_crawl :publish
13
-
14
14
  end
15
15
 
16
16
  module ClassMethods
@@ -25,54 +25,52 @@ module Spider::Page::Publish
25
25
 
26
26
  end
27
27
 
28
- module InstanceMethods
29
28
 
30
- def publish_to(*publishers)
31
- run_callbacks :publish do
32
- logger.debug "publish to #{publishers}"
33
- results = []
34
- [publishers].flatten.each do |publisher|
35
- logger.info "send self to #{publisher}"
36
- logger.debug "class:#{publisher.class.name}"
37
- publisher = case publisher
38
- when String,Symbol
39
- publisher.to_s.classify.constantize
40
- else
41
- # puts "default: #{publisher}"
42
- publisher
43
- end
44
- logger.debug "publisher: #{publisher}"
45
- result = nil
46
- begin
29
+ def publish_to(*publishers)
30
+ run_callbacks :publish do
31
+ logger.debug "publish to #{publishers}"
32
+ results = []
33
+ [publishers].flatten.each do |publisher|
34
+ logger.info "send self to #{publisher}"
35
+ logger.debug "class:#{publisher.class.name}"
36
+ publisher = case publisher
37
+ when String,Symbol
38
+ publisher.to_s.classify.constantize
39
+ else
40
+ # puts "default: #{publisher}"
41
+ publisher
42
+ end
43
+ logger.debug "publisher: #{publisher}"
44
+ result = nil
45
+ begin
47
46
 
48
- if publisher.respond_to?(:receive_spider_page)
49
- logger.debug "#{publisher} receive spider page #{self}"
50
- result = publisher.receive_spider_page self
51
- logger.debug "#{publisher} return #{result}"
52
- else
53
- logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
54
- end
55
- rescue Exception=>e
56
- logger.error e.message
57
- logger.error e.backtrace.join("\n")
47
+ if publisher.respond_to?(:receive_spider_page)
48
+ logger.debug "#{publisher} receive spider page #{self}"
49
+ result = publisher.receive_spider_page self
50
+ logger.debug "#{publisher} return #{result}"
51
+ else
52
+ logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
58
53
  end
59
- results << result
54
+ rescue Exception=>e
55
+ logger.error e.message
56
+ logger.error e.backtrace.join("\n")
60
57
  end
61
- results
58
+ results << result
62
59
  end
60
+ results
63
61
  end
62
+ end
64
63
 
65
- def publish
66
- publishers = self.publishers.uniq
67
- if [:title,:body].all?{|name| attribute_names.include?(name) }
68
- logger.debug "[#{self} publish to #{publishers}"
69
- publish_to(publishers)
70
- else
71
- logger.debug "attribute names not include :title, :body,so publish canceled."
72
- end
64
+ def publish
65
+ publishers = self.publishers.uniq
66
+ if [:title,:body].all?{|name| attribute_names.include?(name) }
67
+ logger.debug "[#{self} publish to #{publishers}"
68
+ publish_to(publishers)
69
+ else
70
+ logger.debug "attribute names not include :title, :body,so publish canceled."
73
71
  end
72
+ end
74
73
 
75
74
 
76
- end
77
75
 
78
76
  end
@@ -1,3 +1,3 @@
1
1
  module Spider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -0,0 +1 @@
1
+ require "spider"
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  # desc "Explaining what the task does"
2
3
  # task :spider_fu do
3
4
  # # Task goes here
@@ -8,25 +9,89 @@ desc "开始采集"
8
9
  namespace :spider do
9
10
 
10
11
  namespace :proxy do
11
- desc "test proxy PAGE=XXXX::BasePage"
12
- task :test => :environment do
13
- klass = ENV['PAGE']
14
- begin
15
- klass = klass.constantize
16
- rescue Exception => e
17
- puts "unknow class `#{klass}`, please set a right spider page class to PAGE=XXXX::XxxPage"
18
- exit
12
+
13
+
14
+ desc "test proxy"
15
+ task :test2 => :environment do
16
+ times = ENV['TIMES'] && ENV['TIMES'].to_i || 50
17
+ verbose = ENV['VERBOSE']
18
+ proxies = Spider::Page.proxies
19
+ proxies_count = proxies.size
20
+ good_proxies = []
21
+
22
+ result = {}
23
+
24
+ times.times do |i|
25
+ puts "round #{i + 1}/#{times}" if verbose
26
+ proxies.each_with_index do |proxy,index|
27
+ print "(#{index + 1}/#{proxies_count} #{proxy.inspect}): " if verbose
28
+ result[proxy] ||= []
29
+ r = {}
30
+ time = Benchmark.ms do
31
+ r[:success ] = proxy.valid?
32
+ end / 1000
33
+ if r[:success]
34
+ print "OK" if verbose
35
+ else
36
+ print "FAILED" if verbose
37
+ end
38
+ print " " if verbose
39
+ print "#{time}s" if verbose
40
+ print "\n" if verbose
41
+ r[:time] = time
42
+ result[proxy] << r
43
+ end
19
44
  end
20
- result = klass.validate_proxies
21
- puts "valid proxies:"
22
- result[:valid].each do |proxy|
23
- puts proxy.join(":")
45
+
46
+ good_proxies = []
47
+ result.each_pair do |proxy,data|
48
+ success_times = data.find_all{|d| d[:success] }.size
49
+ success_ratio = success_times.to_f / times
50
+ average_time = data.map{|d| d[:time] }.sum / data.size
51
+ good_proxies << proxy if success_ratio > 0.95
52
+ puts "=> #{proxy}" if verbose
53
+ puts data.collect{|d| d[:success] ? "*" : "!"}.join("") if verbose
54
+ puts " => success times: #{success_times}(#{success_ratio * 100}%)" if verbose
55
+ puts " => average time per request: #{average_time} seconds." if verbose
56
+ puts if verbose
24
57
  end
25
- puts
26
- puts "invalid proxies:"
27
- result[:valid].each do |proxy|
28
- puts proxy.join(":")
58
+ puts " =============================== " if verbose
59
+ puts 'following proxies are 95% complete all test request:' if verbose
60
+ puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
61
+ end
62
+
63
+ desc "test proxy PAGE=XXXX::BasePage"
64
+ task :test => :environment do
65
+ times = ENV['TIMES'] && ENV['TIMES'].to_i || 10
66
+ verbose = ENV['VERBOSE']
67
+ proxies = Spider::Page.proxies
68
+ proxies_count = proxies.size
69
+ good_proxies = []
70
+ proxies.each_with_index do |proxy,index|
71
+ puts "(#{index + 1}/#{proxies_count}) => #{proxy.to_s}" if verbose
72
+ total_time = 0.0
73
+ success_times = 0
74
+ times.times do
75
+ total_time += Benchmark.ms do
76
+ if proxy.valid?
77
+ success_times += 1
78
+ print "*" if verbose
79
+ else
80
+ print "!" if verbose
81
+ end
82
+ STDOUT.flush
83
+ end / 1000
84
+ end
85
+ good_proxies << proxy if success_times == times
86
+ print "\n"
87
+ puts " => success times: #{success_times}" if verbose
88
+ puts " => total time:#{total_time/60} minutes" if verbose
89
+ puts " => average time per request: #{total_time / times} seconds." if verbose
90
+ puts if verbose
29
91
  end
92
+ puts " =============================== " if verbose
93
+ puts 'following proxies are 100% complete all test request:' if verbose
94
+ puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
30
95
  end
31
96
  end
32
97
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider2
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
5
- prerelease:
4
+ hash: 13
5
+ prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 9
10
+ version: 0.0.9
11
11
  platform: ruby
12
12
  authors:
13
13
  - aotianlong
@@ -15,54 +15,10 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-03-04 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: rails
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ~>
27
- - !ruby/object:Gem::Version
28
- hash: 15
29
- segments:
30
- - 3
31
- - 2
32
- - 0
33
- version: 3.2.0
34
- type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: htmlentities
38
- prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
42
- - - ~>
43
- - !ruby/object:Gem::Version
44
- hash: 59
45
- segments:
46
- - 4
47
- - 1
48
- - 0
49
- version: 4.1.0
50
- type: :runtime
51
- version_requirements: *id002
52
- - !ruby/object:Gem::Dependency
53
- name: sqlite3
54
- prerelease: false
55
- requirement: &id003 !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ">="
59
- - !ruby/object:Gem::Version
60
- hash: 3
61
- segments:
62
- - 0
63
- version: "0"
64
- type: :development
65
- version_requirements: *id003
18
+ date: 2012-09-17 00:00:00 +08:00
19
+ default_executable:
20
+ dependencies: []
21
+
66
22
  description: a framework to crawl web pages
67
23
  email:
68
24
  - aotianlong@gmail.com
@@ -91,7 +47,10 @@ files:
91
47
  - lib/generators/spider_migration/spider_migration_generator.rb
92
48
  - lib/generators/spider_migration/templates/migration.rb
93
49
  - lib/spider/active_record_methods.rb
50
+ - lib/spider/engine.rb
94
51
  - lib/spider/http.rb
52
+ - lib/spider/httparty_patch.rb
53
+ - lib/spider/page/cache.rb
95
54
  - lib/spider/page/filter.rb
96
55
  - lib/spider/page/label.rb
97
56
  - lib/spider/page/pagination.rb
@@ -104,6 +63,7 @@ files:
104
63
  - lib/spider/spider_page_label.rb
105
64
  - lib/spider/version.rb
106
65
  - lib/spider.rb
66
+ - lib/spider2.rb
107
67
  - lib/tasks/spider_tasks.rake
108
68
  - MIT-LICENSE
109
69
  - Rakefile
@@ -113,6 +73,7 @@ files:
113
73
  - uninstall.rb
114
74
  - test/spider_fu_test.rb
115
75
  - test/test_helper.rb
76
+ has_rdoc: true
116
77
  homepage: http://www.powerapple.com
117
78
  licenses: []
118
79
 
@@ -142,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
103
  requirements: []
143
104
 
144
105
  rubyforge_project:
145
- rubygems_version: 1.8.15
106
+ rubygems_version: 1.3.7
146
107
  signing_key:
147
108
  specification_version: 3
148
109
  summary: spider