spider2 0.0.1 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,21 @@
1
1
  # encoding: utf-8
2
2
  #require "hpricot"
3
+ #
4
+
5
+ # copy from active support
6
+ require "httparty"
7
+ Hash.class_eval do
8
+ def deep_dup
9
+ duplicate = self.dup
10
+ duplicate.each_pair do |k,v|
11
+ tv = duplicate[k]
12
+ duplicate[k] = tv.is_a?(Hash) && v.is_a?(Hash) ? tv.deep_dup : v
13
+ end
14
+ duplicate
15
+ end
16
+ end unless Hash.new.respond_to? :deep_dup
17
+
18
+
3
19
  module Spider
4
20
  def self.logger
5
21
  unless @logger
@@ -52,30 +68,11 @@ Spider::Page.send(:include,Spider::Page::Proxy)
52
68
  require "spider/page/label"
53
69
  Spider::Page.send(:include,Spider::Page::Label)
54
70
 
71
+ require "spider/page/cache"
72
+ Spider::Page.send(:include,Spider::Page::Cache)
55
73
 
56
- spiders_dir = File.join(Rails.root,"spiders")
57
- $:.push(spiders_dir)
58
-
59
- # define constants
60
- Dir[File.join(spiders_dir,"*")].each do |dir|
61
- dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
62
- Object.const_set(dir_name.classify,Module.new)
63
- end
64
-
65
- # 先包含初始化文件
66
- init_file = File.join(spiders_dir,"init.rb")
67
- require init_file if File.exists? init_file
68
-
69
- file_patten = File.join(spiders_dir,"**","*.rb")
70
- files = Dir[file_patten]
71
-
72
- site_files = files.find_all{|i| i =~ /site\.rb/}
73
- site_files.each{|i| require i}
74
-
75
- base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
76
- base_page_files.each{|i| require i}
77
74
 
78
- files.each{|i| require i }
79
75
 
80
76
  # 包含 active record methods
81
77
  require "spider/active_record_methods"
78
+ require "spider/engine"
@@ -0,0 +1,31 @@
1
+ module Spider
2
+ class Engine < Rails::Engine
3
+ initializer 'spider' do
4
+ spiders_dir = File.join(Rails.root,"spiders")
5
+ $:.push(spiders_dir)
6
+
7
+ # define constants
8
+ Dir[File.join(spiders_dir,"*")].each do |dir|
9
+ dir_name = dir.gsub(spiders_dir,"").gsub(/^\//,"")
10
+ Object.const_set(dir_name.classify,Module.new)
11
+ end
12
+
13
+ # 先包含初始化文件
14
+ init_file = File.join(spiders_dir,"init.rb")
15
+ require init_file if File.exists? init_file
16
+
17
+ file_patten = File.join(spiders_dir,"**","*.rb")
18
+ files = Dir[file_patten]
19
+
20
+ site_files = files.find_all{|i| i =~ /site\.rb/}
21
+ site_files.each{|i| require i}
22
+
23
+ base_page_files = files.find_all{|i| i =~ /base_page\.rb/}
24
+ base_page_files.each{|i| require i}
25
+
26
+ files.each{|i| require i }
27
+
28
+ end
29
+
30
+ end
31
+ end
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ require "spider/httparty_patch"
2
3
  module Spider::Http
3
4
  include HTTParty
4
5
  headers "User-Agent"=>"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020518 Ubuntu/9.04 (jaunty) Firefox/3.0.6"
@@ -21,8 +22,8 @@ module Spider::Http
21
22
  end
22
23
  end
23
24
 
24
- def self.with_proxy(ip,port,&block)
25
- http_proxy ip,port
25
+ def self.with_proxy(proxy,&block)
26
+ http_proxy proxy.host,proxy.port,proxy.user,proxy.password
26
27
  result = yield
27
28
  clear_proxy
28
29
  result
@@ -31,6 +32,8 @@ module Spider::Http
31
32
  def self.clear_proxy
32
33
  Spider::Http.default_options.delete :http_proxyaddr
33
34
  Spider::Http.default_options.delete :http_proxyport
35
+ Spider::Http.default_options.delete :http_proxyuser
36
+ Spider::Http.default_options.delete :http_proxypassword
34
37
  end
35
38
 
36
39
  =begin
@@ -0,0 +1,37 @@
1
+ module HTTParty
2
+ module ClassMethods
3
+ def http_proxy(addr=nil, port = nil, user=nil, password=nil)
4
+ default_options[:http_proxyaddr] = addr
5
+ default_options[:http_proxyport] = port
6
+ default_options[:http_proxyuser] = user
7
+ default_options[:http_proxypassword] = password
8
+ end
9
+ end
10
+
11
+ class Request
12
+ def http
13
+ http = Net::HTTP.new(
14
+ uri.host,
15
+ uri.port,
16
+ options[:http_proxyaddr],
17
+ options[:http_proxyport],
18
+ options[:http_proxyuser],
19
+ options[:http_proxypassword]
20
+ )
21
+ http.use_ssl = ssl_implied?
22
+
23
+ if options[:timeout] && (options[:timeout].is_a?(Integer) || options[:timeout].is_a?(Float))
24
+ http.open_timeout = options[:timeout]
25
+ end
26
+
27
+ attach_ssl_certificates(http)
28
+
29
+ if options[:debug_output]
30
+ http.set_debug_output(options[:debug_output])
31
+ end
32
+
33
+ http
34
+ end
35
+ end
36
+ end
37
+
@@ -4,6 +4,8 @@ class Spider::PageExistsAndDoneException < Exception; end
4
4
  require "iconv"
5
5
  require "digest/md5"
6
6
  require "htmlentities"
7
+ require "spider/spider_page"
8
+ require "v8"
7
9
  # 从本质上讲,所有的WEB页面都是一个页面(Page)
8
10
  # 每个页面拥有一些属性,比如(encoding,title,url)
9
11
  # 每个页面有我们感兴趣的信息,我们需要提取出来
@@ -37,16 +39,17 @@ class Spider::Page
37
39
  @coder ||= HTMLEntities.new
38
40
  end
39
41
 
40
- def coder
41
- self.class.coder
42
+ def self.parse_query(string)
43
+ Rack::Utils.parse_query(string)
42
44
  end
43
45
 
44
- def self.class_attribute(*args)
45
- # class_attribute跟class_inheritable_accessor对待Array,Hash的方式还存在差异
46
- # 现在还是得使用class_inheritable_accessor
47
- class_inheritable_accessor *args # 目前还没找到更好的方式
46
+ def parse_query(string)
47
+ self.class.parse_query string
48
48
  end
49
49
 
50
+ def coder
51
+ self.class.coder
52
+ end
50
53
 
51
54
  extend ActiveModel::Callbacks
52
55
 
@@ -61,9 +64,17 @@ class Spider::Page
61
64
  self.options = {}
62
65
  self.options[:example_url] ||= []
63
66
 
64
-
65
67
  SEPARATOR = "<!-- PAGINATE SEPARATOR -->"
66
68
 
69
+ def self.inherited(subclass)
70
+ subclass.options = options.dup
71
+ super
72
+ end
73
+
74
+ def options
75
+ self.class.options
76
+ end
77
+
67
78
  @@paginate_symbol = "--NEXTPAGE--"
68
79
  cattr_accessor :paginate_symbol
69
80
 
@@ -115,7 +126,7 @@ class Spider::Page
115
126
  end
116
127
 
117
128
  def self.define_attribute(attribute)
118
- self.attribute_names << attribute
129
+ self.attribute_names += [attribute]
119
130
  self.attribute_names.uniq!
120
131
  self.attribute_names.compact!
121
132
  attribute
@@ -140,6 +151,18 @@ class Spider::Page
140
151
  hash
141
152
  end
142
153
 
154
+ def v8
155
+ @v8 ||= V8::Context.new
156
+ end
157
+
158
+ # eval_js "info = {name:'name'}"
159
+ # v8['info']['name'] => "name"
160
+ # v8 is a V8::Context instance
161
+ # for details about V8 , refer to therubyracer gem
162
+ def eval_js(js)
163
+ v8.eval js
164
+ end
165
+
143
166
  # 对 <base href="xxxx" /> 的标记进行快捷获取
144
167
  def base_href
145
168
  doc.at("base").try(:attributes).try(:[],"href")
@@ -148,12 +171,11 @@ class Spider::Page
148
171
 
149
172
  # 从url的query string中分析得到params
150
173
  def params
151
- Rack::Utils.parse_query(uri.query).tap do |r|
152
- # r.symbolize_keys!
153
- r.each_pair do |key,value|
154
- r[key.to_sym] = value
155
- end
174
+ h = {}
175
+ Rack::Utils.parse_query(uri.query).each_pair do |key,value|
176
+ h[key.to_sym] = value
156
177
  end
178
+ h
157
179
  end
158
180
 
159
181
  # 提供对 attributes 的快捷访问
@@ -559,7 +581,7 @@ class Spider::Page
559
581
  if exists?
560
582
  page = spider_page
561
583
  else
562
- page = Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
584
+ page = ::Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
563
585
  end
564
586
  page.content_length = content_length
565
587
  #page.labels_hash = Digest::MD5.hexdigest(labels.to_yaml)
@@ -744,16 +766,4 @@ class Spider::Page
744
766
  end
745
767
 
746
768
 
747
- def fetch_content_from_url_with_cache(options={})
748
- key = Digest::MD5.hexdigest(options.to_json.to_s) + "/" + Digest::MD5.hexdigest(url)
749
- @content ||= Rails.cache.fetch key do
750
- fetch_content_from_url_without_cache(options)
751
- end
752
- @content_length = @content.length
753
- @content
754
- end
755
-
756
- # alias_method_chain :fetch_content_from_url,:cache
757
-
758
-
759
769
  end
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+ module Spider::Page::Cache
3
+ extend ActiveSupport::Concern
4
+ included do
5
+ class_attribute :cache_enabled
6
+ self.cache_enabled = false
7
+ alias_method_chain :fetch_content_from_url,:cache
8
+ end
9
+
10
+ def fetch_content_from_url_with_cache(*args)
11
+ fetch_content_from_url_without_cache *args
12
+ if self.cache_enabled
13
+ f = cache_file("content")
14
+ FileUtils.mkdir_p File.dirname(f)
15
+ File.open f,"w+" do |file|
16
+ file.write @content
17
+ end
18
+ end
19
+ @content
20
+ end
21
+
22
+ def cached_content
23
+ if self.cache_enabled
24
+ f = cache_file("content")
25
+ File.read f
26
+ end
27
+ end
28
+
29
+ protected
30
+ def cache_file(name = '')
31
+ md5 = Digest::MD5.hexdigest url
32
+ file = md5 + name
33
+ file = "#{file[0,3]}/#{file}"
34
+ if defined? Rails
35
+ Rails.root.join("tmp",file).to_s
36
+ else
37
+ "/tmp/#{file}"
38
+ end
39
+ end
40
+
41
+ module ClassMethods
42
+
43
+ def enable_cache
44
+ self.cache_enabled = true
45
+ end
46
+
47
+ def disable_cache
48
+ self.cache_enabled = false
49
+ end
50
+
51
+ end
52
+ end
@@ -41,6 +41,7 @@ module Spider::Page::Filter
41
41
  options.assert_valid_keys :filters,:position
42
42
  position = options[:position]
43
43
  position = position.to_s + "_" if position
44
+ send("attributes_#{position}filters=",{})
44
45
  filter_attrs = send("attributes_#{position}filters")
45
46
  logger.debug "create filter: #{name} : position : #{position},options : #{options.inspect}"
46
47
  args.each do |attr_name|
@@ -3,9 +3,6 @@
3
3
  module Spider::Page::Label
4
4
  extend ActiveSupport::Concern
5
5
 
6
- module InstanceMethods
7
- end
8
-
9
6
  module ClassMethods
10
7
  def label(name,options = {},&block)
11
8
  name = name.to_sym
@@ -1,22 +1,97 @@
1
1
  # encoding: utf-8
2
2
  module Spider::Page::Proxy
3
+ class HttpProxy
4
+ attr_accessor :user,:password,:host,:port
5
+ # aotianlong:password@192.168.1.1:8000
6
+ # => :user => "aotianlong",
7
+ # :password => "password",
8
+ # :host => "192.168.1.1",
9
+ # :port => 8000
10
+ def self.parse(str)
11
+ {}.tap do |hash|
12
+ user = password = host = port = nil
13
+ if str =~ /@/
14
+ userinfo,addrinfo = str.split("@")
15
+ addr_hash = parse addrinfo
16
+ port,host = addr_hash[:port],addr_hash[:host]
17
+ user_hash = parse userinfo
18
+ user,password = user_hash[:host],user_hash[:port]
19
+ else
20
+ host,port = str.split(":")
21
+ port = 80 if port.blank?
22
+ end
23
+ hash[:host] = host
24
+ hash[:port] = port
25
+ hash[:user] = user if user
26
+ hash[:password] = password if password
27
+ end
28
+ end
29
+
30
+ def to_s
31
+ if user && password
32
+ "#{user}:#{password}@#{host}:#{port}"
33
+ else
34
+ "#{host}:#{port}"
35
+ end
36
+ end
37
+
38
+ def inspect
39
+ to_s
40
+ end
41
+
42
+ def initialize(host,options = {})
43
+ hash = self.class.parse host
44
+ hash.merge! options
45
+ @host = hash[:host]
46
+ @port = hash[:port]
47
+ @user = hash[:user]
48
+ @password = hash[:password]
49
+ end
50
+
51
+ def valid?(options = {})
52
+ options[:url] ||= "http://www.google.com"
53
+ options[:code] ||= 200
54
+ options[:timeout] ||= 10
55
+ # options[:match] ||= //
56
+ Spider::Http.with_proxy self do
57
+ begin
58
+ timeout options[:timeout] do
59
+ response = Spider::Http.get options[:url]
60
+ r = response.code == options[:code]
61
+ if options[:match]
62
+ r && (response.to_s =~ options[:match])
63
+ else
64
+ r
65
+ end
66
+ end
67
+ rescue Exception => e
68
+ false
69
+ end
70
+ end
71
+ end
72
+
73
+ end
74
+
3
75
  def self.included(base)
4
76
  base.send(:include,InstanceMethods)
5
77
  base.send(:extend,ClassMethods)
6
78
  base.class_eval do
7
- class_attribute :proxies
79
+ class_attribute :proxy_items
8
80
  class_attribute :disabled_proxies
9
- self.proxies = []
81
+ class_attribute :current_proxies
82
+ class_attribute :proxies_filename
83
+ self.current_proxies = []
10
84
  self.disabled_proxies = []
85
+ self.proxies_filename = nil
86
+ self.proxy_items = []
11
87
 
12
88
  before_fetch do |page|
13
- proxies.compact!
14
- proxies.uniq!
15
- host,port = proxies.shuffle.first
16
- port ||= 80
17
- if host
18
- logger.debug "set proxy: #{host}:#{port}"
19
- Spider::Http.http_proxy host,port
89
+ self.current_proxies = proxies.shuffle.first(5) if self.current_proxies.empty?
90
+ logger.debug "current_proxies: #{current_proxies.inspect}"
91
+ proxy = current_proxies.shuffle.first
92
+ if proxy.try(:host)
93
+ logger.debug "set proxy: #{proxy.inspect}"
94
+ Spider::Http.http_proxy proxy.host,proxy.port,proxy.user,proxy.password
20
95
  else
21
96
  Spider::Http.clear_proxy
22
97
  end
@@ -25,15 +100,17 @@ module Spider::Page::Proxy
25
100
  after_fetch do |page|
26
101
  logger.debug "reset proxy"
27
102
  # Spider::Http.http_proxy old_host,old_port
28
- if page.content.blank?
103
+ if page.content.blank? #|| page.code == 502 # bad gateway
29
104
  # retry, and set proxy to disabled
30
105
  # proxies
31
- puts "proxies before:#{self.proxies.inspect}"
32
- disabled_proxy = proxies.find{|proxy| proxy.first == Spider::Http.default_options[:http_proxyaddr] }
33
- proxies.delete disabled_proxy
34
- self.disabled_proxies += [disabled_proxy]
35
- puts "proxies after:#{self.proxies.inspect}"
36
- unless proxies.empty?
106
+ puts "proxies before:#{self.current_proxies.inspect}"
107
+ disabled_proxy = current_proxies.find{|proxy| proxy.host == Spider::Http.default_options[:http_proxyaddr] && proxy.port == Spider::Http.default_options[:http_proxyport] }
108
+ if disabled_proxy
109
+ current_proxies.delete disabled_proxy
110
+ self.disabled_proxies += [disabled_proxy]
111
+ puts "proxies after:#{self.current_proxies.inspect}"
112
+ end
113
+ unless current_proxies.empty?
37
114
  puts 'retry'
38
115
  page.request
39
116
  next
@@ -42,7 +119,7 @@ module Spider::Page::Proxy
42
119
  # no proxies available
43
120
  # recover proxies
44
121
  # 以便下次仍然使用(防止一次意外失败,而永久排除)
45
- self.proxies += self.disabled_proxies
122
+ self.current_proxies = []
46
123
  self.disabled_proxies = []
47
124
  # 不用代理服务器使用自身来获取
48
125
  end
@@ -57,83 +134,93 @@ module Spider::Page::Proxy
57
134
  module ClassMethods
58
135
 
59
136
  def disable_proxy
60
- proxy(nil,nil)
137
+ proxy(nil)
61
138
  end
62
139
 
63
140
  def validate_proxies
64
141
  valid_proxies = proxies.find_all do |proxy|
65
- valid_proxy?(*proxy)
142
+ valid_proxy?(proxy)
66
143
  end
67
144
  invalid_proxies = proxies - valid_proxies
68
145
  {:valid => valid_proxies,:invalid => invalid_proxies}
69
146
  end
70
147
 
71
- # 指定一个 file 作为 proxy 来源
72
- # # ip:port
73
- def proxy_file(file)
74
- config_root = File.join(Rails.root,"config","spiders")
75
- if file =~ /^\//
76
- # absolute path
77
- content = File.read file
148
+ def proxies
149
+ if proxies_filename
150
+ parse_proxy_file proxies_filename
78
151
  else
79
- content = File.read(File.join(config_root,file))
152
+ proxy_items
80
153
  end
154
+ end
155
+
156
+ def parse_proxies(content)
81
157
  proxies = []
82
158
  content.each_line do |line|
83
159
  line = line.strip
84
160
  if line =~ /^\s*#/
85
161
  # 注释
86
162
  else
87
- if line =~ /\d+?\.\d+?\.\d+?\.\d+?/
88
- ip,port = line.split(":")
89
- port ||= 80
90
- proxies += [[ip,port]]
91
- end
92
- end
93
- end
94
- self.proxy do |the_proxies|
95
- proxies.each do |p|
96
- the_proxies += [p]
163
+ # proxy line,options
164
+ proxies << Spider::Page::Proxy::HttpProxy.new(line)
97
165
  end
98
166
  end
167
+ proxies
99
168
  end
100
169
 
101
- def valid_proxy?(ip,port = 80,options = {})
102
- options[:url] ||= "http://www.google.com"
103
- options[:code] ||= 200
104
- options[:timeout] ||= 10
105
- # options[:match] ||= //
106
- Spider::Http.with_proxy ip,port do
107
- begin
108
- timeout options[:timeout] do
109
- response = Spider::Http.get options[:url]
110
- r = response.code == options[:code]
111
- if options[:match]
112
- r && (response.to_s =~ options[:match])
113
- else
114
- r
115
- end
116
- end
117
- rescue Exception => e
118
- false
119
- end
170
+ def parse_proxy_file(file)
171
+ config_root = File.join(Rails.root,"config","spiders")
172
+ if file =~ /^\//
173
+ # absolute path
174
+ content = File.read file
175
+ else
176
+ content = File.read(File.join(config_root,file))
120
177
  end
178
+ parse_proxies content
179
+ end
180
+
181
+ # 指定一个 file 作为 proxy 来源
182
+ # # ip:port
183
+ def proxy_file(file,options = {})
184
+ self.proxies_filename = file
185
+ # parse_proxy_file(file).each do |proxy|
186
+ # self.proxy proxy
187
+ # end
188
+ end
189
+
190
+ def valid_proxy?(proxy)
191
+ proxy.valid?
121
192
  end
122
193
 
123
194
  # 直接设置 proxies
124
195
  def proxies=(arr)
125
- proxy do |ps|
126
- arr.each do |a|
127
- ps += [a]
196
+ proxy arr
197
+ end
198
+
199
+ def clear_proxies
200
+ self.proxy_items = []
201
+ self.proxies_filename = nil
202
+ end
203
+
204
+ def proxy(host = nil,options = {})
205
+
206
+
207
+ if host.is_a? Array
208
+ host.each do |h|
209
+ proxy h,options
128
210
  end
211
+ return
129
212
  end
130
- end
131
213
 
132
- def proxy(host = nil,port = 80,&block)
133
- self.proxies += [[host,port]] if host
214
+ if host.is_a? Spider::Page::Proxy::HttpProxy
215
+ http_proxy = host
216
+ else
217
+ http_proxy = Spider::Page::Proxy::HttpProxy.new(host,options || {})
218
+ end
219
+
220
+ self.proxy_items += [http_proxy]
134
221
 
135
222
  if block_given?
136
- yield self.proxies
223
+ yield([]) # for old syntax
137
224
  end
138
225
 
139
226
  end
@@ -143,7 +230,10 @@ module Spider::Page::Proxy
143
230
 
144
231
 
145
232
  module InstanceMethods
146
-
233
+ def proxies
234
+ self.class.proxies
235
+ end
147
236
  end
148
237
 
238
+
149
239
  end
@@ -5,12 +5,12 @@ module Spider::Page::Publish
5
5
 
6
6
  included do
7
7
 
8
- define_model_callbacks :publish
8
+ define_model_callbacks :publish
9
+
10
+ cattr_accessor :publishers
11
+ self.publishers = []
12
+ after_crawl :publish
9
13
 
10
- cattr_accessor :publishers
11
- self.publishers = []
12
- after_crawl :publish
13
-
14
14
  end
15
15
 
16
16
  module ClassMethods
@@ -25,54 +25,52 @@ module Spider::Page::Publish
25
25
 
26
26
  end
27
27
 
28
- module InstanceMethods
29
28
 
30
- def publish_to(*publishers)
31
- run_callbacks :publish do
32
- logger.debug "publish to #{publishers}"
33
- results = []
34
- [publishers].flatten.each do |publisher|
35
- logger.info "send self to #{publisher}"
36
- logger.debug "class:#{publisher.class.name}"
37
- publisher = case publisher
38
- when String,Symbol
39
- publisher.to_s.classify.constantize
40
- else
41
- # puts "default: #{publisher}"
42
- publisher
43
- end
44
- logger.debug "publisher: #{publisher}"
45
- result = nil
46
- begin
29
+ def publish_to(*publishers)
30
+ run_callbacks :publish do
31
+ logger.debug "publish to #{publishers}"
32
+ results = []
33
+ [publishers].flatten.each do |publisher|
34
+ logger.info "send self to #{publisher}"
35
+ logger.debug "class:#{publisher.class.name}"
36
+ publisher = case publisher
37
+ when String,Symbol
38
+ publisher.to_s.classify.constantize
39
+ else
40
+ # puts "default: #{publisher}"
41
+ publisher
42
+ end
43
+ logger.debug "publisher: #{publisher}"
44
+ result = nil
45
+ begin
47
46
 
48
- if publisher.respond_to?(:receive_spider_page)
49
- logger.debug "#{publisher} receive spider page #{self}"
50
- result = publisher.receive_spider_page self
51
- logger.debug "#{publisher} return #{result}"
52
- else
53
- logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
54
- end
55
- rescue Exception=>e
56
- logger.error e.message
57
- logger.error e.backtrace.join("\n")
47
+ if publisher.respond_to?(:receive_spider_page)
48
+ logger.debug "#{publisher} receive spider page #{self}"
49
+ result = publisher.receive_spider_page self
50
+ logger.debug "#{publisher} return #{result}"
51
+ else
52
+ logger.debug "publisher: #{publisher} not respond to :receive_spider_page"
58
53
  end
59
- results << result
54
+ rescue Exception=>e
55
+ logger.error e.message
56
+ logger.error e.backtrace.join("\n")
60
57
  end
61
- results
58
+ results << result
62
59
  end
60
+ results
63
61
  end
62
+ end
64
63
 
65
- def publish
66
- publishers = self.publishers.uniq
67
- if [:title,:body].all?{|name| attribute_names.include?(name) }
68
- logger.debug "[#{self} publish to #{publishers}"
69
- publish_to(publishers)
70
- else
71
- logger.debug "attribute names not include :title, :body,so publish canceled."
72
- end
64
+ def publish
65
+ publishers = self.publishers.uniq
66
+ if [:title,:body].all?{|name| attribute_names.include?(name) }
67
+ logger.debug "[#{self} publish to #{publishers}"
68
+ publish_to(publishers)
69
+ else
70
+ logger.debug "attribute names not include :title, :body,so publish canceled."
73
71
  end
72
+ end
74
73
 
75
74
 
76
- end
77
75
 
78
76
  end
@@ -1,3 +1,3 @@
1
1
  module Spider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -0,0 +1 @@
1
+ require "spider"
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  # desc "Explaining what the task does"
2
3
  # task :spider_fu do
3
4
  # # Task goes here
@@ -8,25 +9,89 @@ desc "开始采集"
8
9
  namespace :spider do
9
10
 
10
11
  namespace :proxy do
11
- desc "test proxy PAGE=XXXX::BasePage"
12
- task :test => :environment do
13
- klass = ENV['PAGE']
14
- begin
15
- klass = klass.constantize
16
- rescue Exception => e
17
- puts "unknow class `#{klass}`, please set a right spider page class to PAGE=XXXX::XxxPage"
18
- exit
12
+
13
+
14
+ desc "test proxy"
15
+ task :test2 => :environment do
16
+ times = ENV['TIMES'] && ENV['TIMES'].to_i || 50
17
+ verbose = ENV['VERBOSE']
18
+ proxies = Spider::Page.proxies
19
+ proxies_count = proxies.size
20
+ good_proxies = []
21
+
22
+ result = {}
23
+
24
+ times.times do |i|
25
+ puts "round #{i + 1}/#{times}" if verbose
26
+ proxies.each_with_index do |proxy,index|
27
+ print "(#{index + 1}/#{proxies_count} #{proxy.inspect}): " if verbose
28
+ result[proxy] ||= []
29
+ r = {}
30
+ time = Benchmark.ms do
31
+ r[:success ] = proxy.valid?
32
+ end / 1000
33
+ if r[:success]
34
+ print "OK" if verbose
35
+ else
36
+ print "FAILED" if verbose
37
+ end
38
+ print " " if verbose
39
+ print "#{time}s" if verbose
40
+ print "\n" if verbose
41
+ r[:time] = time
42
+ result[proxy] << r
43
+ end
19
44
  end
20
- result = klass.validate_proxies
21
- puts "valid proxies:"
22
- result[:valid].each do |proxy|
23
- puts proxy.join(":")
45
+
46
+ good_proxies = []
47
+ result.each_pair do |proxy,data|
48
+ success_times = data.find_all{|d| d[:success] }.size
49
+ success_ratio = success_times.to_f / times
50
+ average_time = data.map{|d| d[:time] }.sum / data.size
51
+ good_proxies << proxy if success_ratio > 0.95
52
+ puts "=> #{proxy}" if verbose
53
+ puts data.collect{|d| d[:success] ? "*" : "!"}.join("") if verbose
54
+ puts " => success times: #{success_times}(#{success_ratio * 100}%)" if verbose
55
+ puts " => average time per request: #{average_time} seconds." if verbose
56
+ puts if verbose
24
57
  end
25
- puts
26
- puts "invalid proxies:"
27
- result[:valid].each do |proxy|
28
- puts proxy.join(":")
58
+ puts " =============================== " if verbose
59
+ puts 'following proxies are 95% complete all test request:' if verbose
60
+ puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
61
+ end
62
+
63
+ desc "test proxy PAGE=XXXX::BasePage"
64
+ task :test => :environment do
65
+ times = ENV['TIMES'] && ENV['TIMES'].to_i || 10
66
+ verbose = ENV['VERBOSE']
67
+ proxies = Spider::Page.proxies
68
+ proxies_count = proxies.size
69
+ good_proxies = []
70
+ proxies.each_with_index do |proxy,index|
71
+ puts "(#{index + 1}/#{proxies_count}) => #{proxy.to_s}" if verbose
72
+ total_time = 0.0
73
+ success_times = 0
74
+ times.times do
75
+ total_time += Benchmark.ms do
76
+ if proxy.valid?
77
+ success_times += 1
78
+ print "*" if verbose
79
+ else
80
+ print "!" if verbose
81
+ end
82
+ STDOUT.flush
83
+ end / 1000
84
+ end
85
+ good_proxies << proxy if success_times == times
86
+ print "\n"
87
+ puts " => success times: #{success_times}" if verbose
88
+ puts " => total time:#{total_time/60} minutes" if verbose
89
+ puts " => average time per request: #{total_time / times} seconds." if verbose
90
+ puts if verbose
29
91
  end
92
+ puts " =============================== " if verbose
93
+ puts 'following proxies are 100% complete all test request:' if verbose
94
+ puts good_proxies.map{|proxy| proxy.to_s }.join("\n")
30
95
  end
31
96
  end
32
97
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider2
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
5
- prerelease:
4
+ hash: 13
5
+ prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 9
10
+ version: 0.0.9
11
11
  platform: ruby
12
12
  authors:
13
13
  - aotianlong
@@ -15,54 +15,10 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-03-04 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: rails
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ~>
27
- - !ruby/object:Gem::Version
28
- hash: 15
29
- segments:
30
- - 3
31
- - 2
32
- - 0
33
- version: 3.2.0
34
- type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: htmlentities
38
- prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
42
- - - ~>
43
- - !ruby/object:Gem::Version
44
- hash: 59
45
- segments:
46
- - 4
47
- - 1
48
- - 0
49
- version: 4.1.0
50
- type: :runtime
51
- version_requirements: *id002
52
- - !ruby/object:Gem::Dependency
53
- name: sqlite3
54
- prerelease: false
55
- requirement: &id003 !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ">="
59
- - !ruby/object:Gem::Version
60
- hash: 3
61
- segments:
62
- - 0
63
- version: "0"
64
- type: :development
65
- version_requirements: *id003
18
+ date: 2012-09-17 00:00:00 +08:00
19
+ default_executable:
20
+ dependencies: []
21
+
66
22
  description: a framework to crawl web pages
67
23
  email:
68
24
  - aotianlong@gmail.com
@@ -91,7 +47,10 @@ files:
91
47
  - lib/generators/spider_migration/spider_migration_generator.rb
92
48
  - lib/generators/spider_migration/templates/migration.rb
93
49
  - lib/spider/active_record_methods.rb
50
+ - lib/spider/engine.rb
94
51
  - lib/spider/http.rb
52
+ - lib/spider/httparty_patch.rb
53
+ - lib/spider/page/cache.rb
95
54
  - lib/spider/page/filter.rb
96
55
  - lib/spider/page/label.rb
97
56
  - lib/spider/page/pagination.rb
@@ -104,6 +63,7 @@ files:
104
63
  - lib/spider/spider_page_label.rb
105
64
  - lib/spider/version.rb
106
65
  - lib/spider.rb
66
+ - lib/spider2.rb
107
67
  - lib/tasks/spider_tasks.rake
108
68
  - MIT-LICENSE
109
69
  - Rakefile
@@ -113,6 +73,7 @@ files:
113
73
  - uninstall.rb
114
74
  - test/spider_fu_test.rb
115
75
  - test/test_helper.rb
76
+ has_rdoc: true
116
77
  homepage: http://www.powerapple.com
117
78
  licenses: []
118
79
 
@@ -142,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
103
  requirements: []
143
104
 
144
105
  rubyforge_project:
145
- rubygems_version: 1.8.15
106
+ rubygems_version: 1.3.7
146
107
  signing_key:
147
108
  specification_version: 3
148
109
  summary: spider