list_spider 0.1.1 → 0.1.2

Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +209 -57
  3. metadata +2 -3
  4. data/lib/spider_base.rb +0 -298
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f73a2e9b358cac55336907ac76ebdb666b9d31f5
-  data.tar.gz: 2df50eff29a1963224ca3f7d0cd9b3ac0c89156f
+  metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
+  data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
 SHA512:
-  metadata.gz: 2cb02f9eb8593a05cc6b0a0c9d015ad93bf08663750d3dfe3007c30febfaa47d57c222960eba3b8e9275fd1f5acb942278180c8b613d8ffc0d983333f059ea8a
-  data.tar.gz: a1800a9b27c769adbae11bc8e3f08e5d57d15b3b345d19acbd329142048e742acab8878328e3c8a9053a957c3c27ecf14cd067848d1bc056f552386243c33730
+  metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
+  data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
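These are the digests of the two archives packed inside the .gem file. A minimal Ruby sketch of how such digests can be recomputed locally, assuming an unpacked gem with metadata.gz and data.tar.gz in the current directory (the paths are illustrative):

  require 'digest'

  # Print the SHA1 and SHA512 of each archive so they can be compared
  # against the values recorded in checksums.yaml above.
  %w[metadata.gz data.tar.gz].each do |name|
    next unless File.exist?(name)
    puts "#{name} SHA1:   #{Digest::SHA1.file(name).hexdigest}"
    puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
  end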
data/lib/list_spider.rb CHANGED
@@ -1,81 +1,233 @@
-require File.expand_path('../spider_base', __FILE__)
+require 'em-http-request'
+require 'nokogiri'
+require 'fileutils'
+require 'set'
+require "addressable/uri"
+require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../delete_unvalid', __FILE__)
 
-class ListSpider
-
-  RANDOM_TIME = -1
-  NO_LIMIT_CONCURRENT = -1
+class TaskStruct
+  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+    @origin_href = href
+    @href = href
+    if @href.class == "".class
+      @href = SpiderHelper.string_to_uri(@href)
+    end
+    @local_path = local_path
+    @http_method = http_method
+    @params = params
+    @extra_data = extra_data
+    @parse_method = parse_method
+  end
 
-  @@random_time_range = 3..10
+  def ==(o)
+    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
+  end
 
-  include SpiderBase
+  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
 
-  def initialize(down_list, inter_val: 0, max: 30)
-    @down_list = down_list
-    @inter_val = inter_val
-    @max = max
-    @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-    @succeed_size = 0
-    @failed_size = 0
-  end
+end
+
+module ListSpider
+
+  RANDOM_TIME = -1
+  NO_LIMIT_CONCURRENT = -1
 
-  attr_reader :succeed_size, :failed_size
+  @@random_time_range = 3..10
+  @@conver_to_utf8 = false
+  @@connection_opts = {:connect_timeout => 2*60}
+  @@overwrite_exist = false
+  @@max_redirects = 10
+  @@url_set = Set.new
 
   class << self
 
-    attr_accessor :random_time_range
+    attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
 
-  end
+    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+      @@connection_opts = {
+        :proxy => {
+          :host => proxy_addr,
+          :port => proxy_port
+        }
+      }
+      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+    end
 
-  def add_task(task)
-    if task.is_a?(Array)
-      @down_list = @down_list + task
-    elsif task.is_a?(TaskStruct)
-      @down_list << task
-    else
-      puts "error task type:#{task.class}"
+    def connect_timeout(max_connect_time)
+      @@connection_opts[:connect_timeout] = max_connect_time
     end
-  end
 
-  def complete(multi, success_list, failed_list)
-    @succeed_size += success_list.size
-    @failed_size += failed_list.size
-    # puts "success size:#{success_list.size}"
-    # puts "failed size:#{failed_list.size}"
-    success_list.each do |e|
-      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+    def set_header_option(header_option)
+      @@header_option = header_option
     end
-
-    todo = @down_list.slice!(0, @max)
-    if todo.empty?
-      puts "success size:#{@succeed_size}"
-      puts "failed size:#{@failed_size}"
-      EventMachine.stop
-    else
-      if @inter_val != 0
-        if success_list.size != 0 || failed_list.size != 0
-          if @inter_val == RANDOM_TIME
-            sleep(rand(@@random_time_range))
+
+    def event_machine_down(link_struct_list, callback = nil)
+      failed_list = []
+      succeed_list = []
+      multi = EventMachine::MultiRequest.new
+      begin_time = Time.now
+
+      for_each_proc = proc do |e|
+        # the overwrite/exist check now happens up front in filter_list
+        next unless @@url_set.add?(e.href)
+        opt = {:redirects => @@max_redirects}
+        opt[:head] = @@header_option if defined? @@header_option
+        if e.http_method == :post
+          opt[:body] = e.params unless e.params.empty?
+          if @@connection_opts
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).post opt
+          end
         else
-          sleep(@inter_val)
+          if @@connection_opts
+            opt[:query] = e.params unless e.params.empty?
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).get opt
+          end
         end
+
+        w.callback {
+          @@url_set.delete(e.href)
+          s = w.response_header.status
+          puts s
+          if s != 404
+            local_dir = File.dirname(e.local_path)
+            FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+            begin
+              File.open(e.local_path, "w") do |f|
+                if @@conver_to_utf8 == true
+                  f << SpiderHelper.to_utf8(w.response)
+                else
+                  f << w.response
+                end
+              end
+              succeed_list << e
+            rescue Exception => ex
+              puts ex
+            end
+          end
+        }
+        w.errback {
+          @@url_set.delete(e.href)
+          puts "errback:#{w.response_header}"
+          puts e.origin_href
+          puts e.href
+          puts w.response_header.status
+          failed_list << e
+          if e.http_method == :get
+            SpiderHelper.direct_http_get(e.href, e.local_path)
+          elsif e.http_method == :post
+            SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+          end
+        }
+        multi.add e.local_path, w
+      end
+
+      cb = Proc.new do
+        end_time = Time.now
+        puts "use time:#{end_time - begin_time} seconds"
+        if callback.nil?
+          puts "success size:#{@@succeed_size}"
+          puts "failed size:#{@@failed_size}"
+          EventMachine.stop
+        else
+          callback.call(multi, succeed_list, failed_list)
         end
       end
-      batch_down_list(todo, method(:complete))
+
+      link_struct_list.each(&for_each_proc)
+      multi.callback(&cb)
     end
-  end
 
-  def start
-    puts "total size:#{@down_list.size}"
-    event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
-  end
+    def complete(multi, success_list, failed_list)
+      @@succeed_size += success_list.size
+      @@failed_size += failed_list.size
+      success_list.each do |e|
+        e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
+      end
 
-  def self.get_list(down_list, inter_val: 0, max: 30)
-    ListSpider.new(down_list, inter_val: inter_val, max: max).start
-  end
+      todo = @@down_list.slice!(0, @@max)
+      if todo.empty?
+        puts "success size:#{@@succeed_size}"
+        puts "failed size:#{@@failed_size}"
+        EventMachine.stop
+      else
+        if @@inter_val != 0
+          if success_list.size != 0 || failed_list.size != 0
+            if @@inter_val == RANDOM_TIME
+              sleep(rand(@@random_time_range))
+            else
+              sleep(@@inter_val)
+            end
+          end
+        end
+        event_machine_down(todo, method(:complete))
+      end
+    end
 
-  def self.get_one(task)
-    ListSpider.new([task]).start
-  end
+    def event_machine_start_list(down_list, callback = nil)
+      EventMachine.run {
+        if down_list.empty?
+          callback.call(nil, [], []) if callback
+        else
+          event_machine_down(down_list, callback)
+        end
+      }
+    end
 
+    def filter_list(down_list)
+      need_down_list = []
+      down_list.each do |ts|
+        if !@@overwrite_exist && File.exist?(ts.local_path)
+          ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
+        else
+          need_down_list << ts
+        end
+      end
+      return need_down_list
+    end
+
+    def get_list(down_list, inter_val: 0, max: 30)
+      @@down_list = []
+
+      need_down_list = filter_list(down_list)
+
+      @@down_list = @@down_list + need_down_list
+      @@inter_val = inter_val
+      @@max = max
+      @@max = @@down_list.size if @@max == NO_LIMIT_CONCURRENT
+      @@succeed_size = 0
+      @@failed_size = 0
+
+      puts "total size:#{@@down_list.size}"
+      event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
+    end
+
+    def get_one(task)
+      get_list([task])
+    end
+
+    def add_task(task)
+      if task.is_a?(Array)
+        need_down_list = filter_list(task)
+        @@down_list = @@down_list + need_down_list
+      elsif task.is_a?(TaskStruct)
+        need_down_list = filter_list([task])
+        @@down_list = @@down_list + need_down_list
+      else
+        puts "error task type:#{task.class}"
+      end
+    end
+
+  end
 end
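With this change ListSpider becomes a module whose entry points (get_list, get_one, add_task) live on the module itself rather than on instances. A minimal usage sketch based on the code above; the URLs, local paths, and the parse callback are illustrative:

  require 'list_spider'

  # Invoked for each successfully downloaded file (see TaskStruct#parse_method).
  parse = proc do |local_path, extra_data|
    puts "saved #{local_path} (#{extra_data})"
  end

  tasks = [
    TaskStruct.new('http://example.com/page1.html', 'down/page1.html',
                   extra_data: :page1, parse_method: parse),
    TaskStruct.new('http://example.com/page2.html', 'down/page2.html',
                   extra_data: :page2, parse_method: parse)
  ]

  # Run at most 10 downloads per batch; RANDOM_TIME sleeps a random
  # 3..10 seconds (@@random_time_range) between batches.
  ListSpider.get_list(tasks, inter_val: ListSpider::RANDOM_TIME, max: 10)

Note that get_list blocks inside EventMachine.run until every task has finished, and tasks whose local_path already exists are skipped up front by filter_list unless overwrite_exist is set.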
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-29 00:00:00.000000000 Z
+date: 2016-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-http-request
@@ -58,7 +58,6 @@ extra_rdoc_files: []
 files:
 - lib/delete_unvalid.rb
 - lib/list_spider.rb
-- lib/spider_base.rb
 - lib/spider_helper.rb
 homepage: https://github.com/chinazhangchao/list_spider
 licenses:
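The only functional changes here are the version bump and the removal of lib/spider_base.rb from the packaged file list. A sketch of a Gemfile entry that picks up the new release (the version constraint is a suggestion):

  # Gemfile
  source 'https://rubygems.org'

  gem 'list_spider', '~> 0.1.2'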
data/lib/spider_base.rb DELETED
@@ -1,298 +0,0 @@
-require 'em-http-request'
-require 'nokogiri'
-require 'fileutils'
-require 'set'
-require File.expand_path('../spider_helper', __FILE__)
-require "addressable/uri"
-
-class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
-    @origin_href = href
-    @href = href
-    if @href.class == "".class
-      @href = SpiderHelper.string_to_uri(@href)
-    end
-    @local_path = local_path
-    @http_method = http_method
-    @params = params
-    @extra_data = extra_data
-    @parse_method = parse_method
-  end
-
-  def ==(o)
-    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
-  end
-
-  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
-
-end
-
-module SpiderBase
-
-  @@conver_to_utf8 = false
-  @@connection_opts = {:connect_timeout => 2*60}
-  @@overwrite_exist = false
-  @@max_redirects = 10
-
-  class << self
-
-    attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @@connection_opts = {
-        :proxy => {
-          :host => proxy_addr,
-          :port => proxy_port
-        }
-      }
-      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @@connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @@header_option = optHash
-    end
-
-    def event_machine_down(link_struct_list, callback = nil)
-      failed_list = []
-      succeed_list = []
-      multi = EventMachine::MultiRequest.new
-      no_job = true
-      begin_time = Time.now
-
-      for_each_proc = proc do |e|
-        if !@@overwrite_exist && File.exist?(e.local_path)
-          succeed_list << e
-        else
-          no_job = false
-          opt = {:redirects => @@max_redirects}
-          opt[:head] = @@header_option if defined? @@header_option
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            if @@connection_opts
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).post opt
-            end
-          else
-            if @@connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
-
-          w.callback {
-            s = w.response_header.status
-            puts s
-            if s == 403 || s == 502 # Forbidden
-              # EventMachine.stop
-            elsif s != 404
-              local_dir = File.dirname(e.local_path)
-              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-              begin
-                File.open(e.local_path, "w") do |f|
-                  if @@conver_to_utf8 == true
-                    f << SpiderHelper.to_utf8(w.response)
-                  else
-                    f << w.response
-                  end
-                end
-                succeed_list << e
-              rescue Exception => ex
-                puts ex
-              end
-            end
-          }
-          w.errback {
-            puts "errback:#{w.response_header}"
-            puts e.origin_href
-            puts e.href
-            puts w.response_header.status
-            failed_list << e
-            if e.http_method == :get
-              SpiderHelper.direct_http_get(e.href, e.local_path)
-            elsif e.http_method == :post
-              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
-            end
-          }
-          multi.add e.local_path, w
-        end
-      end
-
-      cb = Proc.new do
-        end_time = Time.now
-        puts "use time:#{end_time - begin_time} seconds"
-        if callback.nil?
-          puts "success size:#{self.succeed_size}"
-          puts "failed size:#{self.failed_size}"
-          EventMachine.stop
-        else
-          callback.call(multi, succeed_list, failed_list)
-        end
-      end
-
-      after_proc = proc {
-        if no_job # no pending requests, invoke the callback directly
-          cb.call
-        else
-          multi.callback(&cb)
-        end
-      }
-
-      link_struct_list.each(&for_each_proc)
-      after_proc.call
-    end
-
-    def event_machine_start(url, down_dir, file_name, callback = nil)
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-      down_list = []
-      down_list << TaskStruct.new(url, down_dir + file_name)
-      EventMachine.run {
-        event_machine_down(down_list, callback)
-      }
-    end
-
-    def event_machine_start_list(down_list, callback = nil)
-      EventMachine.run {
-        event_machine_down(down_list, callback)
-      }
-    end
-
-  end # self end
-end # SpiderBase end
-
-def batch_down_list(down_list, callback = nil)
-  SpiderBase.event_machine_down(down_list, callback)
-end
-
-def event_machine_start_list(down_list, callback = nil)
-  SpiderBase.event_machine_start_list(down_list, callback)
-end
-
-def parse_down_load_url(url, down_dir, file_name, callback = nil)
-  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
-end
-
-class GetRelative
-
-  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
-    @get_depth = get_depth
-    @base_url = base_url
-    @down_dir = down_dir
-    @suffix = suffix
-  end
-
-  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    puts "get index complete"
-    if succeed_list.size > 0
-      link_list = []
-      succeed_list.each do |e|
-        doc = Nokogiri::HTML(open(e.local_path))
-        link_list.concat(doc.css("a"))
-      end
-      puts "extract href complete"
-
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-
-      down_list = []
-      set_list = Set.new
-      link_list.each do |link|
-        href = link['href']
-        next if href.nil? || !href.include?(@suffix)
-        # strip anchors such as "scheme_2.html#SEC15"
-        href = href[0, href.index(@suffix) + 5]
-        # strip the leading "./" from paths such as "./preface.html"
-        href = href[2..-1] if href.start_with?("./")
-
-        next if !set_list.add?(href)
-        unless base_url.end_with?("/")
-          i = base_url.rindex("/")
-          base_url = base_url[0..i]
-        end
-
-        # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
-        next if href.start_with?("http:") || href.start_with?("https:")
-
-        local_path = down_dir + href
-
-        down_list.push(TaskStruct.new(base_url + href, local_path))
-      end
-      puts "down list complete,size:#{down_list.size}"
-      batch_down_list(down_list, callback)
-    end
-  end
-
-  def down_other_node(multi, succeed_list, failed_list)
-    puts "down_other_node"
-    @get_depth = @get_depth - 1
-    puts "depth:#{@get_depth}"
-    if @get_depth <= 0
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
-    else
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
-    end
-  end
-
-  def event_all_complete(multi, succeed_list, failed_list)
-    puts "all complete"
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    EventMachine.stop
-  end
-
-  attr_writer :get_depth, :base_url, :down_dir
-
-  def start
-    index_file_name = "index.html"
-    # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
-    unless @base_url.end_with?("/")
-      i = @base_url.rindex("/")
-      index_file_name = @base_url[i+1 .. -1]
-    end
-
-    @get_depth = @get_depth - 1
-    puts @get_depth
-    if @get_depth <= 0
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
-    else
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
-    end
-  end
-
-  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
-    GetRelative.new(base_url, down_dir, get_depth, suffix).start
-  end
-end # GetRelative
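Since spider_base.rb is gone, code written against 0.1.1 needs small adjustments: TaskStruct now lives in list_spider.rb, the SpiderBase module and the global helpers (batch_down_list, parse_down_load_url) have no direct replacement, and the GetRelative crawler is dropped entirely. A before/after sketch of the most common call site (the tasks array is illustrative):

  # 0.1.1: ListSpider was a class wrapping SpiderBase
  ListSpider.new(tasks, inter_val: 0, max: 30).start

  # 0.1.2: module-level API; already-downloaded files are filtered
  # out up front by ListSpider.filter_list
  ListSpider.get_list(tasks, inter_val: 0, max: 30)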