list_spider 0.1.1 → 0.1.2

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +209 -57
  3. metadata +2 -3
  4. data/lib/spider_base.rb +0 -298
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f73a2e9b358cac55336907ac76ebdb666b9d31f5
-  data.tar.gz: 2df50eff29a1963224ca3f7d0cd9b3ac0c89156f
+  metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
+  data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
 SHA512:
-  metadata.gz: 2cb02f9eb8593a05cc6b0a0c9d015ad93bf08663750d3dfe3007c30febfaa47d57c222960eba3b8e9275fd1f5acb942278180c8b613d8ffc0d983333f059ea8a
-  data.tar.gz: a1800a9b27c769adbae11bc8e3f08e5d57d15b3b345d19acbd329142048e742acab8878328e3c8a9053a957c3c27ecf14cd067848d1bc056f552386243c33730
+  metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
+  data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
data/lib/list_spider.rb CHANGED
@@ -1,81 +1,233 @@
-require File.expand_path('../spider_base', __FILE__)
+require 'em-http-request'
+require 'nokogiri'
+require 'fileutils'
+require 'set'
+require "addressable/uri"
+require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../delete_unvalid', __FILE__)
 
-class ListSpider
-
-  RANDOM_TIME = -1
-  NO_LIMIT_CONCURRENT = -1
+class TaskStruct
+  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+    @origin_href = href
+    @href = href
+    if @href.class == "".class
+      @href = SpiderHelper.string_to_uri(@href)
+    end
+    @local_path = local_path
+    @http_method = http_method
+    @params = params
+    @extra_data = extra_data
+    @parse_method = parse_method
+  end
 
-  @@random_time_range = 3..10
+  def ==(o)
+    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
+  end
 
-  include SpiderBase
+  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
 
-  def initialize(down_list, inter_val: 0, max: 30)
-    @down_list = down_list
-    @inter_val = inter_val
-    @max = max
-    @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-    @succeed_size = 0
-    @failed_size = 0
-  end
+end
+
+module ListSpider
+
+  RANDOM_TIME = -1
+  NO_LIMIT_CONCURRENT = -1
 
-  attr_reader :succeed_size, :failed_size
+  @@random_time_range = 3..10
+  @@conver_to_utf8 = false
+  @@connection_opts = {:connect_timeout => 2*60}
+  @@overwrite_exist = false
+  @@max_redirects = 10
+  @@url_set = Set.new
 
   class << self
 
-    attr_accessor :random_time_range
+    attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
 
-  end
+    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+      @@connection_opts = {
+        :proxy => {
+          :host => proxy_addr,
+          :port => proxy_port
+        }
+      }
+      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+    end
 
-  def add_task(task)
-    if task.is_a?Array
-      @down_list = @down_list + task
-    elsif task.is_a?TaskStruct
-      @down_list << task
-    else
-      puts "error task type:#{task.class}"
+    def connect_timeout(max_connect_time)
+      @@connection_opts[:connect_timeout] = max_connect_time
     end
-  end
 
-  def complete(multi, success_list, failed_list)
-    @succeed_size += success_list.size
-    @failed_size += failed_list.size
-    # puts "success size:#{success_list.size}"
-    # puts "failed size:#{failed_list.size}"
-    success_list.each do |e|
-      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+    def set_header_option(header_option)
+      @@header_option = header_option
     end
-
-    todo = @down_list.slice!(0, @max)
-    if todo.empty?
-      puts "success size:#{@succeed_size}"
-      puts "failed size:#{@failed_size}"
-      EventMachine.stop
-    else
-      if @inter_val != 0
-        if success_list.size != 0 || failed_list.size != 0
-          if @inter_val == RANDOM_TIME
-            sleep(rand(@@random_time_range))
+
+    def event_machine_down(link_struct_list, callback = nil)
+      failed_list = []
+      succeed_list = []
+      multi = EventMachine::MultiRequest.new
+      # no_job = true
+      begin_time = Time.now
+
+      for_each_proc = proc do |e|
+        # if !@@overwrite_exist && File.exist?(e.local_path)
+        #   succeed_list << e
+        # else
+          next unless @@url_set.add?(e.href)
+          # no_job = false
+          opt = {}
+          opt = {:redirects => @@max_redirects}
+          opt[:head] = @@header_option if defined? @@header_option
+          if e.http_method == :post
+            opt[:body] = e.params unless e.params.empty?
+            if @@connection_opts
+              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+            else
+              w = EventMachine::HttpRequest.new(e.href).post opt
+            end
           else
-            sleep(@inter_val)
+            if @@connection_opts
+              opt[:query] = e.params unless e.params.empty?
+              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+            else
+              w = EventMachine::HttpRequest.new(e.href).get opt
+            end
           end
+
+          w.callback {
+            @@url_set.delete(e.href)
+            # puts "complete:#{w.response_header}"
+            s = w.response_header.status
+            puts s
+            if s != 404
+              local_dir = File.dirname(e.local_path)
+              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+              begin
+                File.open(e.local_path, "w") do |f|
+                  if @@conver_to_utf8 == true
+                    f << SpiderHelper.to_utf8(w.response)
+                  else
+                    f << w.response
+                  end
+                end
+                succeed_list << e
+              rescue Exception => e
+                puts e
+              end
+            end
+          }
+          w.errback {
+            @@url_set.delete(e.href)
+            puts "errback:#{w.response_header}"
+            puts e.origin_href
+            puts e.href
+            puts w.response_header.status
+            failed_list << e
+            if e.http_method == :get
+              SpiderHelper.direct_http_get(e.href, e.local_path)
+            elsif e.http_method == :post
+              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+            end
+          }
+          multi.add e.local_path, w
+        # end
+      end
+
+      cb = Proc.new do
+        end_time = Time.now
+        puts "use time:#{end_time-begin_time} seconds"
+        if callback.nil?
+          puts "success size:#{self.succeed_size}"
+          puts "failed size:#{self.failed_size}"
+          EventMachine.stop
+        else
+          callback.call(multi, succeed_list, failed_list)
         end
       end
-      batch_down_list(todo, method(:complete))
+
+      link_struct_list.each &for_each_proc
+      multi.callback &cb
     end
-  end
 
-  def start
-    puts "total size:#{@down_list.size}"
-    event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
-  end
+    def complete(multi, success_list, failed_list)
+      @@succeed_size += success_list.size
+      @@failed_size += failed_list.size
+      success_list.each do |e|
+        e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
+      end
 
-  def self.get_list(down_list, inter_val: 0, max: 30)
-    ListSpider.new(down_list, inter_val: inter_val, max: max).start
-  end
+      todo = @@down_list.slice!(0, @@max)
+      if todo.empty?
+        puts "success size:#{@@succeed_size}"
+        puts "failed size:#{@@failed_size}"
+        EventMachine.stop
+      else
+        if @@inter_val != 0
+          if success_list.size != 0 || failed_list.size != 0
+            if @@inter_val == RANDOM_TIME
+              sleep(rand(@@random_time_range))
+            else
+              sleep(@@inter_val)
+            end
+          end
+        end
+        event_machine_down(todo, method(:complete))
+      end
+    end
 
-  def self.get_one(task)
-    ListSpider.new([task]).start
-  end
+    def event_machine_start_list(down_list, callback = nil)
+      EventMachine.run {
+        if down_list.empty?
+          callback.call(nil, [], []) if callback
+        else
+          event_machine_down(down_list, callback)
+        end
+      }
+    end
 
+    def filter_list(down_list)
+      need_down_list = []
+      down_list.each do |ts|
+        if !@@overwrite_exist && File.exist?(ts.local_path)
+          ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
+        else
+          need_down_list << ts
+        end
+      end
+      return need_down_list
+    end
+
+    def get_list(down_list, inter_val: 0, max: 30)
+      @@down_list = []
+
+      need_down_list = filter_list(down_list)
+
+      @@down_list = @@down_list + need_down_list
+      @@inter_val = inter_val
+      @@max = max
+      @@max = @@down_list.size if @@max == NO_LIMIT_CONCURRENT
+      @@succeed_size = 0
+      @@failed_size = 0
+
+      puts "total size:#{@@down_list.size}"
+      event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
+    end
+
+    def get_one(task)
+      get_list([task])
+    end
+
+    def add_task(task)
+      if task.is_a?Array
+        need_down_list = filter_list(task)
+        @@down_list = @@down_list + need_down_list
+      elsif task.is_a?TaskStruct
+        need_down_list = filter_list([task])
+        @@down_list = @@down_list + need_down_list
+      else
+        puts "error task type:#{task.class}"
+      end
+    end
+
+  end
 end
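
Taken together, this file's changes fold TaskStruct in from the deleted spider_base.rb and turn ListSpider from a class you instantiate into a module driven entirely through singleton methods, with the new filter_list step skipping tasks whose local file already exists. A minimal sketch of how the 0.1.2 entry point would be called, inferred only from the signatures above; the URL, paths, and callback body are hypothetical:

    require 'list_spider'

    # One TaskStruct per download: remote URL, local save path, plus an
    # optional parse_method invoked as (local_path, extra_data) on success.
    tasks = (1..3).map do |i|
      TaskStruct.new(
        "http://example.com/page_#{i}.html", # hypothetical URL
        "download/page_#{i}.html",           # hypothetical save path
        parse_method: proc { |path, _extra| puts "downloaded #{path}" }
      )
    end

    # Module-level call replacing 0.1.1's ListSpider.new(tasks, ...).start.
    # inter_val is the sleep between batches (RANDOM_TIME samples from
    # random_time_range); max caps concurrent requests (NO_LIMIT_CONCURRENT
    # lifts the cap).
    ListSpider.get_list(tasks, inter_val: 0, max: 30)
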
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-29 00:00:00.000000000 Z
+date: 2016-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-http-request
@@ -58,7 +58,6 @@ extra_rdoc_files: []
 files:
 - lib/delete_unvalid.rb
 - lib/list_spider.rb
-- lib/spider_base.rb
 - lib/spider_helper.rb
 homepage: https://github.com/chinazhangchao/list_spider
 licenses:
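
Only the version, release date, and file list change here; the dependencies block is unchanged in this diff. Consumers would pick up the release with an ordinary Gemfile entry (standard Bundler syntax, shown for context):

    gem 'list_spider', '0.1.2'
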
data/lib/spider_base.rb DELETED
@@ -1,298 +0,0 @@
-require 'em-http-request'
-require 'nokogiri'
-require 'fileutils'
-require 'set'
-require File.expand_path('../spider_helper', __FILE__)
-require "addressable/uri"
-
-class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
-    @origin_href = href
-    @href = href
-    if @href.class == "".class
-      @href = SpiderHelper.string_to_uri(@href)
-    end
-    @local_path = local_path
-    @http_method = http_method
-    @params = params
-    @extra_data = extra_data
-    @parse_method = parse_method
-  end
-
-  def ==(o)
-    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
-  end
-
-  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
-
-end
-
-module SpiderBase
-
-  @@conver_to_utf8 = false
-  @@connection_opts = {:connect_timeout => 2*60}
-  @@overwrite_exist = false
-  @@max_redirects = 10
-
-  class << self
-
-    attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @@connection_opts = {
-        :proxy => {
-          :host => proxy_addr,
-          :port => proxy_port
-        }
-      }
-      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @@connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @@header_option = header_option
-    end
-
-    def event_machine_down(link_struct_list, callback = nil)
-      failed_list = []
-      succeed_list = []
-      # puts "event_machine_down callback:#{callback}"
-      multi = EventMachine::MultiRequest.new
-      no_job = true
-      begin_time = Time.now
-
-      for_each_proc = proc do |e|
-        if !@@overwrite_exist && File.exist?(e.local_path)
-          succeed_list << e
-        else
-          no_job = false
-          opt = {}
-          opt = {:redirects => @@max_redirects}
-          opt[:head] = @@header_option if defined? @@header_option
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            if @@connection_opts
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).post opt
-            end
-          else
-            if @@connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
-
-          w.callback {
-            # puts "complete:#{w.response_header}"
-            s = w.response_header.status
-            puts s
-            if s == 403 || s == 502 # Forbidden
-              # EventMachine.stop
-            elsif s != 404
-              local_dir = File.dirname(e.local_path)
-              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-              begin
-                File.open(e.local_path, "w") do |f|
-                  if @@conver_to_utf8 == true
-                    f << SpiderHelper.to_utf8(w.response)
-                  else
-                    f << w.response
-                  end
-                end
-                succeed_list << e
-              rescue Exception => e
-                puts e
-              end
-            end
-          }
-          w.errback {
-            puts "errback:#{w.response_header}"
-            puts e.origin_href
-            puts e.href
-            puts w.response_header.status
-            failed_list << e
-            if e.http_method == :get
-              SpiderHelper.direct_http_get(e.href, e.local_path)
-            elsif e.http_method == :post
-              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
-            end
-          }
-          multi.add e.local_path, w
-        end
-      end
-
-      # em_for_each_proc = proc do |e, iter|
-      #   for_each_proc.call(e)
-      #   iter.next
-      # end
-
-      cb = Proc.new do
-        end_time = Time.now
-        puts "use time:#{end_time-begin_time} seconds"
-        if callback.nil?
-          puts "success size:#{self.succeed_size}"
-          puts "failed size:#{self.failed_size}"
-          EventMachine.stop
-        else
-          callback.call(multi, succeed_list, failed_list)
-        end
-      end
-
-      after_proc = proc {
-        if no_job # no work to do, invoke the callback directly
-          cb.call
-        else
-          multi.callback &cb
-        end
-      }
-
-      # if DownLoadConfig::MaxConcurrent <= 0
-      link_struct_list.each &for_each_proc
-      after_proc.call
-      # else
-      #   EM::Iterator.new(link_struct_list, DownLoadConfig::MaxConcurrent).each(em_for_each_proc, after_proc)
-      # end
-    end
-
-    def event_machine_start(url, down_dir, file_name, callback = nil)
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-      down_list = []
-      down_list << TaskStruct.new(url, down_dir + file_name)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-    def event_machine_start_list(down_list, callback = nil)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-  end # self end
-end # SpiderBase end
-
-def batch_down_list(down_list, callback = nil)
-  SpiderBase.event_machine_down(down_list, callback)
-end
-
-def event_machine_start_list(down_list, callback = nil)
-  SpiderBase.event_machine_start_list(down_list, callback)
-end
-
-def parse_down_load_url(url, down_dir, file_name, callback = nil)
-  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
-end
-
-class GetRelative
-
-  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
-    @get_depth = get_depth
-    @base_url = base_url
-    @down_dir = down_dir
-    @suffix = suffix
-  end
-
-  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    puts "get index complete"
-    if succeed_list.size > 0
-      link_list = []
-      succeed_list.each do |e|
-        doc = Nokogiri::HTML(open(e.local_path))
-        link_list.concat(doc.css("a"))
-      end
-      puts "extract href complete"
-
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-
-      down_list = []
-      set_list = Set.new
-      link_list.each do |link|
-        href = link['href']
-        next if href.nil? || !href.include?(@suffix)
-        # process anchors such as "scheme_2.html#SEC15"
-        href = href[0, href.index(@suffix) + 5]
-        # process relative paths such as "./preface.html"
-        href = href[2..-1] if href.start_with?("./")
-
-        next if !set_list.add?(href)
-        unless base_url.end_with?("/")
-          i = base_url.rindex "/"
-          base_url = base_url[0..i]
-        end
-
-        # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
-        next if href.start_with?("http:") || href.start_with?("https:")
-
-        local_path = down_dir + href
-
-        down_list.push(TaskStruct.new(base_url + href, local_path))
-      end
-      puts "down list complete,size:#{down_list.size}"
-      batch_down_list(down_list, callback)
-    end
-  end
-
-  def down_other_node(multi, succeed_list, failed_list)
-    puts "down_other_node"
-    @get_depth = @get_depth - 1
-    puts "depth:#{@get_depth}"
-    if @get_depth <= 0
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
-    else
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
-    end
-  end
-
-  def event_all_complete(multi, succeed_list, failed_list)
-    puts "all complete"
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    EventMachine.stop
-  end
-
-  attr_writer :get_depth, :base_url, :down_dir
-
-  def start
-    index_file_name = "index.html"
-    # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
-    unless @base_url.end_with?("/")
-      i = @base_url.rindex "/"
-      index_file_name = @base_url[i+1 .. -1]
-    end
-
-    @get_depth = @get_depth - 1
-    puts @get_depth
-    if @get_depth <= 0
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
-    else
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
-    end
-  end
-
-  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
-    GetRelative.new(base_url, down_dir, get_depth, suffix).start
-  end
-end # GetRelative
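
Note that the deletion is not a pure refactor: TaskStruct and the download loop move into list_spider.rb, but the GetRelative depth-limited crawler and the top-level wrappers (batch_down_list, parse_down_load_url) have no counterpart in 0.1.2. Code that relied on the 0.1.1 helper below stops working after the upgrade (arguments hypothetical):

    # 0.1.1 only: fetch base_url, then follow relative .html links up to
    # two levels deep, mirroring them under down_dir.
    GetRelative.Get('http://example.com/docs/', 'download/docs/', 2, '.html')
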