list_spider 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0f897ec31adc2d2a6c9713729030e8decc85cb9d
+   data.tar.gz: c81ed9cab5fc1bbd4395aeb9692924887042c27f
+ SHA512:
+   metadata.gz: 907529b23336256e2c72232ec2b413c50ebee99df49695bac368ae4dc029afaa81424fe542c63d6c7247714964382e2eaeaae0ea34ab51658e34ce67e5c7b9e7
+   data.tar.gz: 22dff7b012b1c12f8cefb50606e0a9b0874c5b013a305c268ae2a950374760ba77ed24af075f6ca5217e128d905b31bae754107e0cf063e0d90171dd51b94f20
lib/delete_unvalid.rb ADDED
@@ -0,0 +1,40 @@
+
+ # Deletes downloaded files judged invalid; by default, files whose size is
+ # at or below @size_threshold bytes.
+ class DeleteUnvalid
+   # 4033
+   # 920
+   def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     @dir_pattern = dir_pattern
+     @size_threshold = size_threshold
+     @cust_judge = cust_judge || method(:default_judge)
+     @total = 0
+   end
+
+   def default_judge(f)
+     File.size(f) <= @size_threshold
+   end
+
+   def delete_unvalid(f)
+     if @cust_judge.call(f)
+       @total += 1
+       puts "deleted file: #{f}"
+       File.delete(f)
+     end
+   end
+
+   def start
+     Dir.glob(@dir_pattern) do |f|
+       delete_unvalid(f)
+     end
+     puts "delete total:#{@total}"
+   end
+
+   def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
+   end
+ end
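
For context, a minimal usage sketch of this class; the glob pattern, threshold, and lambda below are illustrative, not taken from the gem:

  require 'list_spider'

  # Delete downloads of 1000 bytes or fewer (the default judge).
  DeleteUnvalid.delete('download/**/*.html')

  # Or pass a custom judge; here, empty files count as invalid.
  DeleteUnvalid.delete('download/**/*.html', cust_judge: ->(f) { File.zero?(f) })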
lib/list_spider.rb ADDED
@@ -0,0 +1,77 @@
+ require File.expand_path('../spider_base', __FILE__)
+ require File.expand_path('../delete_unvalid', __FILE__)
+
+ class ListSpider
+
+   RANDOM_TIME = -1
+   NO_LIMIT_CONCURRENT = -1
+
+   @@random_time_range = 3..10
+
+   include SpiderBase
+
+   def initialize(down_list, inter_val: 0, max: 30)
+     @down_list = down_list
+     @inter_val = inter_val
+     @max = max
+     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+     @succeed_size = 0
+     @failed_size = 0
+   end
+
+   attr_reader :succeed_size, :failed_size
+
+   class << self
+     # Write through to the class variable read in #complete; a plain
+     # attr_accessor here would set an unrelated singleton instance variable.
+     def random_time_range
+       @@random_time_range
+     end
+
+     def random_time_range=(range)
+       @@random_time_range = range
+     end
+   end
+
+   def add_task(task)
+     if task.is_a?(Array)
+       @down_list.concat(task)
+     elsif task.is_a?(TaskStruct)
+       @down_list << task
+     else
+       puts "error task type:#{task.class}"
+     end
+   end
+
+   def complete(multi, success_list, failed_list)
+     @succeed_size += success_list.size
+     @failed_size += failed_list.size
+     success_list.each do |e|
+       e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+     end
+
+     todo = @down_list.slice!(0, @max)
+     if todo.empty?
+       puts "success size:#{@succeed_size}"
+       puts "failed size:#{@failed_size}"
+       EventMachine.stop
+     else
+       if @inter_val != 0 && (success_list.size != 0 || failed_list.size != 0)
+         if @inter_val == RANDOM_TIME
+           sleep(rand(@@random_time_range))
+         else
+           sleep(@inter_val)
+         end
+       end
+       batch_down_list(todo, method(:complete))
+     end
+   end
+
+   def start
+     puts "total size:#{@down_list.size}"
+     event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
+   end
+
+   def self.get_list(down_list, inter_val: 0, max: 30)
+     ListSpider.new(down_list, inter_val: inter_val, max: max).start
+   end
+ end
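
A usage sketch under the same caveat (the example.com URLs and local paths are hypothetical): build a list of TaskStruct download tasks (TaskStruct is defined in lib/spider_base.rb below) and hand it to ListSpider.get_list.

  require 'list_spider'

  down_list = (1..10).map do |i|
    TaskStruct.new("http://www.example.com/page/#{i}", "download/page_#{i}.html")
  end

  # At most 5 concurrent fetches; sleep a random 3..10 seconds between batches.
  ListSpider.random_time_range = 3..10
  ListSpider.get_list(down_list, inter_val: ListSpider::RANDOM_TIME, max: 5)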
lib/spider_base.rb ADDED
@@ -0,0 +1,296 @@
+ require 'em-http-request'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'set'
+ require File.expand_path('../spider_helper', __FILE__)
+ require "addressable/uri"
+
+ # One download task: a URL, the local path to save it to, and optional
+ # request parameters and a per-task parse callback.
+ class TaskStruct
+   def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+     @href = href
+     @href = SpiderHelper.string_to_uri(@href) if @href.is_a?(String)
+     @local_path = local_path
+     @http_method = http_method
+     @params = params
+     @extra_data = extra_data
+     @parse_method = parse_method
+   end
+
+   def ==(o)
+     o.class == self.class && o.href == href && o.local_path == local_path &&
+       o.http_method == http_method && o.params == params && o.extra_data == extra_data
+   end
+
+   attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method
+ end
+
+ module SpiderBase
+
+   @@conver_to_utf8 = false
+   @@connection_opts = { :connect_timeout => 2 * 60 }
+   @@overwrite_exist = false
+   @@max_redirects = 10
+
+   class << self
+
+     # Explicit accessors that write through to the class variables read
+     # below; attr_accessor would create unrelated singleton instance variables.
+     def conver_to_utf8; @@conver_to_utf8; end
+     def conver_to_utf8=(v); @@conver_to_utf8 = v; end
+     def overwrite_exist; @@overwrite_exist; end
+     def overwrite_exist=(v); @@overwrite_exist = v; end
+     def max_redirects; @@max_redirects; end
+     def max_redirects=(v); @@max_redirects = v; end
+
+     def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+       @@connection_opts = {
+         :proxy => {
+           :host => proxy_addr,
+           :port => proxy_port
+         }
+       }
+       @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+     end
+
+     def connect_timeout(max_connect_time)
+       @@connection_opts[:connect_timeout] = max_connect_time
+     end
+
+     def set_header_option(header_option)
+       @@header_option = header_option
+     end
+
+     def event_machine_down(link_struct_list, callback = nil)
+       failed_list = []
+       succeed_list = []
+       multi = EventMachine::MultiRequest.new
+       no_job = true
+       begin_time = Time.now
+
+       for_each_proc = proc do |e|
+         if !@@overwrite_exist && File.exist?(e.local_path)
+           succeed_list << e
+         else
+           no_job = false
+           opt = { :redirects => @@max_redirects }
+           opt[:head] = @@header_option if defined? @@header_option
+           if e.http_method == :post
+             opt[:body] = e.params unless e.params.empty?
+             w = if @@connection_opts
+                   EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+                 else
+                   EventMachine::HttpRequest.new(e.href).post opt
+                 end
+           else
+             opt[:query] = e.params unless e.params.empty?
+             w = if @@connection_opts
+                   EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+                 else
+                   EventMachine::HttpRequest.new(e.href).get opt
+                 end
+           end
+
+           w.callback {
+             s = w.response_header.status
+             puts s
+             if s == 403 || s == 502 # Forbidden / Bad Gateway
+               # EventMachine.stop
+             elsif s != 404
+               local_dir = File.dirname(e.local_path)
+               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+               begin
+                 File.open(e.local_path, "w") do |f|
+                   if @@conver_to_utf8
+                     f << SpiderHelper.to_utf8(w.response)
+                   else
+                     f << w.response
+                   end
+                 end
+                 succeed_list << e
+               rescue Exception => ex # don't shadow the task struct e
+                 puts ex
+               end
+             end
+           }
+           w.errback {
+             puts "errback:#{w.response_header}"
+             puts e.href
+             puts w.response_header.status
+             failed_list << e
+             # fall back to a plain synchronous request
+             if e.http_method == :get
+               SpiderHelper.direct_http_get(e.href, e.local_path)
+             elsif e.http_method == :post
+               SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+             end
+           }
+           multi.add e.local_path, w
+         end
+       end
+
+       cb = Proc.new do
+         end_time = Time.now
+         puts "use time:#{end_time - begin_time} seconds"
+         if callback.nil?
+           puts "success size:#{succeed_list.size}"
+           puts "failed size:#{failed_list.size}"
+           EventMachine.stop
+         else
+           callback.call(multi, succeed_list, failed_list)
+         end
+       end
+
+       after_proc = proc {
+         if no_job # nothing was queued, invoke the callback directly
+           cb.call
+         else
+           multi.callback(&cb)
+         end
+       }
+
+       link_struct_list.each(&for_each_proc)
+       after_proc.call
+     end
+
+     def event_machine_start(url, down_dir, file_name, callback = nil)
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+       down_list = []
+       down_list << TaskStruct.new(url, down_dir + file_name)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+     def event_machine_start_list(down_list, callback = nil)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+   end # class << self
+ end # SpiderBase
+
+ def batch_down_list(down_list, callback = nil)
+   SpiderBase.event_machine_down(down_list, callback)
+ end
+
+ def event_machine_start_list(down_list, callback = nil)
+   SpiderBase.event_machine_start_list(down_list, callback)
+ end
+
+ def parse_down_load_url(url, down_dir, file_name, callback = nil)
+   SpiderBase.event_machine_start(url, down_dir, file_name, callback)
+ end
+
+ # Downloads a page and, recursively to a given depth, the relative links
+ # in it that carry the given suffix.
+ class GetRelative
+
+   def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
+     @get_depth = get_depth
+     @base_url = base_url
+     @down_dir = down_dir
+     @suffix = suffix
+   end
+
+   def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     puts "get index complete"
+     if succeed_list.size > 0
+       link_list = []
+       succeed_list.each do |e|
+         doc = Nokogiri::HTML(File.open(e.local_path))
+         link_list.concat(doc.css("a"))
+       end
+       puts "extract href complete"
+
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+
+       down_list = []
+       set_list = Set.new
+       link_list.each do |link|
+         href = link['href']
+         next if href.nil? || !href.include?(@suffix)
+         # strip fragments such as "scheme_2.html#SEC15"
+         href = href[0, href.index(@suffix) + @suffix.length]
+         # strip a leading "./" such as "./preface.html"
+         href = href[2..-1] if href.start_with?("./")
+
+         next if !set_list.add?(href)
+         unless base_url.end_with?("/")
+           i = base_url.rindex("/")
+           base_url = base_url[0..i]
+         end
+
+         # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
+         next if href.start_with?("http:") || href.start_with?("https:")
+
+         local_path = down_dir + href
+
+         down_list.push(TaskStruct.new(base_url + href, local_path))
+       end
+       puts "down list complete,size:#{down_list.size}"
+       batch_down_list(down_list, callback)
+     end
+   end
+
+   def down_other_node(multi, succeed_list, failed_list)
+     puts "down_other_node"
+     @get_depth = @get_depth - 1
+     puts "depth:#{@get_depth}"
+     if @get_depth <= 0
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
+     else
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
+     end
+   end
+
+   def event_all_complete(multi, succeed_list, failed_list)
+     puts "all complete"
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     EventMachine.stop
+   end
+
+   attr_writer :get_depth, :base_url, :down_dir
+
+   def start
+     index_file_name = "index.html"
+     # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
+     unless @base_url.end_with?("/")
+       i = @base_url.rindex("/")
+       index_file_name = @base_url[i + 1..-1]
+     end
+
+     @get_depth = @get_depth - 1
+     puts @get_depth
+     if @get_depth <= 0
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
+     else
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
+     end
+   end
+
+   def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
+     GetRelative.new(base_url, down_dir, get_depth, suffix).start
+   end
+ end #GetRelative
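
A sketch of the module-level knobs and the GetRelative entry point; the proxy address and target directory are illustrative (the URL is the one in the comments above):

  require 'list_spider'

  SpiderBase.overwrite_exist = false        # skip files already on disk
  SpiderBase.set_proxy('127.0.0.1', 8087)   # optional proxy

  # Mirror the page and the relative .html links it contains, two levels deep.
  GetRelative.Get('http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html',
                  't-y-scheme', 2)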
lib/spider_helper.rb ADDED
@@ -0,0 +1,103 @@
+ require 'rchardet'
+ require 'net/http'
+ require 'fileutils'
+ require 'addressable/uri'
+
+ module SpiderHelper
+
+   class << self
+
+     def direct_http_get(href, local_path, params: nil, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         href.query = URI.encode_www_form(params) if params
+         req = Net::HTTP::Get.new(href)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+           puts "succeed"
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e.backtrace
+         puts e
+       end
+     end
+
+     def direct_http_post(href, local_path, params, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         req = Net::HTTP::Post.new(href)
+         req.set_form_data(params)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e
+       end
+     end
+
+     def extract_href_last(origin_href)
+       origin_href.split('/')[-1]
+     end
+
+     def string_to_uri(href)
+       l = href
+       l.sub!('http:///', 'http://') if l.start_with?('http:///')
+       l = Addressable::URI.parse(l)
+       l.normalize!
+     end
+
+     BomHeaderMap = { "UTF-8"    => "\xEF\xBB\xBF".force_encoding("UTF-8"),
+                      "UTF-16BE" => "\xFE\xFF".force_encoding("UTF-16BE"),
+                      "UTF-16LE" => "\xFF\xFE".force_encoding("UTF-16LE"),
+                      "UTF-32BE" => "\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
+                      "UTF-32LE" => "\xFF\xFE\x00\x00".force_encoding("UTF-32LE") }
+
+     # The encoding check below is sometimes wrong; call to_utf8 directly to
+     # force a conversion.
+     def smart_to_utf8(str)
+       return str if str.encoding == Encoding::UTF_8
+       to_utf8(str)
+     end
+
+     def to_utf8(str)
+       # Work around CharDet on Windows returning ASCII_8BIT strings that are
+       # incompatible with UTF-8.
+       str.force_encoding(Encoding::ASCII_8BIT)
+       cd = CharDet.detect(str)
+       if cd["confidence"] > 0.6
+         puts cd["encoding"]
+         str.force_encoding(cd["encoding"])
+         # strip the BOM header
+         bom_header = BomHeaderMap[cd["encoding"]]
+         str.sub!(bom_header, "") if bom_header
+       end
+       str.encode!(Encoding::UTF_8, :undef => :replace, :replace => "?", :invalid => :replace)
+
+       return str
+     end
+
+   end
+
+ end
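
The helpers can also be called on their own; a sketch with hypothetical URLs and paths:

  require 'list_spider'

  # Synchronous fallback download, as used by event_machine_down's errback.
  SpiderHelper.direct_http_get('http://www.example.com/index.html',
                               'download/index.html',
                               header: { 'User-Agent' => 'list_spider' })

  # Normalize a raw string into an Addressable::URI.
  uri = SpiderHelper.string_to_uri('http://www.example.com/a b.html')

  # Coerce bytes read from disk into UTF-8.
  text = SpiderHelper.to_utf8(File.binread('download/index.html'))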
metadata ADDED
@@ -0,0 +1,88 @@
+ --- !ruby/object:Gem::Specification
+ name: list_spider
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Charles Zhang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-04-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: em-http-request
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+ description: A url list spider based on em-http-request.
+ email: gis05zc@163.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/delete_unvalid.rb
+ - lib/list_spider.rb
+ - lib/spider_base.rb
+ - lib/spider_helper.rb
+ homepage: https://github.com/chinazhangchao/list_spider
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.6.4
+ signing_key:
+ specification_version: 4
+ summary: List Spider
+ test_files: []
+ has_rdoc:
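
To reproduce the release this metadata describes, pinning the version in a Gemfile should suffice:

  # Gemfile
  gem 'list_spider', '0.1.0'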