list_spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0f897ec31adc2d2a6c9713729030e8decc85cb9d
+   data.tar.gz: c81ed9cab5fc1bbd4395aeb9692924887042c27f
+ SHA512:
+   metadata.gz: 907529b23336256e2c72232ec2b413c50ebee99df49695bac368ae4dc029afaa81424fe542c63d6c7247714964382e2eaeaae0ea34ab51658e34ce67e5c7b9e7
+   data.tar.gz: 22dff7b012b1c12f8cefb50606e0a9b0874c5b013a305c268ae2a950374760ba77ed24af075f6ca5217e128d905b31bae754107e0cf063e0d90171dd51b94f20
lib/delete_unvalid.rb ADDED
@@ -0,0 +1,40 @@
+
+ class DeleteUnvalid
+   # 4033
+   # 920
+   def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     @dir_pattern = dir_pattern
+     @size_threshold = size_threshold
+     if cust_judge
+       @cust_judge = cust_judge
+     else
+       @cust_judge = method(:default_judge)
+     end
+     @total = 0
+   end
+
+   # default judge: treat files at or below the size threshold as invalid
+   def default_judge(f)
+     File.size(f) <= @size_threshold
+   end
+
+   def delete_unvalid(f)
+     if @cust_judge.call(f)
+       @total += 1
+       puts "deleted file: #{f}"
+       File.delete(f)
+     end
+   end
+
+   def start
+     Dir.glob(@dir_pattern) do |f|
+       delete_unvalid(f)
+     end
+     puts "delete total:#{@total}"
+   end
+
+   def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
+   end
+
+ end
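
For orientation, a minimal usage sketch of the class above; the glob pattern is hypothetical, and cust_judge may be any callable that returns true for files that should be deleted:

    require 'list_spider'

    # remove downloaded pages at or below the default 1000-byte threshold
    DeleteUnvalid.delete('download/**/*.html')

    # or judge by content instead of size: delete only empty files
    DeleteUnvalid.delete('download/**/*.html', cust_judge: ->(f) { File.zero?(f) })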
lib/list_spider.rb ADDED
@@ -0,0 +1,73 @@
+ require File.expand_path('../spider_base', __FILE__)
+ require File.expand_path('../delete_unvalid', __FILE__)
+
+ class ListSpider
+
+   RANDOM_TIME = -1
+   NO_LIMIT_CONCURRENT = -1
+
+   # class-level instance variable, so that the accessor defined in the
+   # singleton class below reads and writes this same value
+   @random_time_range = 3..10
+
+   include SpiderBase
+
+   def initialize(down_list, inter_val: 0, max: 30)
+     @down_list = down_list
+     @inter_val = inter_val
+     @max = max
+     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+     @succeed_size = 0
+     @failed_size = 0
+   end
+
+   attr_reader :succeed_size, :failed_size
+
+   class << self
+     attr_accessor :random_time_range
+   end
+
+   def add_task(task)
+     if task.is_a?(Array)
+       @down_list += task
+     elsif task.is_a?(TaskStruct)
+       @down_list << task
+     else
+       puts "error task type:#{task.class}"
+     end
+   end
+
+   def complete(multi, success_list, failed_list)
+     @succeed_size += success_list.size
+     @failed_size += failed_list.size
+     success_list.each do |e|
+       e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+     end
+
+     todo = @down_list.slice!(0, @max)
+     if todo.empty?
+       puts "success size:#{@succeed_size}"
+       puts "failed size:#{@failed_size}"
+       EventMachine.stop
+     else
+       if @inter_val != 0 && (success_list.size != 0 || failed_list.size != 0)
+         if @inter_val == RANDOM_TIME
+           sleep(rand(self.class.random_time_range))
+         else
+           sleep(@inter_val)
+         end
+       end
+       batch_down_list(todo, method(:complete))
+     end
+   end
+
+   def start
+     puts "total size:#{@down_list.size}"
+     event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
+   end
+
+   def self.get_list(down_list, inter_val: 0, max: 30)
+     ListSpider.new(down_list, inter_val: inter_val, max: max).start
+   end
+
+ end
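
A minimal usage sketch of ListSpider, assuming the hypothetical URLs and paths below; TaskStruct comes from spider_base.rb, and the parse callback receives the saved path, the task's extra_data, and the spider itself:

    require 'list_spider'

    parse = lambda do |local_path, extra_data, spider|
      puts "saved #{local_path} for item #{extra_data}"
    end

    list = (1..5).map do |i|
      TaskStruct.new("http://example.com/page#{i}.html",
                     "download/page#{i}.html",
                     extra_data: i, parse_method: parse)
    end

    # at most 30 concurrent downloads, sleeping 1 second between batches
    ListSpider.get_list(list, inter_val: 1, max: 30)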
lib/spider_base.rb ADDED
@@ -0,0 +1,272 @@
+ require 'em-http-request'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'set'
+ require File.expand_path('../spider_helper', __FILE__)
+ require "addressable/uri"
+
+ class TaskStruct
+   def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+     @href = href
+     @href = SpiderHelper.string_to_uri(@href) if @href.is_a?(String)
+     @local_path = local_path
+     @http_method = http_method
+     @params = params
+     @extra_data = extra_data
+     @parse_method = parse_method
+   end
+
+   def ==(o)
+     o.class == self.class && o.href == href && o.local_path == local_path &&
+       o.http_method == http_method && o.params == params && o.extra_data == extra_data
+   end
+
+   attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method
+
+ end
+
+ module SpiderBase
+
+   # class-level instance variables, so that the attr_accessor in the
+   # singleton class below reads and writes this same state
+   @conver_to_utf8 = false
+   @connection_opts = { connect_timeout: 2 * 60 }
+   @overwrite_exist = false
+   @max_redirects = 10
+
+   class << self
+
+     attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
+
+     def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+       @connection_opts = {
+         proxy: {
+           host: proxy_addr,
+           port: proxy_port
+         }
+       }
+       @connection_opts[:proxy][:authorization] = [username, password] if username && password
+     end
+
+     def connect_timeout(max_connect_time)
+       @connection_opts[:connect_timeout] = max_connect_time
+     end
+
+     def set_header_option(header_option)
+       @header_option = header_option
+     end
+
+     def event_machine_down(link_struct_list, callback = nil)
+       failed_list = []
+       succeed_list = []
+       multi = EventMachine::MultiRequest.new
+       no_job = true
+       begin_time = Time.now
+
+       for_each_proc = proc do |e|
+         if !@overwrite_exist && File.exist?(e.local_path)
+           succeed_list << e
+         else
+           no_job = false
+           opt = { redirects: @max_redirects }
+           opt[:head] = @header_option if @header_option
+           if e.http_method == :post
+             opt[:body] = e.params unless e.params.empty?
+             w = EventMachine::HttpRequest.new(e.href, @connection_opts).post(opt)
+           else
+             opt[:query] = e.params unless e.params.empty?
+             w = EventMachine::HttpRequest.new(e.href, @connection_opts).get(opt)
+           end
+
+           w.callback {
+             s = w.response_header.status
+             puts s
+             if s == 403 || s == 502 # Forbidden
+               # EventMachine.stop
+             elsif s != 404
+               local_dir = File.dirname(e.local_path)
+               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+               begin
+                 File.open(e.local_path, "w") do |f|
+                   if @conver_to_utf8 == true
+                     f << SpiderHelper.to_utf8(w.response)
+                   else
+                     f << w.response
+                   end
+                 end
+                 succeed_list << e
+               rescue Exception => ex # use ex so the task variable e is not shadowed
+                 puts ex
+               end
+             end
+           }
+           w.errback {
+             puts "errback:#{w.response_header}"
+             puts e.href
+             puts w.response_header.status
+             failed_list << e
+             # fall back to a plain Net::HTTP download
+             if e.http_method == :get
+               SpiderHelper.direct_http_get(e.href, e.local_path)
+             elsif e.http_method == :post
+               SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+             end
+           }
+           multi.add(e.local_path, w)
+         end
+       end
+
+       cb = Proc.new do
+         end_time = Time.now
+         puts "use time:#{end_time - begin_time} seconds"
+         if callback.nil?
+           puts "success size:#{succeed_list.size}"
+           puts "failed size:#{failed_list.size}"
+           EventMachine.stop
+         else
+           callback.call(multi, succeed_list, failed_list)
+         end
+       end
+
+       after_proc = proc {
+         if no_job # nothing was queued, so invoke the callback directly
+           cb.call
+         else
+           multi.callback(&cb)
+         end
+       }
+
+       link_struct_list.each(&for_each_proc)
+       after_proc.call
+     end
+
+     def event_machine_start(url, down_dir, file_name, callback = nil)
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+       down_list = []
+       down_list << TaskStruct.new(url, down_dir + file_name)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+     def event_machine_start_list(down_list, callback = nil)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+   end # self end
+ end # SpiderBase end
+
+ def batch_down_list(down_list, callback = nil)
+   SpiderBase.event_machine_down(down_list, callback)
+ end
+
+ def event_machine_start_list(down_list, callback = nil)
+   SpiderBase.event_machine_start_list(down_list, callback)
+ end
+
+ def parse_down_load_url(url, down_dir, file_name, callback = nil)
+   SpiderBase.event_machine_start(url, down_dir, file_name, callback)
+ end
+
+ class GetRelative
+
+   def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
+     @get_depth = get_depth
+     @base_url = base_url
+     @down_dir = down_dir
+     @suffix = suffix
+   end
+
+   def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     puts "get index complete"
+     if succeed_list.size > 0
+       link_list = []
+       succeed_list.each do |e|
+         doc = Nokogiri::HTML(File.open(e.local_path))
+         link_list.concat(doc.css("a"))
+       end
+       puts "extract href complete"
+
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+
+       down_list = []
+       set_list = Set.new
+       link_list.each do |link|
+         href = link['href']
+         next if href.nil? || !href.include?(@suffix)
+         # keep everything up to the suffix, e.g. "scheme_2.html#SEC15"
+         href = href[0, href.index(@suffix) + @suffix.length]
+         # strip a leading "./", e.g. "./preface.html"
+         href = href[2..-1] if href.start_with?("./")
+
+         next if !set_list.add?(href)
+         unless base_url.end_with?("/")
+           i = base_url.rindex("/")
+           base_url = base_url[0..i]
+         end
+
+         # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
+         next if href.start_with?("http:") || href.start_with?("https:")
+
+         local_path = down_dir + href
+
+         down_list.push(TaskStruct.new(base_url + href, local_path))
+       end
+       puts "down list complete,size:#{down_list.size}"
+       batch_down_list(down_list, callback)
+     end
+   end
+
+   def down_other_node(multi, succeed_list, failed_list)
+     puts "down_other_node"
+     @get_depth = @get_depth - 1
+     puts "depth:#{@get_depth}"
+     if @get_depth <= 0
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
+     else
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
+     end
+   end
+
+   def event_all_complete(multi, succeed_list, failed_list)
+     puts "all complete"
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     EventMachine.stop
+   end
+
+   attr_writer :get_depth, :base_url, :down_dir
+
+   def start
+     index_file_name = "index.html"
+     # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
+     unless @base_url.end_with?("/")
+       i = @base_url.rindex("/")
+       index_file_name = @base_url[i + 1..-1]
+     end
+
+     @get_depth = @get_depth - 1
+     puts @get_depth
+     if @get_depth <= 0
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
+     else
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
+     end
+   end
+
+   def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
+     GetRelative.new(base_url, down_dir, get_depth, suffix).start
+   end
+ end # GetRelative
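
A minimal end-to-end sketch of the pieces above, with a hypothetical URL and download directory; GetRelative at depth 2 fetches the index page and then every relative link on it ending in .html:

    require 'list_spider'

    SpiderBase.overwrite_exist = false        # skip files already on disk
    SpiderBase.connect_timeout(60)            # seconds
    # SpiderBase.set_proxy('127.0.0.1', 8080) # optional HTTP proxy

    GetRelative.Get('http://example.com/book/index.html', 'download/book', 2)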
lib/spider_helper.rb ADDED
@@ -0,0 +1,103 @@
+ require 'rchardet'
+ require 'net/http'
+ require 'fileutils'
+ require 'addressable/uri'
+
+ module SpiderHelper
+
+   class << self
+
+     def direct_http_get(href, local_path, params: nil, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         href.query = URI.encode_www_form(params) if params
+         req = Net::HTTP::Get.new(href)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+           puts "succeed"
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e.backtrace
+         puts e
+       end
+     end
+
+     def direct_http_post(href, local_path, params, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         req = Net::HTTP::Post.new(href)
+         req.set_form_data(params)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e
+       end
+     end
+
+     def extract_href_last(origin_href)
+       origin_href.split('/')[-1]
+     end
+
+     def string_to_uri(href)
+       l = href.dup
+       l.sub!('http:///', 'http://') if l.start_with?('http:///')
+       l = Addressable::URI.parse(l)
+       l.normalize!
+     end
+
+     BomHeaderMap = { "UTF-8"    => "\xEF\xBB\xBF".force_encoding("UTF-8"),
+                      "UTF-16BE" => "\xFE\xFF".force_encoding("UTF-16BE"),
+                      "UTF-16LE" => "\xFF\xFE".force_encoding("UTF-16LE"),
+                      "UTF-32BE" => "\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
+                      "UTF-32LE" => "\xFF\xFE\x00\x00".force_encoding("UTF-32LE") }
+
+     # this encoding check is sometimes wrong; call to_utf8 directly
+     # to force the conversion
+     def smart_to_utf8(str)
+       return str if str.encoding == Encoding::UTF_8
+       to_utf8(str)
+     end
+
+     def to_utf8(str)
+       # on Windows the CharDet library returns ASCII-8BIT strings that are
+       # not UTF-8 compatible, so detect the encoding from the raw bytes
+       str.force_encoding(Encoding::ASCII_8BIT)
+       cd = CharDet.detect(str)
+       if cd["confidence"] > 0.6
+         puts cd["encoding"]
+         str.force_encoding(cd["encoding"])
+         # strip the BOM if one is present
+         bom_header = BomHeaderMap[cd["encoding"]]
+         str.sub!(bom_header, "") if bom_header
+       end
+       str.encode!(Encoding::UTF_8, undef: :replace, replace: "?", invalid: :replace)
+
+       str
+     end
+
+   end
+
+ end
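
The helpers above can also be used standalone; a small sketch with a hypothetical URL and paths (direct_http_get is the same synchronous fallback that the errback in spider_base.rb relies on):

    require 'list_spider'

    uri = SpiderHelper.string_to_uri('http://example.com/some page.html')
    SpiderHelper.direct_http_get(uri, 'download/some_page.html',
                                 header: { 'User-Agent' => 'list_spider' })

    # force the saved bytes to UTF-8, detecting the source encoding
    text = SpiderHelper.to_utf8(File.binread('download/some_page.html'))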
metadata ADDED
@@ -0,0 +1,88 @@
+ --- !ruby/object:Gem::Specification
+ name: list_spider
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Charles Zhang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-04-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: em-http-request
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+ description: A url list spider based on em-http-request.
+ email: gis05zc@163.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/delete_unvalid.rb
+ - lib/list_spider.rb
+ - lib/spider_base.rb
+ - lib/spider_helper.rb
+ homepage: https://github.com/chinazhangchao/list_spider
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.6.4
+ signing_key:
+ specification_version: 4
+ summary: List Spider
+ test_files: []
+ has_rdoc: