list_spider 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +16 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 624a5d306b193b1ece24867cc1ff8b8031931d90
|
4
|
+
data.tar.gz: 00b861e2d021dfd668275cf33a87b275fafa0def
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1e739921dc0b12d47a267cb5ed01ff626055d4e5ff5589c9ec9330906ab4db051079631a23671772af51a27d5dca9c03652b8f163e0028e4ea04f4dd382c6f29
|
7
|
+
data.tar.gz: dd6badf6fae1780525c5c4be335021624b3ecc943d7db12c60e9dd4c4bb21cfb731c30b24148c436858eb3617c1fd997c27ad01fea59e7dc5743e1b1dcdc94d2
|
data/lib/list_spider.rb
CHANGED
@@ -33,11 +33,11 @@ module ListSpider
|
|
33
33
|
RANDOM_TIME = -1
|
34
34
|
NO_LIMIT_CONCURRENT = -1
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
@random_time_range = 3..10
|
37
|
+
@conver_to_utf8 = false
|
38
|
+
@connection_opts = {connect_timeout: 2*60}
|
39
|
+
@overwrite_exist = false
|
40
|
+
@max_redirects = 10
|
41
41
|
@@url_set = Set.new
|
42
42
|
|
43
43
|
class << self
|
@@ -45,17 +45,17 @@ module ListSpider
|
|
45
45
|
attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
|
46
46
|
|
47
47
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
48
|
-
|
48
|
+
@connection_opts = {
|
49
49
|
:proxy => {
|
50
50
|
:host => proxy_addr,
|
51
51
|
:port => proxy_port
|
52
52
|
}
|
53
53
|
}
|
54
|
-
|
54
|
+
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
55
55
|
end
|
56
56
|
|
57
57
|
def connect_timeout(max_connect_time)
|
58
|
-
|
58
|
+
@connection_opts[:connect_timeout] = max_connect_time
|
59
59
|
end
|
60
60
|
|
61
61
|
def set_header_option(header_option)
|
@@ -70,19 +70,19 @@ module ListSpider
|
|
70
70
|
|
71
71
|
for_each_proc = proc do |e|
|
72
72
|
opt = {}
|
73
|
-
opt = {:redirects =>
|
73
|
+
opt = {:redirects => @max_redirects}
|
74
74
|
opt[:head] = @@header_option if defined? @@header_option
|
75
75
|
if e.http_method == :post
|
76
76
|
opt[:body] = e.params unless e.params.empty?
|
77
|
-
if
|
78
|
-
w = EventMachine::HttpRequest.new(e.href,
|
77
|
+
if @connection_opts
|
78
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
79
79
|
else
|
80
80
|
w = EventMachine::HttpRequest.new(e.href).post opt
|
81
81
|
end
|
82
82
|
else
|
83
|
-
if
|
83
|
+
if @connection_opts
|
84
84
|
opt[:query] = e.params unless e.params.empty?
|
85
|
-
w = EventMachine::HttpRequest.new(e.href,
|
85
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
86
86
|
else
|
87
87
|
w = EventMachine::HttpRequest.new(e.href).get opt
|
88
88
|
end
|
@@ -96,7 +96,7 @@ module ListSpider
|
|
96
96
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
97
97
|
begin
|
98
98
|
File.open(e.local_path, "w") do |f|
|
99
|
-
if
|
99
|
+
if @conver_to_utf8 == true
|
100
100
|
f << SpiderHelper.to_utf8( w.response)
|
101
101
|
else
|
102
102
|
f << w.response
|
@@ -173,7 +173,7 @@ module ListSpider
|
|
173
173
|
if @@inter_val != 0
|
174
174
|
if success_list.size != 0 || failed_list.size != 0
|
175
175
|
if @@inter_val == RANDOM_TIME
|
176
|
-
sleep(rand(
|
176
|
+
sleep(rand(@random_time_range))
|
177
177
|
else
|
178
178
|
sleep(@@inter_val)
|
179
179
|
end
|
@@ -201,7 +201,7 @@ module ListSpider
|
|
201
201
|
def filter_list(down_list)
|
202
202
|
need_down_list = []
|
203
203
|
down_list.each do |ts|
|
204
|
-
if
|
204
|
+
if !@overwrite_exist && File.exist?(ts.local_path)
|
205
205
|
ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
|
206
206
|
else
|
207
207
|
need_down_list << ts
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|