list_spider 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +16 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 624a5d306b193b1ece24867cc1ff8b8031931d90
|
4
|
+
data.tar.gz: 00b861e2d021dfd668275cf33a87b275fafa0def
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1e739921dc0b12d47a267cb5ed01ff626055d4e5ff5589c9ec9330906ab4db051079631a23671772af51a27d5dca9c03652b8f163e0028e4ea04f4dd382c6f29
|
7
|
+
data.tar.gz: dd6badf6fae1780525c5c4be335021624b3ecc943d7db12c60e9dd4c4bb21cfb731c30b24148c436858eb3617c1fd997c27ad01fea59e7dc5743e1b1dcdc94d2
|
data/lib/list_spider.rb
CHANGED
@@ -33,11 +33,11 @@ module ListSpider
|
|
33
33
|
RANDOM_TIME = -1
|
34
34
|
NO_LIMIT_CONCURRENT = -1
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
@random_time_range = 3..10
|
37
|
+
@conver_to_utf8 = false
|
38
|
+
@connection_opts = {connect_timeout: 2*60}
|
39
|
+
@overwrite_exist = false
|
40
|
+
@max_redirects = 10
|
41
41
|
@@url_set = Set.new
|
42
42
|
|
43
43
|
class << self
|
@@ -45,17 +45,17 @@ module ListSpider
|
|
45
45
|
attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
|
46
46
|
|
47
47
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
48
|
-
|
48
|
+
@connection_opts = {
|
49
49
|
:proxy => {
|
50
50
|
:host => proxy_addr,
|
51
51
|
:port => proxy_port
|
52
52
|
}
|
53
53
|
}
|
54
|
-
|
54
|
+
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
55
55
|
end
|
56
56
|
|
57
57
|
def connect_timeout(max_connect_time)
|
58
|
-
|
58
|
+
@connection_opts[:connect_timeout] = max_connect_time
|
59
59
|
end
|
60
60
|
|
61
61
|
def set_header_option(header_option)
|
@@ -70,19 +70,19 @@ module ListSpider
|
|
70
70
|
|
71
71
|
for_each_proc = proc do |e|
|
72
72
|
opt = {}
|
73
|
-
opt = {:redirects =>
|
73
|
+
opt = {:redirects => @max_redirects}
|
74
74
|
opt[:head] = @@header_option if defined? @@header_option
|
75
75
|
if e.http_method == :post
|
76
76
|
opt[:body] = e.params unless e.params.empty?
|
77
|
-
if
|
78
|
-
w = EventMachine::HttpRequest.new(e.href,
|
77
|
+
if @connection_opts
|
78
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
79
79
|
else
|
80
80
|
w = EventMachine::HttpRequest.new(e.href).post opt
|
81
81
|
end
|
82
82
|
else
|
83
|
-
if
|
83
|
+
if @connection_opts
|
84
84
|
opt[:query] = e.params unless e.params.empty?
|
85
|
-
w = EventMachine::HttpRequest.new(e.href,
|
85
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
86
86
|
else
|
87
87
|
w = EventMachine::HttpRequest.new(e.href).get opt
|
88
88
|
end
|
@@ -96,7 +96,7 @@ module ListSpider
|
|
96
96
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
97
97
|
begin
|
98
98
|
File.open(e.local_path, "w") do |f|
|
99
|
-
if
|
99
|
+
if @conver_to_utf8 == true
|
100
100
|
f << SpiderHelper.to_utf8( w.response)
|
101
101
|
else
|
102
102
|
f << w.response
|
@@ -173,7 +173,7 @@ module ListSpider
|
|
173
173
|
if @@inter_val != 0
|
174
174
|
if success_list.size != 0 || failed_list.size != 0
|
175
175
|
if @@inter_val == RANDOM_TIME
|
176
|
-
sleep(rand(
|
176
|
+
sleep(rand(@random_time_range))
|
177
177
|
else
|
178
178
|
sleep(@@inter_val)
|
179
179
|
end
|
@@ -201,7 +201,7 @@ module ListSpider
|
|
201
201
|
def filter_list(down_list)
|
202
202
|
need_down_list = []
|
203
203
|
down_list.each do |ts|
|
204
|
-
if
|
204
|
+
if !@overwrite_exist && File.exist?(ts.local_path)
|
205
205
|
ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
|
206
206
|
else
|
207
207
|
need_down_list << ts
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|