list_spider 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +0 -0
- data/lib/list_spider.rb +15 -10
- data/lib/spider_helper.rb +15 -4
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 41ff55315cb0b0cbb03b8ad9a11001e0c410935d
|
4
|
+
data.tar.gz: c3a79e8a3bebddf021cb0c9e13bed61be086bd33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20a16743baa0b56ac3438493e8e4ecc6a84c6e4cd933cf23c6a2e7b16906ebd3e425702808813d6634f66af857155c14867a2a4edaeb2dabeaa3c006e847771e
|
7
|
+
data.tar.gz: 4259ce7eea88f064a946a424542d3db30a830e0ed379ffadde7a78b7fd89ffe99099ed1caa0770edb313b5e22e358c04bb1e76a30065662a07624f3abbeb45aa
|
data/lib/file_filter.rb
CHANGED
File without changes
|
data/lib/list_spider.rb
CHANGED
@@ -8,7 +8,6 @@ require File.expand_path('../file_filter', __FILE__)
|
|
8
8
|
|
9
9
|
class TaskStruct
|
10
10
|
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
|
-
@origin_href = href
|
12
11
|
@href = href
|
13
12
|
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
14
13
|
@local_path = local_path
|
@@ -23,7 +22,7 @@ class TaskStruct
|
|
23
22
|
other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
|
24
23
|
end
|
25
24
|
|
26
|
-
attr_accessor :
|
25
|
+
attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
27
26
|
end
|
28
27
|
|
29
28
|
module ListSpider
|
@@ -33,14 +32,14 @@ module ListSpider
|
|
33
32
|
DEFAULT_INTERVAL = 0
|
34
33
|
|
35
34
|
@random_time_range = 3..10
|
36
|
-
@
|
35
|
+
@convert_to_utf8 = false
|
37
36
|
@connection_opts = { connect_timeout: 60 }
|
38
37
|
@overwrite_exist = false
|
39
38
|
@max_redirects = 10
|
40
39
|
@local_path_set = Set.new
|
41
40
|
|
42
41
|
class << self
|
43
|
-
attr_accessor :
|
42
|
+
attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
|
44
43
|
|
45
44
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
46
45
|
@connection_opts = {
|
@@ -102,7 +101,7 @@ module ListSpider
|
|
102
101
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
102
|
begin
|
104
103
|
File.open(e.local_path, 'wb') do |f|
|
105
|
-
f << if @
|
104
|
+
f << if @convert_to_utf8 == true
|
106
105
|
SpiderHelper.to_utf8(w.response)
|
107
106
|
else
|
108
107
|
w.response
|
@@ -115,15 +114,21 @@ module ListSpider
|
|
115
114
|
end
|
116
115
|
end
|
117
116
|
w.errback do
|
118
|
-
puts "errback:#{w.response_header}"
|
119
|
-
puts e.origin_href
|
117
|
+
puts "errback:#{w.response_header},retry..."
|
120
118
|
puts e.href
|
121
119
|
puts w.response_header.status
|
122
|
-
|
120
|
+
|
121
|
+
ret = false
|
123
122
|
if e.http_method == :get
|
124
|
-
SpiderHelper.direct_http_get(e.href, e.local_path)
|
123
|
+
ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
|
125
124
|
elsif e.http_method == :post
|
126
|
-
|
125
|
+
ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
|
126
|
+
end
|
127
|
+
|
128
|
+
if ret
|
129
|
+
succeed_list << e
|
130
|
+
else
|
131
|
+
failed_list << e
|
127
132
|
end
|
128
133
|
end
|
129
134
|
|
data/lib/spider_helper.rb
CHANGED
@@ -3,7 +3,7 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
5
|
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil, header: nil)
|
6
|
+
def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
|
7
7
|
href = string_to_uri(href) if href.class == ''.class
|
8
8
|
|
9
9
|
begin
|
@@ -19,18 +19,23 @@ module SpiderHelper
|
|
19
19
|
if res.is_a?(Net::HTTPSuccess)
|
20
20
|
local_dir = File.dirname(local_path)
|
21
21
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
22
|
-
|
22
|
+
content = res.body
|
23
|
+
content = to_utf8(content) if convert_to_utf8
|
24
|
+
File.write(local_path, content)
|
23
25
|
puts 'succeed'
|
26
|
+
return true
|
24
27
|
else
|
25
28
|
puts res
|
26
29
|
end
|
27
30
|
rescue => e
|
28
31
|
puts e.backtrace
|
29
32
|
puts e
|
33
|
+
false
|
30
34
|
end
|
35
|
+
false
|
31
36
|
end
|
32
37
|
|
33
|
-
def direct_http_post(href, local_path, params, header: nil)
|
38
|
+
def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
|
34
39
|
href = string_to_uri(href) if href.class == ''.class
|
35
40
|
|
36
41
|
begin
|
@@ -46,13 +51,19 @@ module SpiderHelper
|
|
46
51
|
if res.is_a?(Net::HTTPSuccess)
|
47
52
|
local_dir = File.dirname(local_path)
|
48
53
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
49
|
-
|
54
|
+
content = res.body
|
55
|
+
content = to_utf8(content) if convert_to_utf8s
|
56
|
+
File.write(local_path, content)
|
57
|
+
puts 'succeed'
|
58
|
+
return true
|
50
59
|
else
|
51
60
|
puts res
|
52
61
|
end
|
53
62
|
rescue => e
|
54
63
|
puts e
|
64
|
+
false
|
55
65
|
end
|
66
|
+
false
|
56
67
|
end
|
57
68
|
|
58
69
|
def extract_href_last(origin_href)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|
@@ -99,9 +99,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
99
|
version: '0'
|
100
100
|
requirements: []
|
101
101
|
rubyforge_project:
|
102
|
-
rubygems_version: 2.
|
102
|
+
rubygems_version: 2.4.5
|
103
103
|
signing_key:
|
104
104
|
specification_version: 4
|
105
105
|
summary: List Spider
|
106
106
|
test_files: []
|
107
|
-
has_rdoc:
|