list_spider 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +0 -0
- data/lib/list_spider.rb +15 -10
- data/lib/spider_helper.rb +15 -4
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 41ff55315cb0b0cbb03b8ad9a11001e0c410935d
|
4
|
+
data.tar.gz: c3a79e8a3bebddf021cb0c9e13bed61be086bd33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20a16743baa0b56ac3438493e8e4ecc6a84c6e4cd933cf23c6a2e7b16906ebd3e425702808813d6634f66af857155c14867a2a4edaeb2dabeaa3c006e847771e
|
7
|
+
data.tar.gz: 4259ce7eea88f064a946a424542d3db30a830e0ed379ffadde7a78b7fd89ffe99099ed1caa0770edb313b5e22e358c04bb1e76a30065662a07624f3abbeb45aa
|
data/lib/file_filter.rb
CHANGED
File without changes
|
data/lib/list_spider.rb
CHANGED
@@ -8,7 +8,6 @@ require File.expand_path('../file_filter', __FILE__)
|
|
8
8
|
|
9
9
|
class TaskStruct
|
10
10
|
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
|
-
@origin_href = href
|
12
11
|
@href = href
|
13
12
|
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
14
13
|
@local_path = local_path
|
@@ -23,7 +22,7 @@ class TaskStruct
|
|
23
22
|
other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
|
24
23
|
end
|
25
24
|
|
26
|
-
attr_accessor :
|
25
|
+
attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
27
26
|
end
|
28
27
|
|
29
28
|
module ListSpider
|
@@ -33,14 +32,14 @@ module ListSpider
|
|
33
32
|
DEFAULT_INTERVAL = 0
|
34
33
|
|
35
34
|
@random_time_range = 3..10
|
36
|
-
@
|
35
|
+
@convert_to_utf8 = false
|
37
36
|
@connection_opts = { connect_timeout: 60 }
|
38
37
|
@overwrite_exist = false
|
39
38
|
@max_redirects = 10
|
40
39
|
@local_path_set = Set.new
|
41
40
|
|
42
41
|
class << self
|
43
|
-
attr_accessor :
|
42
|
+
attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
|
44
43
|
|
45
44
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
46
45
|
@connection_opts = {
|
@@ -102,7 +101,7 @@ module ListSpider
|
|
102
101
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
102
|
begin
|
104
103
|
File.open(e.local_path, 'wb') do |f|
|
105
|
-
f << if @
|
104
|
+
f << if @convert_to_utf8 == true
|
106
105
|
SpiderHelper.to_utf8(w.response)
|
107
106
|
else
|
108
107
|
w.response
|
@@ -115,15 +114,21 @@ module ListSpider
|
|
115
114
|
end
|
116
115
|
end
|
117
116
|
w.errback do
|
118
|
-
puts "errback:#{w.response_header}"
|
119
|
-
puts e.origin_href
|
117
|
+
puts "errback:#{w.response_header},retry..."
|
120
118
|
puts e.href
|
121
119
|
puts w.response_header.status
|
122
|
-
|
120
|
+
|
121
|
+
ret = false
|
123
122
|
if e.http_method == :get
|
124
|
-
SpiderHelper.direct_http_get(e.href, e.local_path)
|
123
|
+
ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
|
125
124
|
elsif e.http_method == :post
|
126
|
-
|
125
|
+
ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
|
126
|
+
end
|
127
|
+
|
128
|
+
if ret
|
129
|
+
succeed_list << e
|
130
|
+
else
|
131
|
+
failed_list << e
|
127
132
|
end
|
128
133
|
end
|
129
134
|
|
data/lib/spider_helper.rb
CHANGED
@@ -3,7 +3,7 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
5
|
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil, header: nil)
|
6
|
+
def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
|
7
7
|
href = string_to_uri(href) if href.class == ''.class
|
8
8
|
|
9
9
|
begin
|
@@ -19,18 +19,23 @@ module SpiderHelper
|
|
19
19
|
if res.is_a?(Net::HTTPSuccess)
|
20
20
|
local_dir = File.dirname(local_path)
|
21
21
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
22
|
-
|
22
|
+
content = res.body
|
23
|
+
content = to_utf8(content) if convert_to_utf8
|
24
|
+
File.write(local_path, content)
|
23
25
|
puts 'succeed'
|
26
|
+
return true
|
24
27
|
else
|
25
28
|
puts res
|
26
29
|
end
|
27
30
|
rescue => e
|
28
31
|
puts e.backtrace
|
29
32
|
puts e
|
33
|
+
false
|
30
34
|
end
|
35
|
+
false
|
31
36
|
end
|
32
37
|
|
33
|
-
def direct_http_post(href, local_path, params, header: nil)
|
38
|
+
def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
|
34
39
|
href = string_to_uri(href) if href.class == ''.class
|
35
40
|
|
36
41
|
begin
|
@@ -46,13 +51,19 @@ module SpiderHelper
|
|
46
51
|
if res.is_a?(Net::HTTPSuccess)
|
47
52
|
local_dir = File.dirname(local_path)
|
48
53
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
49
|
-
|
54
|
+
content = res.body
|
55
|
+
content = to_utf8(content) if convert_to_utf8s
|
56
|
+
File.write(local_path, content)
|
57
|
+
puts 'succeed'
|
58
|
+
return true
|
50
59
|
else
|
51
60
|
puts res
|
52
61
|
end
|
53
62
|
rescue => e
|
54
63
|
puts e
|
64
|
+
false
|
55
65
|
end
|
66
|
+
false
|
56
67
|
end
|
57
68
|
|
58
69
|
def extract_href_last(origin_href)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|
@@ -99,9 +99,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
99
|
version: '0'
|
100
100
|
requirements: []
|
101
101
|
rubyforge_project:
|
102
|
-
rubygems_version: 2.
|
102
|
+
rubygems_version: 2.4.5
|
103
103
|
signing_key:
|
104
104
|
specification_version: 4
|
105
105
|
summary: List Spider
|
106
106
|
test_files: []
|
107
|
-
has_rdoc:
|