list_spider 0.3.4 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5f95841ea2cee7bec53f2fec3ef4400148714653
4
- data.tar.gz: 3be30dba8ca687741c0639e050f9eac5b35102b0
3
+ metadata.gz: 41ff55315cb0b0cbb03b8ad9a11001e0c410935d
4
+ data.tar.gz: c3a79e8a3bebddf021cb0c9e13bed61be086bd33
5
5
  SHA512:
6
- metadata.gz: 98991f0a63168037d6007a56864e44b1c32a3764f7281f32051cf572ccd699e928bbd0adc8141217d2286092ddaa236b8e5bc3cc2f54e70393ed2d28d5cf8d2a
7
- data.tar.gz: e04cd448af2fd6fc9f072ceefafd244ebcde6f0267b7be4d5a8c0fedd9c4dca3e2f7d813484b58f503d470639b27977f730fb97061d35a3d0a89cf82bed9e219
6
+ metadata.gz: 20a16743baa0b56ac3438493e8e4ecc6a84c6e4cd933cf23c6a2e7b16906ebd3e425702808813d6634f66af857155c14867a2a4edaeb2dabeaa3c006e847771e
7
+ data.tar.gz: 4259ce7eea88f064a946a424542d3db30a830e0ed379ffadde7a78b7fd89ffe99099ed1caa0770edb313b5e22e358c04bb1e76a30065662a07624f3abbeb45aa
data/lib/file_filter.rb CHANGED
File without changes
data/lib/list_spider.rb CHANGED
@@ -8,7 +8,6 @@ require File.expand_path('../file_filter', __FILE__)
8
8
 
9
9
  class TaskStruct
10
10
  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
- @origin_href = href
12
11
  @href = href
13
12
  @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
14
13
  @local_path = local_path
@@ -23,7 +22,7 @@ class TaskStruct
23
22
  other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
24
23
  end
25
24
 
26
- attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
25
+ attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
27
26
  end
28
27
 
29
28
  module ListSpider
@@ -33,14 +32,14 @@ module ListSpider
33
32
  DEFAULT_INTERVAL = 0
34
33
 
35
34
  @random_time_range = 3..10
36
- @conver_to_utf8 = false
35
+ @convert_to_utf8 = false
37
36
  @connection_opts = { connect_timeout: 60 }
38
37
  @overwrite_exist = false
39
38
  @max_redirects = 10
40
39
  @local_path_set = Set.new
41
40
 
42
41
  class << self
43
- attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
42
+ attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
44
43
 
45
44
  def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
46
45
  @connection_opts = {
@@ -102,7 +101,7 @@ module ListSpider
102
101
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
103
102
  begin
104
103
  File.open(e.local_path, 'wb') do |f|
105
- f << if @conver_to_utf8 == true
104
+ f << if @convert_to_utf8 == true
106
105
  SpiderHelper.to_utf8(w.response)
107
106
  else
108
107
  w.response
@@ -115,15 +114,21 @@ module ListSpider
115
114
  end
116
115
  end
117
116
  w.errback do
118
- puts "errback:#{w.response_header}"
119
- puts e.origin_href
117
+ puts "errback:#{w.response_header},retry..."
120
118
  puts e.href
121
119
  puts w.response_header.status
122
- failed_list << e
120
+
121
+ ret = false
123
122
  if e.http_method == :get
124
- SpiderHelper.direct_http_get(e.href, e.local_path)
123
+ ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
125
124
  elsif e.http_method == :post
126
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
125
+ ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
+ end
127
+
128
+ if ret
129
+ succeed_list << e
130
+ else
131
+ failed_list << e
127
132
  end
128
133
  end
129
134
 
data/lib/spider_helper.rb CHANGED
@@ -3,7 +3,7 @@ require 'net/http'
3
3
 
4
4
  module SpiderHelper
5
5
  class << self
6
- def direct_http_get(href, local_path, params: nil, header: nil)
6
+ def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
7
7
  href = string_to_uri(href) if href.class == ''.class
8
8
 
9
9
  begin
@@ -19,18 +19,23 @@ module SpiderHelper
19
19
  if res.is_a?(Net::HTTPSuccess)
20
20
  local_dir = File.dirname(local_path)
21
21
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
22
- File.write(local_path, res.body)
22
+ content = res.body
23
+ content = to_utf8(content) if convert_to_utf8
24
+ File.write(local_path, content)
23
25
  puts 'succeed'
26
+ return true
24
27
  else
25
28
  puts res
26
29
  end
27
30
  rescue => e
28
31
  puts e.backtrace
29
32
  puts e
33
+ false
30
34
  end
35
+ false
31
36
  end
32
37
 
33
- def direct_http_post(href, local_path, params, header: nil)
38
+ def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
34
39
  href = string_to_uri(href) if href.class == ''.class
35
40
 
36
41
  begin
@@ -46,13 +51,19 @@ module SpiderHelper
46
51
  if res.is_a?(Net::HTTPSuccess)
47
52
  local_dir = File.dirname(local_path)
48
53
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
49
- File.write(local_path, res.body)
54
+ content = res.body
55
+ content = to_utf8(content) if convert_to_utf8
56
+ File.write(local_path, content)
57
+ puts 'succeed'
58
+ return true
50
59
  else
51
60
  puts res
52
61
  end
53
62
  rescue => e
54
63
  puts e
64
+ false
55
65
  end
66
+ false
56
67
  end
57
68
 
58
69
  def extract_href_last(origin_href)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-19 00:00:00.000000000 Z
11
+ date: 2016-12-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request
@@ -99,9 +99,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
99
  version: '0'
100
100
  requirements: []
101
101
  rubyforge_project:
102
- rubygems_version: 2.6.4
102
+ rubygems_version: 2.4.5
103
103
  signing_key:
104
104
  specification_version: 4
105
105
  summary: List Spider
106
106
  test_files: []
107
- has_rdoc: