list_spider 0.3.4 → 0.3.5

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5f95841ea2cee7bec53f2fec3ef4400148714653
4
- data.tar.gz: 3be30dba8ca687741c0639e050f9eac5b35102b0
3
+ metadata.gz: 41ff55315cb0b0cbb03b8ad9a11001e0c410935d
4
+ data.tar.gz: c3a79e8a3bebddf021cb0c9e13bed61be086bd33
5
5
  SHA512:
6
- metadata.gz: 98991f0a63168037d6007a56864e44b1c32a3764f7281f32051cf572ccd699e928bbd0adc8141217d2286092ddaa236b8e5bc3cc2f54e70393ed2d28d5cf8d2a
7
- data.tar.gz: e04cd448af2fd6fc9f072ceefafd244ebcde6f0267b7be4d5a8c0fedd9c4dca3e2f7d813484b58f503d470639b27977f730fb97061d35a3d0a89cf82bed9e219
6
+ metadata.gz: 20a16743baa0b56ac3438493e8e4ecc6a84c6e4cd933cf23c6a2e7b16906ebd3e425702808813d6634f66af857155c14867a2a4edaeb2dabeaa3c006e847771e
7
+ data.tar.gz: 4259ce7eea88f064a946a424542d3db30a830e0ed379ffadde7a78b7fd89ffe99099ed1caa0770edb313b5e22e358c04bb1e76a30065662a07624f3abbeb45aa
data/lib/file_filter.rb CHANGED
File without changes
data/lib/list_spider.rb CHANGED
@@ -8,7 +8,6 @@ require File.expand_path('../file_filter', __FILE__)
8
8
 
9
9
  class TaskStruct
10
10
  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
- @origin_href = href
12
11
  @href = href
13
12
  @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
14
13
  @local_path = local_path
@@ -23,7 +22,7 @@ class TaskStruct
23
22
  other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
24
23
  end
25
24
 
26
- attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
25
+ attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
27
26
  end
28
27
 
29
28
  module ListSpider
@@ -33,14 +32,14 @@ module ListSpider
33
32
  DEFAULT_INTERVAL = 0
34
33
 
35
34
  @random_time_range = 3..10
36
- @conver_to_utf8 = false
35
+ @convert_to_utf8 = false
37
36
  @connection_opts = { connect_timeout: 60 }
38
37
  @overwrite_exist = false
39
38
  @max_redirects = 10
40
39
  @local_path_set = Set.new
41
40
 
42
41
  class << self
43
- attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
42
+ attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
44
43
 
45
44
  def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
46
45
  @connection_opts = {
@@ -102,7 +101,7 @@ module ListSpider
102
101
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
103
102
  begin
104
103
  File.open(e.local_path, 'wb') do |f|
105
- f << if @conver_to_utf8 == true
104
+ f << if @convert_to_utf8 == true
106
105
  SpiderHelper.to_utf8(w.response)
107
106
  else
108
107
  w.response
@@ -115,15 +114,21 @@ module ListSpider
115
114
  end
116
115
  end
117
116
  w.errback do
118
- puts "errback:#{w.response_header}"
119
- puts e.origin_href
117
+ puts "errback:#{w.response_header},retry..."
120
118
  puts e.href
121
119
  puts w.response_header.status
122
- failed_list << e
120
+
121
+ ret = false
123
122
  if e.http_method == :get
124
- SpiderHelper.direct_http_get(e.href, e.local_path)
123
+ ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
125
124
  elsif e.http_method == :post
126
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
125
+ ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
+ end
127
+
128
+ if ret
129
+ succeed_list << e
130
+ else
131
+ failed_list << e
127
132
  end
128
133
  end
129
134
 
data/lib/spider_helper.rb CHANGED
@@ -3,7 +3,7 @@ require 'net/http'
3
3
 
4
4
  module SpiderHelper
5
5
  class << self
6
- def direct_http_get(href, local_path, params: nil, header: nil)
6
+ def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
7
7
  href = string_to_uri(href) if href.class == ''.class
8
8
 
9
9
  begin
@@ -19,18 +19,23 @@ module SpiderHelper
19
19
  if res.is_a?(Net::HTTPSuccess)
20
20
  local_dir = File.dirname(local_path)
21
21
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
22
- File.write(local_path, res.body)
22
+ content = res.body
23
+ content = to_utf8(content) if convert_to_utf8
24
+ File.write(local_path, content)
23
25
  puts 'succeed'
26
+ return true
24
27
  else
25
28
  puts res
26
29
  end
27
30
  rescue => e
28
31
  puts e.backtrace
29
32
  puts e
33
+ false
30
34
  end
35
+ false
31
36
  end
32
37
 
33
- def direct_http_post(href, local_path, params, header: nil)
38
+ def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
34
39
  href = string_to_uri(href) if href.class == ''.class
35
40
 
36
41
  begin
@@ -46,13 +51,19 @@ module SpiderHelper
46
51
  if res.is_a?(Net::HTTPSuccess)
47
52
  local_dir = File.dirname(local_path)
48
53
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
49
- File.write(local_path, res.body)
54
+ content = res.body
55
+ content = to_utf8(content) if convert_to_utf8s
56
+ File.write(local_path, content)
57
+ puts 'succeed'
58
+ return true
50
59
  else
51
60
  puts res
52
61
  end
53
62
  rescue => e
54
63
  puts e
64
+ false
55
65
  end
66
+ false
56
67
  end
57
68
 
58
69
  def extract_href_last(origin_href)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-19 00:00:00.000000000 Z
11
+ date: 2016-12-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request
@@ -99,9 +99,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
99
  version: '0'
100
100
  requirements: []
101
101
  rubyforge_project:
102
- rubygems_version: 2.6.4
102
+ rubygems_version: 2.4.5
103
103
  signing_key:
104
104
  specification_version: 4
105
105
  summary: List Spider
106
106
  test_files: []
107
- has_rdoc: