http_crawler 0.2.3.1 → 0.2.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
4
- data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
3
+ metadata.gz: c93afa1f523cd20478bf06f854f1588a2ec9e248
4
+ data.tar.gz: 55fcaf8ad7870b6fdf89a63731b1ea8814083cc1
5
5
  SHA512:
6
- metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
7
- data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
6
+ metadata.gz: c72785e4e5ce5a19d62f0371df23b7ad18e2446e50f89a02bff8cc8953e074713b0ad995b22b6c8e39adbffeef83783d889caa90b3de53bf701416f0e2035590
7
+ data.tar.gz: f092ae6bb1a5cfe704675b1cf8792a93a01dd9365486a89c5fd3663079019ecffb68931f2c25ccef6ee4356f735a2dc03c32539511f6309b20424650d35e0b99
@@ -46,6 +46,11 @@ module HttpCrawler
46
46
 
47
47
  end
48
48
 
49
+ # 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
50
+ def add_error_url(url_string)
51
+ @http.error_urls << url_string
52
+ end
53
+
49
54
  # init_uri 如果未初始化@uri,则会报错
50
55
  # 继承类需要实现 @uri = URI("http://host")
51
56
  #
@@ -9,12 +9,15 @@ module HttpCrawler
9
9
  attr_accessor :proxy_key
10
10
  # 请求错误后的重复最大请求次数
11
11
  attr_accessor :max_error_num
12
+ # 错误的url地址,存的是正则
13
+ attr_accessor :error_urls
12
14
 
13
15
  def initialize(address, port = nil)
14
16
  super(address, port)
15
17
  @max_error_num = 2
16
18
  @error_num = 0
17
19
  @proxy_key = "default"
20
+ @error_urls = []
18
21
  end
19
22
 
20
23
  def http_error_sleep
@@ -54,48 +57,61 @@ module HttpCrawler
54
57
  # 通过调用 api 获取代理或者通过自定义设置代理
55
58
  def get_proxy
56
59
 
57
- while @@proxy_list.blank?
58
- Rails.logger.debug("@@proxy_list 为空进行更新")
60
+ # while @@proxy_list.blank?
61
+ # Rails.logger.debug("@@proxy_list 为空进行更新")
62
+ # proxy_client = HttpCrawler::Proxy.for(proxy_api)
63
+ # proxy_r = proxy_client.get_proxy(key: proxy_key)
64
+ # @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
65
+ # Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
66
+ # sleep(1)
67
+ # end
68
+ # p = @@proxy_list.delete_at(0)
69
+
70
+ proxy_ip = nil
71
+ begin
72
+ Rails.logger.debug("开始获取代理IP")
59
73
  proxy_client = HttpCrawler::Proxy.for(proxy_api)
60
74
  proxy_r = proxy_client.get_proxy(key: proxy_key)
61
- @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
62
- Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
63
- sleep(1)
64
- end
65
-
66
- p = @@proxy_list.delete_at(0)
75
+ proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
76
+ if proxy_ip.blank?
77
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
78
+ else
79
+ break
80
+ end
81
+ sleep(5)
82
+ end while true
67
83
 
68
- Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{p}")
84
+ Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
69
85
 
70
- unless p && p["p_addr"] && p["p_port"]
86
+ unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
71
87
  Rails.logger.warn "无最新代理等待5秒后重新获取"
72
88
  sleep(5)
73
- p = get_proxy
89
+ proxy_ip = get_proxy
74
90
  end
75
91
 
76
- if (@proxy_address == p["p_addr"] && @proxy_port == p["p_port"])
92
+ if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
77
93
  Rails.logger.warn "无最新代理等待5秒后重新获取"
78
94
  sleep(5)
79
- p = get_proxy
95
+ proxy_ip = get_proxy
80
96
  end
81
- p
97
+ proxy_ip
82
98
  end
83
99
 
84
- def update_proxy(p = {})
85
- if p.blank?
100
+ def update_proxy(proxy_ip = {})
101
+ if proxy_ip.blank?
86
102
  proxy(get_proxy)
87
103
  else
88
- proxy(p)
104
+ proxy(proxy_ip)
89
105
  end
90
106
  end
91
107
 
92
108
  # 如果自动更新代理 则更新代理返回 true,否则返回false
93
- def update_proxy?(p = {})
109
+ def update_proxy?(proxy_ip = {})
94
110
  if auto_proxy
95
- if p.blank?
111
+ if proxy_ip.blank?
96
112
  proxy(get_proxy)
97
113
  else
98
- proxy(p)
114
+ proxy(proxy_ip)
99
115
  end
100
116
  return true
101
117
  else
@@ -118,6 +134,11 @@ module HttpCrawler
118
134
  when Net::HTTPRedirection then
119
135
  location = response['location']
120
136
  Rails.logger.warn "redirected to #{location}"
137
+ @error_urls.each do |url_string|
138
+ if location =~ /#{url_string}/
139
+ raise "跳转到异常url => #{location}"
140
+ end
141
+ end
121
142
  # 传入 location 进行跳转
122
143
  get_fetch(location, initheader, dest, limit - 1, &block)
123
144
  when Net::HTTPServerError then
@@ -152,6 +173,11 @@ module HttpCrawler
152
173
  when Net::HTTPRedirection then
153
174
  location = response['location']
154
175
  Rails.logger.warn "redirected to #{location}"
176
+ @error_urls.each do |url_string|
177
+ if location =~ /#{url_string}/
178
+ raise "跳转到异常url => #{location}"
179
+ end
180
+ end
155
181
  # 传入 location 进行跳转
156
182
  get_fetch(location, initheader, dest, 9, &block)
157
183
  when Net::HTTPServerError then
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.2.3.1"
2
+ VERSION = "0.2.3.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3.1
4
+ version: 0.2.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger