http_crawler 0.2.3.1 → 0.2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
-  data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
+  metadata.gz: c93afa1f523cd20478bf06f854f1588a2ec9e248
+  data.tar.gz: 55fcaf8ad7870b6fdf89a63731b1ea8814083cc1
 SHA512:
-  metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
-  data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
+  metadata.gz: c72785e4e5ce5a19d62f0371df23b7ad18e2446e50f89a02bff8cc8953e074713b0ad995b22b6c8e39adbffeef83783d889caa90b3de53bf701416f0e2035590
+  data.tar.gz: f092ae6bb1a5cfe704675b1cf8792a93a01dd9365486a89c5fd3663079019ecffb68931f2c25ccef6ee4356f735a2dc03c32539511f6309b20424650d35e0b99
@@ -46,6 +46,11 @@ module HttpCrawler
 
   end
 
+  # Register an error URL pattern: URLs matching entries in this list are
+  # treated as abnormal addresses; the entries are stored as regexes.
+  def add_error_url(url_string)
+    @http.error_urls << url_string
+  end
+
   # init_uri raises an error if @uri has not been initialized
   # Subclasses must implement @uri = URI("http://host")
   #
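The new add_error_url hook simply forwards a pattern string to the underlying HTTP object's error_urls list (added in the next hunk). A minimal usage sketch, following the init_uri contract quoted above; the base class name, subclass, constructor, and URLs here are illustrative assumptions, not part of the gem:

    # Hypothetical client subclass for illustration only.
    class ExampleClient < HttpCrawler::Client
      def init_uri
        @uri = URI("http://www.example.com")
      end
    end

    client = ExampleClient.new
    # Each string is kept as a regex source; a redirect whose Location
    # matches any of them raises instead of being followed.
    client.add_error_url("www\\.example\\.com/login")
    client.add_error_url("/sorry/banned")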
@@ -9,12 +9,15 @@ module HttpCrawler
   attr_accessor :proxy_key
   # Maximum number of retries after a failed request
   attr_accessor :max_error_num
+  # Error URL patterns, stored as regexes
+  attr_accessor :error_urls
 
   def initialize(address, port = nil)
     super(address, port)
     @max_error_num = 2
     @error_num = 0
     @proxy_key = "default"
+    @error_urls = []
   end
 
   def http_error_sleep
@@ -54,48 +57,61 @@ module HttpCrawler
   # Obtain a proxy by calling the API, or via a custom setting
   def get_proxy
 
-    while @@proxy_list.blank?
-      Rails.logger.debug("@@proxy_list is empty; updating")
+    # while @@proxy_list.blank?
+    #   Rails.logger.debug("@@proxy_list is empty; updating")
+    #   proxy_client = HttpCrawler::Proxy.for(proxy_api)
+    #   proxy_r = proxy_client.get_proxy(key: proxy_key)
+    #   @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
+    #   Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
+    #   sleep(1)
+    # end
+    # p = @@proxy_list.delete_at(0)
+
+    proxy_ip = nil
+    begin
+      Rails.logger.debug("Fetching a proxy IP")
       proxy_client = HttpCrawler::Proxy.for(proxy_api)
       proxy_r = proxy_client.get_proxy(key: proxy_key)
-      @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
-      Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
-      sleep(1)
-    end
-
-    p = @@proxy_list.delete_at(0)
+      proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
+      if proxy_ip.blank?
+        Rails.logger.warn "No fresh proxy; waiting 5 seconds before retrying"
+      else
+        break
+      end
+      sleep(5)
+    end while true
 
-    Rails.logger.debug("Current IP => #{@proxy_address}:#{@proxy_port}, latest proxy => #{p}")
+    Rails.logger.debug("Current IP => #{@proxy_address}:#{@proxy_port}, latest proxy => #{proxy_ip}")
 
-    unless p && p["p_addr"] && p["p_port"]
+    unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
       Rails.logger.warn "No fresh proxy; waiting 5 seconds before retrying"
       sleep(5)
-      p = get_proxy
+      proxy_ip = get_proxy
     end
 
-    if (@proxy_address == p["p_addr"] && @proxy_port == p["p_port"])
+    if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
       Rails.logger.warn "No fresh proxy; waiting 5 seconds before retrying"
       sleep(5)
-      p = get_proxy
+      proxy_ip = get_proxy
     end
-    p
+    proxy_ip
   end
 
-  def update_proxy(p = {})
-    if p.blank?
+  def update_proxy(proxy_ip = {})
+    if proxy_ip.blank?
       proxy(get_proxy)
     else
-      proxy(p)
+      proxy(proxy_ip)
     end
   end
 
   # If auto_proxy is enabled, update the proxy and return true; otherwise return false
-  def update_proxy?(p = {})
+  def update_proxy?(proxy_ip = {})
     if auto_proxy
-      if p.blank?
+      if proxy_ip.blank?
         proxy(get_proxy)
       else
-        proxy(p)
+        proxy(proxy_ip)
       end
       return true
     else
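The rewritten get_proxy drops the shared @@proxy_list queue and instead polls the proxy API directly, using Ruby's post-test begin ... end while true loop and escaping with break once a usable record arrives. A self-contained sketch of that control flow; the fake_api lambda is a stand-in for HttpCrawler::Proxy, and nil? replaces Rails' blank?:

    # Stand-in API: returns nil twice, then a proxy record.
    attempts = 0
    fake_api = -> { (attempts += 1) < 3 ? nil : { "p_addr" => "10.0.0.1", "p_port" => 8080 } }

    proxy_ip = nil
    begin
      proxy_ip = fake_api.call
      if proxy_ip.nil?
        puts "No fresh proxy; retrying"
      else
        break # leave the loop as soon as a proxy is available
      end
      sleep(0.1) # the gem waits 5 seconds here
    end while true

    p proxy_ip # => {"p_addr"=>"10.0.0.1", "p_port"=>8080}

Because begin ... end while runs its body before testing the condition, the API is always called at least once, and with a literal true condition the loop can only exit through break.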
@@ -118,6 +134,11 @@ module HttpCrawler
     when Net::HTTPRedirection then
       location = response['location']
       Rails.logger.warn "redirected to #{location}"
+      @error_urls.each do |url_string|
+        if location =~ /#{url_string}/
+          raise "Redirected to an error URL => #{location}"
+        end
+      end
       # Pass location along to follow the redirect
       get_fetch(location, initheader, dest, limit - 1, &block)
     when Net::HTTPServerError then
@@ -152,6 +173,11 @@ module HttpCrawler
     when Net::HTTPRedirection then
       location = response['location']
       Rails.logger.warn "redirected to #{location}"
+      @error_urls.each do |url_string|
+        if location =~ /#{url_string}/
+          raise "Redirected to an error URL => #{location}"
+        end
+      end
       # Pass location along to follow the redirect
       get_fetch(location, initheader, dest, 9, &block)
     when Net::HTTPServerError then
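Both fetch paths now screen the redirect target against error_urls before following it. Each stored string is interpolated into a regex, so it can match anywhere in the Location value. The matching rule in isolation, with invented values:

    error_urls = ["example\\.com/login", "/banned"]
    location = "http://example.com/login?from=/item/42"

    matched = error_urls.any? { |url_string| location =~ /#{url_string}/ }
    puts matched ? "would raise for #{location}" : "redirect allowed"

Since the entries are regex sources rather than literal substrings, metacharacters such as "." match any character; callers wanting literal matching can pre-escape patterns with Regexp.escape.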
@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.2.3.1"
+  VERSION = "0.2.3.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.2.3.1
+  version: 0.2.3.2
 platform: ruby
 authors:
 - jagger