RubyGems - http_crawler - Versions diffs - 0.2.3.1 → 0.2.3.2 - Mend

http_crawler 0.2.3.1 → 0.2.3.2

This diff shows the differences between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
-  data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
+  metadata.gz: c93afa1f523cd20478bf06f854f1588a2ec9e248
+  data.tar.gz: 55fcaf8ad7870b6fdf89a63731b1ea8814083cc1
 SHA512:
-  metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
-  data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
+  metadata.gz: c72785e4e5ce5a19d62f0371df23b7ad18e2446e50f89a02bff8cc8953e074713b0ad995b22b6c8e39adbffeef83783d889caa90b3de53bf701416f0e2035590
+  data.tar.gz: f092ae6bb1a5cfe704675b1cf8792a93a01dd9365486a89c5fd3663079019ecffb68931f2c25ccef6ee4356f735a2dc03c32539511f6309b20424650d35e0b99

data/lib/http_crawler/client.rb CHANGED Viewed

@@ -46,6 +46,11 @@ module HttpCrawler
     end
+    # 添加错误的url地址，表示这里面的url都是异常地址，存的是正则
+    def add_error_url(url_string)
+      @http.error_urls << url_string
+    end
     #  init_uri 如果未初始化@uri,则会报错
     #  继承类需要实现 @uri = URI("http://host")
     #

data/lib/http_crawler/http.rb CHANGED Viewed

@@ -9,12 +9,15 @@ module HttpCrawler
     attr_accessor :proxy_key
     # 请求错误后的重复最大请求次数
     attr_accessor :max_error_num
+    # 错误的url地址，存的是正则
+    attr_accessor :error_urls
     def initialize(address, port = nil)
       super(address, port)
       @max_error_num = 2
       @error_num = 0
       @proxy_key = "default"
+      @error_urls = []
     end
     def http_error_sleep
@@ -54,48 +57,61 @@ module HttpCrawler
     # 通过调用 api 获取代理或者通过自定义设置代理
     def get_proxy
-      while @@proxy_list.blank?
-        Rails.logger.debug("@@proxy_list 为空进行更新")
+      # while @@proxy_list.blank?
+      #   Rails.logger.debug("@@proxy_list 为空进行更新")
+      #   proxy_client = HttpCrawler::Proxy.for(proxy_api)
+      #   proxy_r = proxy_client.get_proxy(key: proxy_key)
+      #   @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
+      #   Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
+      #   sleep(1)
+      # end
+      # p = @@proxy_list.delete_at(0)
+      proxy_ip = nil
+      begin
+        Rails.logger.debug("开始获取代理IP")
         proxy_client = HttpCrawler::Proxy.for(proxy_api)
         proxy_r = proxy_client.get_proxy(key: proxy_key)
-        @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
-        Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
-        sleep(1)
-      end
-      p = @@proxy_list.delete_at(0)
+        proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
+        if proxy_ip.blank?
+          Rails.logger.warn "无最新代理等待5秒后重新获取"
+        else
+          break
+        end
+        sleep(5)
+      end while true
-      Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{p}")
+      Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
-      unless p && p["p_addr"] && p["p_port"]
+      unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
         Rails.logger.warn "无最新代理等待5秒后重新获取"
         sleep(5)
-        p = get_proxy
+        proxy_ip = get_proxy
       end
-      if (@proxy_address == p["p_addr"] && @proxy_port == p["p_port"])
+      if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
         Rails.logger.warn "无最新代理等待5秒后重新获取"
         sleep(5)
-        p = get_proxy
+        proxy_ip = get_proxy
       end
-      p
+      proxy_ip
     end
-    def update_proxy(p = {})
-      if p.blank?
+    def update_proxy(proxy_ip = {})
+      if proxy_ip.blank?
         proxy(get_proxy)
       else
-        proxy(p)
+        proxy(proxy_ip)
       end
     end
     # 如果自动更新代理 则更新代理返回 true，否则返回false
-    def update_proxy?(p = {})
+    def update_proxy?(proxy_ip = {})
       if auto_proxy
-        if p.blank?
+        if proxy_ip.blank?
           proxy(get_proxy)
         else
-          proxy(p)
+          proxy(proxy_ip)
         end
         return true
       else
@@ -118,6 +134,11 @@ module HttpCrawler
       when Net::HTTPRedirection then
         location = response['location']
         Rails.logger.warn "redirected to #{location}"
+        @error_urls.each do |url_string|
+          if location =~ /#{url_string}/
+            raise "跳转到异常url => #{location}"
+          end
+        end
         # 传入 location 进行跳转
         get_fetch(location, initheader, dest, limit - 1, &block)
       when Net::HTTPServerError then
@@ -152,6 +173,11 @@ module HttpCrawler
       when Net::HTTPRedirection then
         location = response['location']
         Rails.logger.warn "redirected to #{location}"
+        @error_urls.each do |url_string|
+          if location =~ /#{url_string}/
+            raise "跳转到异常url => #{location}"
+          end
+        end
         # 传入 location 进行跳转
         get_fetch(location, initheader, dest, 9, &block)
       when Net::HTTPServerError then

data/lib/http_crawler/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.2.3.1"
+  VERSION = "0.2.3.2"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.2.3.1
+  version: 0.2.3.2
 platform: ruby
 authors:
 - jagger