http_crawler 0.3.0.0 → 0.3.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/http_crawler.gemspec +1 -1
- data/lib/http_crawler/client.rb +63 -14
- data/lib/http_crawler/http/response.rb +22 -1
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +0 -1
- data/lib/http_crawler/proxy.rb +4 -2
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler.rb +3 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d3f0a5acf0654b1ccba309c40a7153866850ce8f5be508e06cf587f23cf8a1d
|
4
|
+
data.tar.gz: 38ca43bafdecec27eb57078de5b07dcc07a0d30b75807a44895eb7441c0860ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98d2057ce312a8beef8508ef411f2e1d3b7f65c0588c19f3dffc1a71f1c85a259e812e8f7fb48b66f0b25120a012eccafaba096e4d1ca1647e3e7fdb790fd6d8
|
7
|
+
data.tar.gz: 6f8b27afbf39767e1484ff62247e007fb666a8d49d6893968be85a93925b8b2f7207ef06e7843902d08d270af3041320f9b70e4f38248108841c4d1be3359b6c
|
data/http_crawler.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["1336098842@qq.com"]
|
11
11
|
|
12
12
|
spec.summary = %q{http 爬虫。}
|
13
|
-
spec.description = %q{初级开发工程师,基于
|
13
|
+
spec.description = %q{初级开发工程师,基于 http 写的爬虫扩展包。}
|
14
14
|
spec.homepage = "https://rubygems.org/gems/http_crawler"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
data/lib/http_crawler/client.rb
CHANGED
@@ -23,8 +23,14 @@ module HttpCrawler
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
attr_reader :uri
|
27
26
|
|
27
|
+
attr_accessor :max_error_num
|
28
|
+
# 最大错误重试次数
|
29
|
+
def max_error_num
|
30
|
+
@max_error_num ||= 1
|
31
|
+
end
|
32
|
+
|
33
|
+
attr_reader :uri
|
28
34
|
# init_uri 如果未初始化@uri,则会报错
|
29
35
|
# 继承类需要实现 @uri = URI("http://host")
|
30
36
|
#
|
@@ -34,9 +40,9 @@ module HttpCrawler
|
|
34
40
|
|
35
41
|
# 初始化超时时间
|
36
42
|
def init_timeout
|
37
|
-
@connect_time =
|
38
|
-
@write_time =
|
39
|
-
@read_time =
|
43
|
+
@connect_time = 3
|
44
|
+
@write_time = 3
|
45
|
+
@read_time = 3
|
40
46
|
end
|
41
47
|
|
42
48
|
# 初始化 ssl 协议
|
@@ -169,16 +175,6 @@ module HttpCrawler
|
|
169
175
|
h
|
170
176
|
end
|
171
177
|
|
172
|
-
# 发送 get 请求
|
173
|
-
def get(path, params = {})
|
174
|
-
http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
|
175
|
-
end
|
176
|
-
|
177
|
-
# 发送 post 请求
|
178
|
-
def post(path, params = {})
|
179
|
-
http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
|
180
|
-
end
|
181
|
-
|
182
178
|
#
|
183
179
|
# init_uri 如果未初始化@uri,则会报错
|
184
180
|
# 继承类需要重定义 init_uri
|
@@ -200,6 +196,59 @@ module HttpCrawler
|
|
200
196
|
@proxy_params = {key: "#{self.class}"}
|
201
197
|
end
|
202
198
|
|
199
|
+
# 发送 get 请求
|
200
|
+
def get(path, params = {})
|
201
|
+
request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
|
202
|
+
end
|
203
|
+
|
204
|
+
# 发送 post 请求
|
205
|
+
def post(path, params = {})
|
206
|
+
request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
|
207
|
+
end
|
208
|
+
|
209
|
+
# 请求的响应
|
210
|
+
attr_accessor :response
|
211
|
+
protected :response=
|
212
|
+
|
213
|
+
# 出现如果验证码,切换代理
|
214
|
+
def validation_to_proxy?(r = response)
|
215
|
+
# 判断是否出现验证码
|
216
|
+
if r.validation_page?
|
217
|
+
# 触发验证码切换代理
|
218
|
+
self.update_proxy?
|
219
|
+
# 成功处理
|
220
|
+
return true
|
221
|
+
else
|
222
|
+
return false
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
protected
|
227
|
+
|
228
|
+
|
229
|
+
# 发送请求
|
230
|
+
def request(&block)
|
231
|
+
raise "必须定义块" unless block_given?
|
232
|
+
n = max_error_num
|
233
|
+
begin
|
234
|
+
block.call
|
235
|
+
rescue HTTP::TimeoutError
|
236
|
+
# 超时错误切换代理
|
237
|
+
if self.update_proxy?
|
238
|
+
retry
|
239
|
+
else
|
240
|
+
raise error
|
241
|
+
end
|
242
|
+
rescue => error
|
243
|
+
# 错误尝试次数
|
244
|
+
if n <= 0
|
245
|
+
raise error
|
246
|
+
else
|
247
|
+
n -= 1
|
248
|
+
retry
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
203
252
|
end
|
204
253
|
end
|
205
254
|
|
@@ -3,7 +3,7 @@ module HTTP
|
|
3
3
|
|
4
4
|
# 解压并转码 body 数据
|
5
5
|
def decoding_body
|
6
|
-
@decoding_body ||= self.to_s
|
6
|
+
@decoding_body ||= self.body.to_s
|
7
7
|
end
|
8
8
|
|
9
9
|
# def decoding_body
|
@@ -48,5 +48,26 @@ module HTTP
|
|
48
48
|
return time
|
49
49
|
end
|
50
50
|
|
51
|
+
|
52
|
+
# 验证码判断
|
53
|
+
attr_accessor :validations
|
54
|
+
|
55
|
+
def validations
|
56
|
+
@validations ||= []
|
57
|
+
end
|
58
|
+
|
59
|
+
# 是否验证码界面
|
60
|
+
def validation_page?
|
61
|
+
# 正则匹配数组 validations 的所有匹配值
|
62
|
+
validations.each do |regular|
|
63
|
+
if decoding_body[regular]
|
64
|
+
Rails.logger.warn("触发验证信息")
|
65
|
+
Rails.logger.warn(decoding_body[(decoding_body =~ regular)..100])
|
66
|
+
return true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
return false
|
70
|
+
end
|
71
|
+
|
51
72
|
end # class Net::HTTPResponse
|
52
73
|
end
|
data/lib/http_crawler/proxy.rb
CHANGED
data/lib/http_crawler/version.rb
CHANGED
data/lib/http_crawler.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.0
|
4
|
+
version: 0.3.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -108,7 +108,7 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: 0.2.1
|
111
|
-
description: 初级开发工程师,基于
|
111
|
+
description: 初级开发工程师,基于 http 写的爬虫扩展包。
|
112
112
|
email:
|
113
113
|
- 1336098842@qq.com
|
114
114
|
executables: []
|