RubyGems - http_crawler - Versions diffs - 0.3.0.2 → 0.3.0.3 - Mend

http_crawler 0.3.0.2 → 0.3.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/.idea/http_crawler.iml +13 -2
data/lib/http_crawler/client.rb +4 -4
data/lib/http_crawler/common.rb +2 -2
data/lib/http_crawler/http/response.rb +59 -3
data/lib/http_crawler/proxy.rb +1 -1
data/lib/http_crawler/version.rb +1 -1
data/lib/http_crawler/web.rb +1 -1
data/lib/http_crawler.rb +6 -9
metadata +2 -3
data/lib/http_crawler/http.rb +0 -260

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e47fc7ceac8e7335c7d873104a8ca7f504885af1c19a0802d23c1986d4ae5588
-  data.tar.gz: 392e793eae03814c1f3475e7515124d51b8adcdffdec9065873a90c800765225
+  metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
+  data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
 SHA512:
-  metadata.gz: fb7ba4091d7320d1fcbb3926edb060fd55155156c34cf42b7ea1b67e1b8eba3c0cdf317a2f53d8094dee3672a17058dd57f688da6a89b4f86cfcdedad5bda42f
-  data.tar.gz: cd6001c16fbbff9023fe26c739fe270c62176849d3a4809d7bfa1aff4dd74856b6a8db95297c312d4fe56334dc7c8f04772d6eeb8a97f8d2de9a9df841c8a2ab
+  metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
+  data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4

data/.idea/http_crawler.iml CHANGED Viewed

@@ -5,8 +5,19 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="RVM: ruby-2.4.1 [rails5.1.6]" jdkType="RUBY_SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="brotli (v0.2.2, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.3, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="guess_html_encoding (v0.0.11, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="mini_portile2 (v2.4.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.10.1, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rchardet (v1.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rspec (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rspec-core (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rspec-mocks (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rspec-support (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="ruby-readability (v0.7.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
   </component>
 </module>

data/lib/http_crawler/client.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-load File.dirname(__FILE__) + '/http/response.rb'
+require_dependency File.dirname(__FILE__) + '/http/response.rb'
 module HttpCrawler
   module Client
@@ -40,9 +40,9 @@ module HttpCrawler
     # 初始化超时时间
     def init_timeout
-      @connect_time = 3
-      @write_time = 3
-      @read_time = 3
+      @connect_time = 5
+      @write_time = 5
+      @read_time = 5
     end
     # 初始化 ssl 协议

data/lib/http_crawler/common.rb CHANGED Viewed

@@ -1,2 +1,2 @@
-load File.dirname(__FILE__) + '/common/object.rb'
-load File.dirname(__FILE__) + '/common/string.rb'
+require_dependency File.dirname(__FILE__) + '/common/object.rb'
+require_dependency File.dirname(__FILE__) + '/common/string.rb'

data/lib/http_crawler/http/response.rb CHANGED Viewed

@@ -1,11 +1,63 @@
 module HTTP
   class Response
     # 解压并转码 body 数据
     def decoding_body
-      @decoding_body ||= self.body.to_s
+      return @decoding_body if @decoding_body
+      return nil unless self.body
+      # 数据解压
+      case self.headers['Content-Encoding']
+      when 'gzip' then
+        sio = StringIO.new(self.body.to_s)
+        gz = Zlib::GzipReader.new(sio)
+        @decoding_body = gz.read()
+      when 'br'
+        @decoding_body = Brotli.inflate(self.body.to_s)
+        # when 'deflate'
+        #   # 可能错误代码 暂时没解决 deflate 编码格式
+        #   @decoding_body = Zlib::Inflate.inflate(self.body.to_s)
+      else
+        @decoding_body = self.body.to_s
+      end
+      # @decoding_body = self.body.to_s
+      # 判断解压后数据编码格式
+      # 从header取编码格式
+      encoding = self.headers['Content-Type'][/charset=([^, ;"]*)/, 1] if self.headers['Content-Type']
+      # 从html中的 charset 取编码格式
+      # 不能使用，因为 decoding_body 还未转码，直接使用可能报错: ArgumentError: invalid byte sequence in UTF-8
+      # encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
+      # 通过 CharDet 判断编码格式
+      encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
+      # 进行转码
+      begin
+        @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
+      rescue => e
+        # 转码错误后再次使用 CharDet 判断编码格式后进行转码
+        cd = CharDet.detect(@decoding_body)["encoding"]
+        if (cd && cd != encoding)
+          @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
+        else
+          # 还是转码错误则抛出异常
+          Rails.logger.debug "encoding => #{encoding}"
+          Rails.logger.debug "cd => #{cd}"
+          Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
+          raise e
+        end
+      end
     end
+    alias_method :dec, :decoding_body
     #  def decoding_body
     def html
@@ -19,13 +71,17 @@ module HTTP
     end
     # 通过readability 解析数据
+    # [Readability::Document]
     def readability
       @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
     end
+    def content
+      Nokogiri::HTML(readability.content).text
+    end
     # 解析
     def parsing
-      json
+      self.json
     end
     # 获取解析结果
@@ -63,7 +119,7 @@ module HTTP
         regular_num = decoding_body =~ regular
         if regular_num
           Rails.logger.warn("触发验证信息")
-          Rails.logger.warn(decoding_body[regular_num..(regular_num + 100)])
+          Rails.logger.warn(decoding_body[regular_num..(regular_num + 300)])
           return true
         end
       end

data/lib/http_crawler/proxy.rb CHANGED Viewed

@@ -21,4 +21,4 @@ module HttpCrawler
   end
 end
-load File.dirname(__FILE__) + '/proxy/client.rb'
+require_dependency File.dirname(__FILE__) + '/proxy/client.rb'

data/lib/http_crawler/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.3.0.2"
+  VERSION = "0.3.0.3"
 end

data/lib/http_crawler/web.rb CHANGED Viewed

@@ -5,4 +5,4 @@ module HttpCrawler
   end
 end
-load File.dirname(__FILE__) + '/web/client.rb'
+require_dependency File.dirname(__FILE__) + '/web/client.rb'

data/lib/http_crawler.rb CHANGED Viewed

@@ -1,17 +1,14 @@
-require 'net/http'
 require 'json'
 require 'digest/md5'
 require 'nokogiri'
-require 'http_crawler/errors.rb'
-load 'http_crawler/common.rb'
-load 'http_crawler/client.rb'
-load 'http_crawler/web.rb'
-load 'http_crawler/proxy.rb'
+# 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
+require_dependency 'http_crawler/errors.rb'
+require_dependency 'http_crawler/common.rb'
+require_dependency 'http_crawler/client.rb'
+require_dependency 'http_crawler/web.rb'
+require_dependency 'http_crawler/proxy.rb'
 module HttpCrawler
   # Your code goes here...
-  def self.a
-    puts "112"
-  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.2
+  version: 0.3.0.3
 platform: ruby
 authors:
 - jagger
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-02-19 00:00:00.000000000 Z
+date: 2019-02-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -137,7 +137,6 @@ files:
 - lib/http_crawler/common/object.rb
 - lib/http_crawler/common/string.rb
 - lib/http_crawler/errors.rb
-- lib/http_crawler/http.rb
 - lib/http_crawler/http/response.rb
 - lib/http_crawler/proxy.rb
 - lib/http_crawler/proxy/README.md

data/lib/http_crawler/http.rb DELETED Viewed

@@ -1,260 +0,0 @@
-module HttpCrawler
-  class HTTP < Net::HTTP
-    # 自动获取代理，true 表示自动获取代理 、false 表示不自动获取
-    attr_accessor :auto_proxy
-    # 代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API
-    attr_accessor :proxy_api
-    def proxy_api
-      @proxy_api ||= "my"
-    end
-    # 调用自己的代理池所需要的主键 key
-    attr_accessor :proxy_key
-    # 请求错误后的重复最大请求次数
-    attr_accessor :max_error_num
-    # 错误的url地址，存的是正则
-    attr_accessor :error_urls
-    def initialize(address, port = nil)
-      super(address, port)
-      @max_error_num = 2
-      @error_num = 0
-      @proxy_key = "default"
-      @error_urls = []
-    end
-    def http_error_sleep
-      sleep(0.5)
-    end
-    def server_error_sleep
-      sleep(3)
-    end
-    @@proxy_list = []
-    # 为 @http 重设代理
-    def proxy(p = {})
-      raise '代理设置 p_addr 不能为空' unless p["p_addr"]
-      raise '代理设置 p_port 不能为空' unless p["p_port"]
-      p["p_user"] ||= nil
-      p["p_pass"] ||= nil
-      Rails.logger.info("切换代理至 => #{p}")
-      # 设为 false 否则不会启用代理
-      @proxy_from_env = false
-      # 初始化代理数据
-      @proxy_address = p["p_addr"]
-      @proxy_port = p["p_port"]
-      @proxy_user = p["p_user"]
-      @proxy_pass = p["p_pass"]
-    end
-    # 通过调用 api 获取代理或者通过自定义设置代理
-    def get_proxy
-      # while @@proxy_list.blank?
-      #   Rails.logger.debug("@@proxy_list 为空进行更新")
-      #   proxy_client = HttpCrawler::Proxy.for(proxy_api)
-      #   proxy_r = proxy_client.get_proxy(key: proxy_key)
-      #   @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
-      #   Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
-      #   sleep(1)
-      # end
-      # p = @@proxy_list.delete_at(0)
-      proxy_ip = nil
-      begin
-        Rails.logger.debug("开始获取代理IP")
-        proxy_client = HttpCrawler::Proxy.for(proxy_api)
-        proxy_r = proxy_client.get_proxy(key: proxy_key)
-        proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
-        if proxy_ip.blank?
-          Rails.logger.warn "无最新代理等待5秒后重新获取"
-        else
-          break
-        end
-        sleep(5)
-      end while true
-      Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
-      unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
-        Rails.logger.warn "无最新代理等待5秒后重新获取"
-        sleep(5)
-        proxy_ip = get_proxy
-      end
-      if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
-        Rails.logger.warn "无最新代理等待5秒后重新获取"
-        sleep(5)
-        proxy_ip = get_proxy
-      end
-      proxy_ip
-    end
-    def update_proxy(proxy_ip = {})
-      if proxy_ip.blank?
-        proxy(get_proxy)
-      else
-        proxy(proxy_ip)
-      end
-    end
-    # 如果自动更新代理 则更新代理返回 true，否则返回false
-    def update_proxy?(proxy_ip = {})
-      if auto_proxy
-        if proxy_ip.blank?
-          proxy(get_proxy)
-        else
-          proxy(proxy_ip)
-        end
-        return true
-      else
-        return false
-      end
-    end
-    # 重定向请求
-    def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
-      # You should choose a better exception.
-      raise ArgumentError, 'too many HTTP repeated' if limit == 0
-      # 更新uri_or_path
-      uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
-      response = get(uri_or_path, initheader, dest, &block)
-      case response
-      when Net::HTTPSuccess then
-        response
-      when Net::HTTPRedirection then
-        location = response['location']
-        Rails.logger.warn "redirected to #{location}"
-        @error_urls.each do |url_string|
-          if location =~ /#{url_string}/
-            raise "跳转到异常url => #{location}"
-          end
-        end
-        # 传入 location 进行跳转
-        get_fetch(location, initheader, dest, limit - 1, &block)
-      when Net::HTTPServerError then
-        Rails.logger.warn "Net::HTTPServerError  5XX to #{address}"
-        server_error_sleep
-        # 重新请求
-        get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
-      when Net::HTTPProxyAuthenticationRequired then
-        Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
-        if update_proxy?
-          server_error_sleep
-          # 重新请求
-          get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
-        else
-          response.error!
-        end
-      else
-        Rails.logger.debug uri_or_path
-        Rails.logger.debug initheader
-        Rails.logger.debug response.body
-        response.error!
-      end
-    end
-    # 重定向请求
-    def post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block)
-      # 更新uri_or_path 如果 uri_or_path 是 String类型 同时 又不是 ascii编码格式就进行转码
-      uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
-      # Rails.logger.debug "post_fetch => #{uri_or_path}"
-      response = post(uri_or_path, data, initheader, dest, &block)
-      case response
-      when Net::HTTPSuccess then
-        response
-      when Net::HTTPRedirection then
-        location = response['location']
-        Rails.logger.warn "redirected to #{location}"
-        @error_urls.each do |url_string|
-          if location =~ /#{url_string}/
-            raise "跳转到异常url => #{location}"
-          end
-        end
-        # 传入 location 进行跳转
-        get_fetch(location, initheader, dest, 9, &block)
-      when Net::HTTPServerError then
-        Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
-        server_error_sleep
-        # 重新请求
-        post_fetch(uri_or_path, initheader, dest, &block)
-      when Net::HTTPProxyAuthenticationRequired then
-        Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
-        if update_proxy?
-          server_error_sleep
-          # 重新请求
-          post_fetch(uri_or_path, initheader, dest, &block)
-        else
-          response.error!
-        end
-      else
-        server_error_sleep
-        response.error!
-      end
-    end
-    # def post_fetch
-    #
-    # 重写 发送请求的方法
-    #
-    def request(req, body = nil, &block)
-      begin
-        Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
-        Rails.logger.debug("body => #{body}") if started? && body
-        super(req, body, &block)
-      rescue => error
-        Rails.logger.error "出错了! 错误类型 => #{error.class}"
-        if started?
-          # started? 是为了判断是否结束http请求，如果不添加则会处理2次异常
-          Rails.logger.error("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
-          Rails.logger.error("body => #{body}") if body
-          raise error
-        else
-          http_error_sleep
-          # 最大错误尝试次数
-          if @error_num < @max_error_num
-            @error_num += 1
-            retry # 这将把控制移到 begin 的开头
-          else
-            # 超过最大错误限制 判断错误类型
-            case error
-            when EOFError
-              Rails.logger.warn "EOFError!"
-            when Timeout::Error
-              Rails.logger.warn "请求超时!"
-            when Net::HTTPServerException
-              Rails.logger.warn "代理失效:[#{proxy_address}:#{proxy_port}]"
-            when Errno::ECONNREFUSED
-              Rails.logger.warn "Errno::ECONNREFUSED"
-            else
-              raise error
-            end
-            if update_proxy?
-              @error_num = 0
-              retry # 这将把控制移到 begin 的开头
-            else
-              raise error
-            end
-          end
-        end
-      end # begin
-    end # def request(req, body = nil, &block)
-  end
-end
-load File.dirname(__FILE__) + '/http/response.rb'