RubyGems - http_crawler - Versions diffs - 0.3.0.3 → 0.3.0.4 - Mend

http_crawler 0.3.0.3 → 0.3.0.4

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/README.md +0 -2
data/lib/http_crawler.rb +9 -0
data/lib/http_crawler/client.rb +62 -20
data/lib/http_crawler/proxy.rb +0 -8
data/lib/http_crawler/proxy/client.rb +17 -2
data/lib/http_crawler/proxy/test_proxy_api/client.rb +1 -3
data/lib/http_crawler/version.rb +1 -1
data/lib/http_crawler/web/baidu/client.rb +1 -3
data/lib/http_crawler/web/client.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
-  data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
+  metadata.gz: e44504aa656dea432bd96e28cc1908aac5c6164d1aa5ab9399da5b05db77b5b8
+  data.tar.gz: df36de8464939d97436941534bd441f7ee83b60b12d83c6547e9ed986d109276
 SHA512:
-  metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
-  data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4
+  metadata.gz: a9baf0b81a3888c11d0b8d3a908fe805e9e498d32f6418aabb0f2c6392a39d9c354b74ce75e8c1673b2c2ea9b67305539b0a0c363905893e03be09050dfdb2be
+  data.tar.gz: ccaff3ba7029675ba6d6109b235b9a274f5f20638f9d59d37892cf179c24a0cbf8f2d49fdd69ea730471ca81677cd47f2080aac1fe73e404461dbd9a75a23c99

data/README.md CHANGED Viewed

@@ -45,11 +45,9 @@ client.index   # 抓取首页
 ```ruby
 client = HttpCrawler::Proxy::TestProxyApi::Client.new
-client.index  # 抓取首页
 ```
 ### 通过别名调用
 ```ruby
 client = HttpCrawler::Proxy.for("test_proxy_api") #
-client.index   # 抓取首页
 ```

data/lib/http_crawler.rb CHANGED Viewed

@@ -2,6 +2,15 @@ require 'json'
 require 'digest/md5'
 require 'nokogiri'
+# 此段代码用于解决 require_dependency 是 rails 的内置方法 必须要先引用 Rails的包才能用的bug
+class << self.class
+  def require_rename
+    # require 取别名 require_dependency
+    alias_method :require_dependency, :require
+  end
+end
+self.class.require_rename
 # 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
 require_dependency 'http_crawler/errors.rb'
 require_dependency 'http_crawler/common.rb'

data/lib/http_crawler/client.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require_dependency File.dirname(__FILE__) + '/http/response.rb'
 module HttpCrawler
-  module Client
+  class Client
     class << self
@@ -9,8 +9,8 @@ module HttpCrawler
       # web_name = "biquge_duquanben"
       # 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
       #
-      def for(web_name, *args)
-        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
+      def for(web_name)
+        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
       end
       #
@@ -19,7 +19,11 @@ module HttpCrawler
       # 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
       #
       def for_module(module_name, *args)
-        "#{module_name}::Client".constantize.new(*args)
+        "#{module_name}::Client".constantize.new()
+      end
+      def for_uri(path)
+        self.new(uri: path)
       end
     end
@@ -38,6 +42,25 @@ module HttpCrawler
       @uri = nil
     end
+    # 更新uri
+    def update_uri(uri_or_path)
+      case uri_or_path
+      when URI
+        @uri = uri_or_path
+      when String
+        if uri_or_path =~ /^http/
+          @uri = URI(uri_or_path)
+        else
+          @uri = @uri + uri_or_path
+        end
+      else
+        raise ArgumentError, uri_or_path
+      end
+      # 初始化 ssl 协议
+      self.init_ssl
+      self.uri
+    end
     # 初始化超时时间
     def init_timeout
       @connect_time = 5
@@ -55,23 +78,30 @@ module HttpCrawler
     end
     # 头文件相关方法
-    def header
+    def header(parameter = {})
       @header ||= init_header
     end
-    def init_header
-      nil
+    def init_header(parameter = {})
+      @header = {}
     end
     def update_header(parameter = {})
       nil
     end
-    # cookies
-    def cookies
-      @cookies ||= {}
+    # cookies相关方法
+    def cookies(parameter = {})
+      @cookies ||= init_cookies
+    end
+    def init_cookies(parameter = {})
+      @cookies = {}
     end
+    def update_cookies(parameter = {})
+      nil
+    end
     # 代理设置
     def auto_proxy=(value)
@@ -150,9 +180,9 @@ module HttpCrawler
     end
-    # 初始化http参数
+    # 初始化init_client参数
     def init_client
+      nil
     end
     # 初始化http请求前置条件
@@ -179,9 +209,15 @@ module HttpCrawler
     #  init_uri 如果未初始化@uri,则会报错
     #  继承类需要重定义 init_uri
     #
-    def initialize
+    def initialize(parameter = {})
       # 初始化 uri
-      raise "Client uri为空" unless init_uri
+      init_uri
+      # 如果自定义uri
+      if parameter[:uri]
+        raise "Client uri为重复初始化" if uri
+        update_uri(parameter[:uri])
+      end
       # 初始化超时时间
       init_timeout
@@ -198,12 +234,20 @@ module HttpCrawler
     # 发送 get 请求
     def get(path, params = {})
-      request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
+      raise "Client uri为空" unless self.uri
+      request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
+    end
+    # 直接发送uri的get请求
+    def get_uri
+      raise "Client uri为空" unless self.uri
+      request {http.get(self.uri.to_s, :ssl_context => @ctx)}
     end
     # 发送 post 请求
     def post(path, params = {})
-      request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
+      raise "Client uri为空" unless self.uri
+      request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
     end
     # 请求的响应
@@ -233,7 +277,7 @@ module HttpCrawler
       begin
         block.call
       rescue => error
+        Rails.logger.debug error.class
         case error
         when HTTP::TimeoutError
           # 超时错误切换代理
@@ -242,7 +286,6 @@ module HttpCrawler
           else
             raise error
           end
         else
           # 错误尝试次数
           if n <= 0
@@ -252,9 +295,8 @@ module HttpCrawler
             retry
           end
         end
       end
-    end
+    end # def request(&block)
   end
 end

data/lib/http_crawler/proxy.rb CHANGED Viewed

@@ -1,7 +1,5 @@
 module HttpCrawler
   module Proxy
-    include(HttpCrawler::Client)
     class << self
       # 接收格式
@@ -11,13 +9,7 @@ module HttpCrawler
       def for(web_name, *arg)
         "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
       end
     end
-    def max_error_num
-      @max_error_num ||= 0
-    end
   end
 end

data/lib/http_crawler/proxy/client.rb CHANGED Viewed

@@ -1,7 +1,22 @@
 module HttpCrawler
   module Proxy
-    module Client
+    class Client < HttpCrawler::Client
+      class << self
+        # 接收格式
+        # web_name = "test_proxy_api"
+        # 返回 HttpCrawler::Proxy::TestProxyApi::Client 实例
+        #
+        def for(web_name, *arg)
+          "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
+        end
+      end
+      def max_error_num
+        @max_error_num ||= 0
+      end
     end
   end

data/lib/http_crawler/proxy/test_proxy_api/client.rb CHANGED Viewed

@@ -2,9 +2,7 @@
 module HttpCrawler
   module Proxy
     module TestProxyApi
-      class Client
-        include(HttpCrawler::Proxy::Client)
+      class Client < HttpCrawler::Proxy::Client

data/lib/http_crawler/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.3.0.3"
+  VERSION = "0.3.0.4"
 end

data/lib/http_crawler/web/baidu/client.rb CHANGED Viewed

@@ -2,9 +2,7 @@
 module HttpCrawler
   module Web
     module Baidu
-      class Client
-        include(HttpCrawler::Client)
+      class Client < HttpCrawler::Web::Client
         def init_uri
           @uri = URI("https://www.baidu.com")

data/lib/http_crawler/web/client.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module HttpCrawler
   module Web
-    module Client
+    class Client < HttpCrawler::Client
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.3
+  version: 0.3.0.4
 platform: ruby
 authors:
 - jagger