RubyGems - list_spider - Versions diffs - 1.0.0 → 2.0.0 - Mend

list_spider 1.0.0 → 2.0.0

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
-  data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+  metadata.gz: 837d9e4cb2b3aa829466cf9eaa4f48a24b5d4ff5067bbc27fb67fbdb37eec291
+  data.tar.gz: 8d378b9e3240b8d9c3bdc9c7e32aceb39a16fc63310224dc7ce6a68a2c570893
 SHA512:
-  metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
-  data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+  metadata.gz: dd2c77aa71d8ff3d7ecba93fc6e30ec158b479dcffed9e3cc744944e2bcea3cb5425fc59f85acc22573bcbe3d1eb9a0967e7d0b1e11d3c9cb8d04a58450a0a7e
+  data.tar.gz: ec0e3ac5b2a09a3986eea20c69efc31c9536d1d96f77507e50755bfa07531c4bf7303317bc657a573dd8347bd304d8e93c9adbabf868918f0bdbe56c480e82e6

data/README.md CHANGED Viewed

@@ -86,9 +86,9 @@ def parse_response(file_name)
 end
-# extra_data is passed by TaskStruct's extra_data param
+# custom_data is passed by TaskStruct's custom_data param
-def parse_response(file_name, extra_data)
+def parse_response(file_name, custom_data)
   #...
 end
@@ -99,7 +99,7 @@ end
 # response_header.cookie
 # response_header['Last-Modified']
-def parse_response(file_name, extra_data, response_header)
+def parse_response(file_name, custom_data, response_header)
   response_header.status
   response_header['Last-Modified']
@@ -113,7 +113,7 @@ end
 # req.uri
 # req.host
 # req.port
-def parse_response(file_name, extra_data, response_header, req)
+def parse_response(file_name, custom_data, response_header, req)
   puts req.body
   puts req.headers
   puts req.uri
@@ -128,7 +128,7 @@ end
 ## And there are many options you can use
 ```ruby
-TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
 ```
 ```ruby

data/lib/file_filter.rb CHANGED Viewed

@@ -2,7 +2,8 @@
 class FileFilter
   # 4033
   # 920
-  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+  def initialize(dir_pattern, size_threshold: 1000,
+                 cust_judge: nil, process_block: nil)
     @dir_pattern = dir_pattern
     @size_threshold = size_threshold
     @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
     ).start
   end
-  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+                             size_threshold: 1000, cust_judge: nil)
     result_file = File.open(save_file_name, 'wt')
     FileFilter.new(
       dir_pattern,

data/lib/list_spider/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ListSpider
-  VERSION = '1.0.0'.freeze
+  VERSION = '2.0.0'.freeze
 end

data/lib/list_spider.rb CHANGED Viewed

@@ -8,22 +8,98 @@ require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../file_filter', __FILE__)
 class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+  def initialize(href, # 请求链接
+                 local_path, # 保存数据的本地路径（此路径作为去重标准）
+                 # http方法，取值：:get, :head, :delete, :put, :post, :patch, :options
+                 http_method: :get,
+                 custom_data: nil, # 自定义数据
+                 parse_method: nil, # 解析保存文件的回调，参数是TaskStruct对象本身
+                 # 请求成功后的回调，此时可能没有保存文件，比如301，
+                 # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
+                 # http.response_header.status 状态码
+                 # http.response_header  返回头
+                 # http.response 返回体
+                 callback: nil,
+                 # 请求失败后的回调
+                 # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
+                 errback: nil,
+                 stream_callback: nil, # 流数据处理回调
+                 convert_to_utf8: false, # 是否转换为utf8编码
+                 overwrite_exist: false, # 是否覆盖现有文件
+                 # request options
+                 redirects: 3, # 重定向次数
+                 #  keepalive: nil, # （暂不支持）
+                 file: nil, # 要上传的文件路径
+                 #  path: nil, # 请求路径，在流水线方式请求时有用（暂不支持）
+                 query: nil, # 查询字符串，可以是string或hash类型
+                 body: nil, # 请求体，可以是string或hash类型
+                 head: nil, # 请求头
+                 # connection options
+                 connect_timeout: 60, # 连接超时时间
+                 inactivity_timeout: nil, # 连接后超时时间
+                 # ssl设置
+                 # ssl: {
+                 #     :private_key_file => '/tmp/server.key',
+                 #     :cert_chain_file => '/tmp/server.crt',
+                 #     :verify_peer => false
+                 # }
+                 ssl: nil,
+                 # bind: {
+                 #     :host => '123.123.123.123',   # use a specific interface for outbound request
+                 #     :port => '123'
+                 # }
+                 bind: nil,
+                 # 代理设置
+                 # proxy: {
+                 #     :host => '127.0.0.1',    # proxy address
+                 #     :port => 9000,           # proxy port
+                 #     :type => :socks5         # default proxy mode is HTTP proxy, change to :socks5 if required
+                 #     :authorization => ['user', 'pass']  # proxy authorization header
+                 # }
+                 proxy: nil)
     @href = href
-    @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
     @local_path = local_path
     @http_method = http_method
-    @params = params
-    @extra_data = extra_data
+    @custom_data = custom_data
     @parse_method = parse_method
-    @header = header
+    @callback = callback
+    @errback = errback
+    @stream_callback = stream_callback
+    @convert_to_utf8 = convert_to_utf8
+    @overwrite_exist = overwrite_exist
+    @request_options = {
+      redirects: redirects,
+      # keepalive: keepalive,
+      file: file,
+      # path: path,
+      query: query,
+      body: body,
+      head: head
+    }.compact
+    @connection_options = {
+      connect_timeout: connect_timeout,
+      inactivity_timeout: inactivity_timeout,
+      ssl: ssl,
+      bind: bind,
+      proxy: proxy
+    }.compact
   end
-  def ==(other)
-    other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
-  end
-  attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+  attr_accessor :href, :local_path,
+                :http_method,
+                :custom_data,
+                :request_object,
+                :parse_method,
+                :callback,
+                :errback,
+                :stream_callback,
+                :convert_to_utf8,
+                :overwrite_exist,
+                :request_options,
+                :connection_options
 end
 module ListSpider
@@ -33,33 +109,9 @@ module ListSpider
   DEFAULT_INTERVAL = 0
   @random_time_range = 3..10
-  @convert_to_utf8 = false
-  @connection_opts = { connect_timeout: 60 }
-  @overwrite_exist = false
-  @max_redirects = 10
   @local_path_set = Set.new
   class << self
-    attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @connection_opts = {
-        proxy: {
-          host: proxy_addr,
-          port: proxy_port
-        }
-      }
-      @connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-    def connect_timeout(max_connect_time)
-      @connection_opts[:connect_timeout] = max_connect_time
-    end
-    def set_header_option(header_option)
-      @header_option = header_option
-    end
     def event_machine_down(link_struct_list, callback = nil)
       failed_list = []
       succeed_list = []
@@ -67,78 +119,65 @@ module ListSpider
       begin_time = Time.now
       for_each_proc =
-        proc do |e|
-          opt = { redirects: @max_redirects }
-          if e.header
-            opt[:head] = e.header
-          elsif defined? @header_option
-            opt[:head] = @header_option
-          end
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            w =
-              if @connection_opts
-                EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
-              else
-                EventMachine::HttpRequest.new(e.href).post opt
-              end
-          else
-            if @connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
+        proc do |task_struct|
+          http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+          http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+          task_struct.request_object = http_req
-          e.request_object = w
-          w.callback do
-            s = w.response_header.status
+          http_req.callback do
+            s = http_req.response_header.status
             puts s
-            if s != 404
-              local_dir = File.dirname(e.local_path)
+            if s == 200
+              local_dir = File.dirname(task_struct.local_path)
               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
               begin
-                File.open(e.local_path, 'wb') do |f|
+                File.open(task_struct.local_path, 'wb') do |f|
                   f << if @convert_to_utf8 == true
-                         SpiderHelper.to_utf8(w.response)
+                         SpiderHelper.to_utf8(http_req.response)
                        else
-                         w.response
+                         http_req.response
                        end
                 end
-                succeed_list << e
-              rescue StandardError => e
-                puts e
+                call_parse_method(task_struct)
+                succeed_list << task_struct
+              rescue StandardError => exception
+                puts exception
               end
             end
+            task_struct.callback.call(task_struct, http_req) if task_struct.callback
           end
-          w.errback do
-            puts "errback:#{w.response_header},retry..."
-            puts e.href
-            puts w.response_header.status
-            ret = false
-            if e.http_method == :get
-              ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
-            elsif e.http_method == :post
-              ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
-            end
-            if ret
-              succeed_list << e
+          http_req.errback do
+            puts "errback:#{http_req.response_header},retry..."
+            puts task_struct.href
+            puts http_req.response_header.status
+            if task_struct.errback
+              task_struct.errback.call(task_struct, http_req)
             else
-              failed_list << e
+              ret = false
+              if task_struct.http_method == :get
+                ret = SpiderHelper.direct_http_get(task_struct.href, task_struct.local_path, convert_to_utf8: @convert_to_utf8)
+              elsif task_struct.http_method == :post
+                ret = SpiderHelper.direct_http_post(task_struct.href, task_struct.local_path, task_struct.params, convert_to_utf8: @convert_to_utf8)
+              end
+              if ret
+                call_parse_method(task_struct)
+                succeed_list << task_struct
+              else
+                failed_list << task_struct
+              end
             end
           end
           begin
-            multi.add e.local_path, w
+            multi.add task_struct.local_path, http_req
           rescue StandardError => exception
             puts exception
-            puts e.href
-            puts e.local_path
+            puts task_struct.href
+            puts task_struct.local_path
             stop_machine
           end
         end
@@ -170,38 +209,15 @@ module ListSpider
       @down_list.shift(@max)
     end
-    def call_parse_method(e)
-      pm = e.parse_method
-      if pm
-        case pm.arity
-        when 1
-          pm.call(e.local_path)
-        when 2
-          pm.call(e.local_path, e.extra_data)
-        when 3
-          res_header = nil
-          res_header = e.request_object.response_header if e.request_object
-          pm.call(e.local_path, e.extra_data, res_header)
-        when 4
-          res_header = nil
-          res_header = e.request_object.response_header if e.request_object
-          req = nil
-          req = e.request_object.req if e.request_object
-          pm.call(e.local_path, e.extra_data, res_header, req)
-        else
-          puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
-        end
-      end
+    def call_parse_method(task_struct)
+      task_struct.parse_method.call(task_struct) if task_struct.parse_method
     end
     def complete(_multi, success_list, failed_list)
       @succeed_size += success_list.size
       @failed_size += failed_list.size
-      success_list.each do |e|
-        call_parse_method(e)
-      end
+      @succeed_list.concat(success_list)
+      @failed_list.concat(failed_list)
       todo = next_task
@@ -223,6 +239,8 @@ module ListSpider
     def event_machine_start_list(down_list, callback = nil)
       EventMachine.run do
+        @succeed_list = []
+        @failed_list = []
         @begin_time = Time.now
         if down_list.empty?
           if callback
@@ -239,7 +257,7 @@ module ListSpider
     def filter_list(down_list)
       need_down_list = []
       down_list.each do |ts|
-        if !@overwrite_exist && File.exist?(ts.local_path)
+        if !ts.overwrite_exist && File.exist?(ts.local_path)
           call_parse_method(ts)
         elsif @local_path_set.add?(ts.local_path)
           need_down_list << ts

data/lib/spider_helper.rb CHANGED Viewed

@@ -3,8 +3,9 @@ require 'net/http'
 module SpiderHelper
   class << self
-    def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
-      href = string_to_uri(href) if href.class == ''.class
+    def direct_http_get(href, local_path, params: nil,
+                        header: nil, convert_to_utf8: false)
+      href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
       begin
         href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
       false
     end
-    def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
-      href = string_to_uri(href) if href.class == ''.class
+    def direct_http_post(href, local_path, params,
+                         header: nil, convert_to_utf8: false)
+      href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
       begin
         req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper
     def string_to_uri(href)
       l = href
-      l.sub!('http:///', 'http://') if l.start_with?('http:///')
+      l.sub!('http:///', 'http://')
       l = Addressable::URI.parse(l)
       l.normalize!
     end

data/spider_example.rb CHANGED Viewed

@@ -1,10 +1,10 @@
-require 'list_spider'
-# require File.expand_path('../lib/list_spider', __FILE__)
+# require 'list_spider'
+require File.expand_path('../lib/list_spider', __FILE__)
 DOWNLOAD_DIR = 'coolshell/'.freeze
-def parse_index_item(file_name)
-  content = File.read(file_name)
+def parse_index_item(e)
+  content = File.read(e.local_path)
   doc = Nokogiri::HTML(content)
   list_group = doc.css('h2.entry-title')
   link_list = list_group.css('a')

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 2.0.0
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-01-29 00:00:00.000000000 Z
+date: 2018-02-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler