list_spider 1.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
- data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+ metadata.gz: 39600b837bb18841d083c7b50dbaadf82e72c3013f690129af6786efec193a39
+ data.tar.gz: 4128e673c551e3fcc2c1f9d4a3302407bcf7bc26829a4957d04ebc0505d5ce07
  SHA512:
- metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
- data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+ metadata.gz: f900e8f76086f37239872d9b4452f5d735799100879ac16570d29c9570837adca52c3c9e37c725913920a68add7784bc2f94e2cef42663c54930ae5b3e37ec50
+ data.tar.gz: 90495a4dae2552c3f41e55f0efa61fef0511581eb2e13d90256e0a585c48f7fdb2af167cd8c6daa98ca80c2229d970187fbcec3db4a8edd43738f76f79c18951
data/.rdoc_options ADDED
@@ -0,0 +1,23 @@
+ --- !ruby/object:RDoc::Options
+ encoding: UTF-8
+ static_path: []
+ rdoc_include:
+ - "."
+ - "/Users/zhangchao/github/list_spider"
+ charset: UTF-8
+ exclude:
+ hyperlink_all: false
+ line_numbers: false
+ locale:
+ locale_dir: locale
+ locale_name:
+ main_page:
+ markup: markdown
+ output_decoration: true
+ page_dir:
+ show_hash: false
+ tab_width: 8
+ template_stylesheets: []
+ title:
+ visibility: :protected
+ webcvs:
data/.rubocop.yml CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
  Enabled: false
  Lint/AmbiguousRegexpLiteral:
  Enabled: false
- Lint/DefEndAlignment:
+ Layout/DefEndAlignment:
  AutoCorrect: true
- Lint/EndAlignment:
+ Layout/EndAlignment:
  AutoCorrect: true
  Style/BracesAroundHashParameters:
  Enabled: false
data/English_README.md ADDED
@@ -0,0 +1,169 @@
+ # list_spider
+
+ A URL-list spider based on em-http-request.
+
+ Often we only need to crawl a list of URLs, parse the results, and crawl the links we extract. list_spider is built for exactly that purpose.
+
+ ## Features
+ * Duplicate URL filtering (based on the local save path, so you can customize the behavior).
+
+ * Conversion to UTF-8 supported.
+
+ * Incremental crawling (files that already exist are not fetched again).
+
+ * Customizable concurrency limit and interval between tasks.
+
+ * Full HTTP options support.
+
+ ## Getting started
+
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
+
+ ## Use like this
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ @next_list = []
+
+ def parse_index_item(e)
+ content = File.read(e.local_path)
+ doc = Nokogiri::HTML(content)
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')
+
+ link_list.each do |link|
+ href = link['href']
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ # or you can save them to a database for later use
+ @next_list << TaskStruct.new(href, local_path)
+ end
+ end
+
+ task_list = []
+ task_list << TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ )
+
+ ListSpider.get_list(task_list)
+ ListSpider.get_list(@next_list, max: 60)
+ ```
+
+ ## Or in one step
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ def parse_index_item(e)
+ content = File.read(e.local_path)
+ doc = Nokogiri::HTML(content)
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')
+
+ link_list.each do |link|
+ href = link['href']
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ ListSpider.add_task(TaskStruct.new(href, local_path))
+ end
+ end
+
+ # get_one is a convenience wrapper for the single-TaskStruct case
+ ListSpider.get_one(
+ TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ ),
+ max: 60
+ )
+ ```
+
+ ## And there are many options you can use
+
+ ```ruby
+ def initialize(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http.response_header.status: status code
+ # http.response_header: response headers
+ # http.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
+ ```
+
+ ## Callback method forms
+
+ ```ruby
+ # called when the file is saved successfully
+ def parse_eresponse(task_struct)
+ # ...
+ end
+
+ def call_back(task_struct, http_req)
+ # http_req is an EventMachine::HttpRequest object
+ # http_req.response_header.status
+ # ...
+ end
+
+ def err_back(task_struct, http_req)
+ # ...
+ end
+ ```
+
+ ### License
+
+ (MIT License) - Copyright (c) 2016 Charles Zhang
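One thing neither README example shows is the new `custom_data` field, which replaces the old `extra_data` parameter. A minimal sketch, assuming the 2.x API documented above (the payload and paths are illustrative):

```ruby
require 'list_spider'

# custom_data rides along on the task and is readable from any callback.
task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  custom_data: { category: 'tech' }, # hypothetical payload
  parse_method: proc { |t| puts "#{t.local_path}: #{t.custom_data[:category]}" }
)

ListSpider.get_one(task)
```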
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
+ PATH
+ remote: .
+ specs:
+ list_spider (2.0.2)
+ em-http-request (~> 1.1, >= 1.1.3)
+ nokogiri (~> 1.6, >= 1.6.7)
+ rchardet (~> 1.6, >= 1.6.1)
+
+ GEM
+ remote: https://rubygems.org/
+ specs:
+ addressable (2.5.2)
+ public_suffix (>= 2.0.2, < 4.0)
+ cookiejar (0.3.3)
+ em-http-request (1.1.5)
+ addressable (>= 2.3.4)
+ cookiejar (!= 0.3.1)
+ em-socksify (>= 0.3)
+ eventmachine (>= 1.0.3)
+ http_parser.rb (>= 0.6.0)
+ em-socksify (0.3.2)
+ eventmachine (>= 1.0.0.beta.4)
+ eventmachine (1.2.5)
+ http_parser.rb (0.6.0)
+ mini_portile2 (2.3.0)
+ nokogiri (1.8.2)
+ mini_portile2 (~> 2.3.0)
+ public_suffix (3.0.2)
+ rake (10.5.0)
+ rchardet (1.7.0)
+
+ PLATFORMS
+ ruby
+
+ DEPENDENCIES
+ bundler (~> 1.16)
+ list_spider!
+ rake (~> 10.0)
+
+ BUNDLED WITH
+ 1.16.1
data/README.md CHANGED
@@ -1,186 +1,181 @@
- # list_spider
+ # About list_spider

- A url list spider based on em-http-request.
+ list_spider is a crawler tool based on [em-http-request](https://github.com/igrigorik/em-http-request).

- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
+ In many cases a crawler's job is to fetch links, parse the returned data, extract new links, and keep crawling; list_spider is a crawler tool built for exactly this scenario.

- ## Features
- * Duplicate url filtering (based on local path, so you can custom your behavior).
+ ## Features
+ * Duplicate filtering (the local file path is used as the uniqueness key)

- * Convert to UTF-8 support.
+ * UTF-8 conversion supported.

- * Increased spider support (don't spider exist).
+ * Incremental crawling by default: URLs already crawled are not fetched again (an option can force re-fetching).

- * Customize concurrent number and interval between task.
+ * Freely configurable maximum concurrency and interval between crawl tasks.

- * Http options support.
+ * All HTTP options supported.

- ## Getting started
+ ## Getting started

- gem install list_spider
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```

- ## Use like this
+ ## Usage
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- $next_list = []
+ @next_list = []

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
- list_group = doc.css("h2.entry-title")
- link_list = list_group.css("a")
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')

  link_list.each do |link|
  href = link['href']
- local_path = DOWNLOAD_DIR + link.content + ".html"
- #or you can save them to database for later use
- $next_list<< TaskStruct.new(href, local_path)
+ local_path = DOWNLOAD_DIR + link.content + '.html'
+ # or save them to a database for later processing
+ @next_list << TaskStruct.new(href, local_path)
  end
  end

  task_list = []
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
+ task_list << TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ )

  ListSpider.get_list(task_list)
- ListSpider.get_list($next_list, max: 60)
-
+ ListSpider.get_list(@next_list, max: 60)
  ```

- ## Or in one step
+ ## Or in one simpler step
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
- list_group = doc.css("h2.entry-title")
- link_list = list_group.css("a")
+ list_group = doc.css('h2.entry-title')
+ link_list = list_group.css('a')

  link_list.each do |link|
  href = link['href']
- local_path = DOWNLOAD_DIR + link.content + ".html"
+ local_path = DOWNLOAD_DIR + link.content + '.html'
  ListSpider.add_task(TaskStruct.new(href, local_path))
  end
  end

- #get_one is a simple function for one taskstruct situation
- ListSpider.get_one(TaskStruct.new(
- 'https://coolshell.cn/',
- DOWNLOAD_DIR + 'index.html',
- parse_method: method(:parse_index_item)),
- max: 60)
-
+ # get_one is a convenience wrapper around get_list for the single-task case
+ ListSpider.get_one(
+ TaskStruct.new(
+ 'https://coolshell.cn/',
+ DOWNLOAD_DIR + 'index.html',
+ parse_method: method(:parse_index_item)
+ ),
+ max: 60
+ )
  ```

- ## You can define parse method in four forms
-
- ```ruby
- def parse_response(file_name)
- #...
- end
-
-
- # extra_data is passed by TaskStruct's extra_data param
-
- def parse_response(file_name, extra_data)
- #...
- end
-
-
- # response_header is a EventMachine::HttpResponseHeader object
- # you can use it like this:
- # response_header.status
- # response_header.cookie
- # response_header['Last-Modified']
-
- def parse_response(file_name, extra_data, response_header)
- response_header.status
- response_header['Last-Modified']
-
- #...
- end
-
- # req is a EventMachine::HttpClientOptions object
- # you can use it like this:
- # req.body
- # req.headers
- # req.uri
- # req.host
- # req.port
- def parse_response(file_name, extra_data, response_header, req)
- puts req.body
- puts req.headers
- puts req.uri
- puts req.host
- puts req.port
-
- #...
- end
-
- ```
-
- ## And there are many options you can use
-
- ```ruby
- TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ ## get_list/get_one parameters
  ```
+ # down_list: array of TaskStructs to request
+ # interval: interval between tasks, 0 by default. A Range sleeps a random number of seconds within that range; RANDOM_TIME sleeps a random 3 to 10 seconds.
+ # max: maximum concurrency, 50 by default. NO_LIMIT_CONCURRENT runs all request tasks concurrently at once.

- ```ruby
- #no concurrent limit (note: only use when list size is small)
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
-
- #sleep random time, often used in site which limit spider
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
-
- #set random time range
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
-
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  ```

- ###Options below will take effect in the whole program (set them before call get_list)
+ ## Options you can set on TaskStruct, essentially the same as [em-http-request](https://github.com/igrigorik/em-http-request)

  ```ruby
- #set proxy
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-
- #set http header (if TaskStruct has header it will be used priority)
- ListSpider.set_header_option(header_option)
-
- #convert the file encoding to utf-8
- ListSpider.convert_to_utf8 = true
-
- #set connect timeout
- ListSpider.connect_timeout = 2*60
-
- #over write exist file
- ListSpider.overwrite_exist = false
-
- #set redirect depth
- ListSpider.max_redirects = 10
-
+ new(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http_req.response_header.status: status code
+ # http_req.response_header: response headers
+ # http_req.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
  ```

- ## There is a util class to help check or delete unvalid file
+ ## Callback function forms

  ```ruby
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
+ # called after the file is saved successfully; passed via the parse_method parameter
+ def parse_eresponse(task_struct)
+ # ...
+ end

- #params
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request succeeds; passed via the callback parameter
+ def call_back(task_struct, http_req)
+ # http_req is an EventMachine::HttpRequest object
+ # http_req.response_header.status
+ # ...
+ end

- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ # called when the HTTP request fails; passed via the errback parameter
+ def err_back(task_struct, http_req)
+ # ...
+ end
  ```

- ### License
+ ## License

  (MIT License) - Copyright (c) 2016 Charles Zhang
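The removed 1.x README section configured global state through `ListSpider` setters; in 2.x those knobs move onto each `TaskStruct`. A minimal migration sketch under the API shown in this diff (the header and proxy values are placeholders):

```ruby
require 'list_spider'

task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  convert_to_utf8: true,   # was: ListSpider.convert_to_utf8 = true
  overwrite_exist: true,   # was: ListSpider.overwrite_exist = true
  redirects: 10,           # was: ListSpider.max_redirects = 10
  connect_timeout: 2 * 60, # was: ListSpider.connect_timeout = 2 * 60
  head: { 'user-agent' => 'list_spider' },  # was: ListSpider.set_header_option(...)
  proxy: { host: '127.0.0.1', port: 9000 }  # was: ListSpider.set_proxy(...)
)

ListSpider.get_one(task, interval: ListSpider::RANDOM_TIME, max: 1)
```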
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
  class FileFilter
  # 4033
  # 920
- def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+ def initialize(dir_pattern, size_threshold: 1000,
+ cust_judge: nil, process_block: nil)
  @dir_pattern = dir_pattern
  @size_threshold = size_threshold
  @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
  ).start
  end

- def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+ size_threshold: 1000, cust_judge: nil)
  result_file = File.open(save_file_name, 'wt')
  FileFilter.new(
  dir_pattern,
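For context, a short usage sketch of the reflowed entry point, with the defaults visible in this hunk (the glob pattern and threshold are illustrative):

```ruby
require 'list_spider'

# Record the paths of downloads smaller than 300 bytes (likely failed or
# empty pages) in filtered_file.txt, without deleting anything.
FileFilter.check_save_result(
  'coolshell/*',
  save_file_name: 'filtered_file.txt',
  size_threshold: 300
)
```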
data/lib/list_spider/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ListSpider
- VERSION = '1.0.0'.freeze
+ VERSION = '2.2.0'.freeze
  end
data/lib/list_spider.rb CHANGED
@@ -4,26 +4,108 @@ require 'nokogiri'
  require 'fileutils'
  require 'set'
  require 'addressable/uri'
- require File.expand_path('../spider_helper', __FILE__)
- require File.expand_path('../file_filter', __FILE__)
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)

+ # Crawl task class
  class TaskStruct
- def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ # * href: request URL
+ # * local_path: local path to save the data (this path is used as the deduplication key)
+ # * http_method: HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ # * custom_data: user-defined data
+ # * parse_method: callback to parse the saved file; receives the TaskStruct itself
+ def initialize(href, # request URL
+ local_path, # local path to save the data (this path is used as the deduplication key)
+ # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+ http_method: :get,
+ custom_data: nil, # user-defined data
+ parse_method: nil, # callback to parse the saved file; receives the TaskStruct itself
+ # callback invoked after a successful request; the file may not have been saved (e.g. 301, 404)
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ # http_req.response_header.status: status code
+ # http_req.response_header: response headers
+ # http_req.response: response body
+ callback: nil,
+ # callback invoked when the request fails
+ # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+ errback: nil,
+ stream_callback: nil, # streaming-data callback
+ convert_to_utf8: false, # whether to convert the response to UTF-8
+ overwrite_exist: false, # whether to overwrite an existing file
+ # request options
+ redirects: 3, # maximum number of redirects to follow
+ keepalive: nil, # (connection reuse not yet supported)
+ file: nil, # path of a file to upload
+ path: nil, # request path, useful for pipelined requests (not yet supported)
+ query: nil, # query string, either a String or a Hash
+ body: nil, # request body, either a String or a Hash
+ head: nil, # request headers
+ # connection options
+ connect_timeout: 60, # connection timeout in seconds
+ inactivity_timeout: nil, # inactivity timeout after connecting
+ # SSL options
+ # ssl: {
+ # :private_key_file => '/tmp/server.key',
+ # :cert_chain_file => '/tmp/server.crt',
+ # :verify_peer => false
+ # }
+ ssl: nil,
+ # bind: {
+ # :host => '123.123.123.123', # use a specific interface for outbound request
+ # :port => '123'
+ # }
+ bind: nil,
+ # proxy options
+ # proxy: {
+ # :host => '127.0.0.1', # proxy address
+ # :port => 9000, # proxy port
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+ # :authorization => ['user', 'pass'] # proxy authorization header
+ # }
+ proxy: nil)
  @href = href
- @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
  @local_path = local_path
  @http_method = http_method
- @params = params
- @extra_data = extra_data
+ @custom_data = custom_data
  @parse_method = parse_method
- @header = header
+ @callback = callback
+ @errback = errback
+ @stream_callback = stream_callback
+ @convert_to_utf8 = convert_to_utf8
+ @overwrite_exist = overwrite_exist
+
+ @request_options = {
+ redirects: redirects,
+ keepalive: keepalive,
+ file: file,
+ path: path,
+ query: query,
+ body: body,
+ head: head
+ }.compact
+
+ @connection_options = {
+ connect_timeout: connect_timeout,
+ inactivity_timeout: inactivity_timeout,
+ ssl: ssl,
+ bind: bind,
+ proxy: proxy
+ }.compact
  end

- def ==(other)
- other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
- end
-
- attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+ attr_accessor :href, :local_path,
+ :http_method,
+ :custom_data,
+ :request_object,
+ :parse_method,
+ :callback,
+ :errback,
+ :stream_callback,
+ :convert_to_utf8,
+ :overwrite_exist,
+ :request_options,
+ :connection_options
  end

  module ListSpider
@@ -33,33 +115,44 @@ module ListSpider
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
- @convert_to_utf8 = false
- @connection_opts = { connect_timeout: 60 }
- @overwrite_exist = false
- @max_redirects = 10
  @local_path_set = Set.new

  class << self
- attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
+ def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ if interval.is_a? Range
+ @random_time_range = interval
+ interval = RANDOM_TIME
+ end
+
+ @down_list = filter_list(down_list)
+ @interval = interval
+ @max = max
+ @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+ @succeed_size = 0
+ @failed_size = 0

- def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
- @connection_opts = {
- proxy: {
- host: proxy_addr,
- port: proxy_port
- }
- }
- @connection_opts[:proxy][:authorization] = [username, password] if username && password
+ puts "total size:#{@down_list.size}"
+ event_machine_start_list(next_task, method(:complete))
  end

- def connect_timeout(max_connect_time)
- @connection_opts[:connect_timeout] = max_connect_time
+ def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_list([task], interval: interval, max: max)
  end

- def set_header_option(header_option)
- @header_option = header_option
+ def add_task(task)
+ if task.is_a? Array
+ need_down_list = filter_list(task)
+ @down_list += need_down_list
+ elsif task.is_a?TaskStruct
+ need_down_list = filter_list([task])
+ @down_list += need_down_list
+ else
+ puts "error task type:#{task.class}"
+ end
  end

+ private
+
  def event_machine_down(link_struct_list, callback = nil)
  failed_list = []
  succeed_list = []
@@ -67,78 +160,47 @@ module ListSpider
  begin_time = Time.now

  for_each_proc =
- proc do |e|
- opt = { redirects: @max_redirects }
- if e.header
- opt[:head] = e.header
- elsif defined? @header_option
- opt[:head] = @header_option
- end
+ proc do |task_struct|
+ http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+ http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+ task_struct.request_object = http_req

- if e.http_method == :post
- opt[:body] = e.params unless e.params.empty?
- w =
- if @connection_opts
- EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
- else
- EventMachine::HttpRequest.new(e.href).post opt
- end
- else
- if @connection_opts
- opt[:query] = e.params unless e.params.empty?
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
- else
- w = EventMachine::HttpRequest.new(e.href).get opt
- end
- end
-
- e.request_object = w
+ http_req.callback do
+ s = http_req.response_header.status
+ puts "#{Time.now}, http status code: #{s}"

- w.callback do
- s = w.response_header.status
- puts s
- if s != 404
- local_dir = File.dirname(e.local_path)
+ if s == 200
+ local_dir = File.dirname(task_struct.local_path)
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
  begin
- File.open(e.local_path, 'wb') do |f|
+ File.open(task_struct.local_path, 'wb') do |f|
  f << if @convert_to_utf8 == true
- SpiderHelper.to_utf8(w.response)
+ SpiderHelper.to_utf8(http_req.response)
  else
- w.response
+ http_req.response
  end
  end
- succeed_list << e
- rescue StandardError => e
- puts e
+ call_parse_method(task_struct)
+ succeed_list << task_struct
+ rescue StandardError => exception
+ puts exception
  end
  end
+ task_struct.callback.call(task_struct, http_req) if task_struct.callback
  end
- w.errback do
- puts "errback:#{w.response_header},retry..."
- puts e.href
- puts w.response_header.status

- ret = false
- if e.http_method == :get
- ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
- elsif e.http_method == :post
- ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
- end
+ http_req.errback do
+ puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

- if ret
- succeed_list << e
- else
- failed_list << e
- end
+ task_struct.errback.call(task_struct, http_req) if task_struct.errback
  end

  begin
- multi.add e.local_path, w
+ multi.add task_struct.local_path, http_req
  rescue StandardError => exception
  puts exception
- puts e.href
- puts e.local_path
+ puts task_struct.href
+ puts task_struct.local_path
  stop_machine
  end
  end
@@ -170,38 +232,15 @@ module ListSpider
  @down_list.shift(@max)
  end

- def call_parse_method(e)
- pm = e.parse_method
- if pm
- case pm.arity
- when 1
- pm.call(e.local_path)
- when 2
- pm.call(e.local_path, e.extra_data)
- when 3
- res_header = nil
- res_header = e.request_object.response_header if e.request_object
- pm.call(e.local_path, e.extra_data, res_header)
- when 4
- res_header = nil
- res_header = e.request_object.response_header if e.request_object
-
- req = nil
- req = e.request_object.req if e.request_object
-
- pm.call(e.local_path, e.extra_data, res_header, req)
- else
- puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
- end
- end
+ def call_parse_method(task_struct)
+ task_struct.parse_method.call(task_struct) if task_struct.parse_method
  end

  def complete(_multi, success_list, failed_list)
  @succeed_size += success_list.size
  @failed_size += failed_list.size
- success_list.each do |e|
- call_parse_method(e)
- end
+ @succeed_list.concat(success_list)
+ @failed_list.concat(failed_list)

  todo = next_task

@@ -223,6 +262,8 @@
  def event_machine_start_list(down_list, callback = nil)
  EventMachine.run do
+ @succeed_list = []
+ @failed_list = []
  @begin_time = Time.now
  if down_list.empty?
  if callback
@@ -239,7 +280,7 @@
  def filter_list(down_list)
  need_down_list = []
  down_list.each do |ts|
- if !@overwrite_exist && File.exist?(ts.local_path)
+ if !ts.overwrite_exist && File.exist?(ts.local_path)
  call_parse_method(ts)
  elsif @local_path_set.add?(ts.local_path)
  need_down_list << ts
@@ -247,43 +288,6 @@
  end
  need_down_list
  end
-
- def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
- if interval.is_a? Range
- @random_time_range = interval
- interval = RANDOM_TIME
- end
-
- @down_list = []
-
- need_down_list = filter_list(down_list)
-
- @down_list += need_down_list
- @interval = interval
- @max = max
- @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
- @succeed_size = 0
- @failed_size = 0
-
- puts "total size:#{@down_list.size}"
- event_machine_start_list(next_task, method(:complete))
- end
-
- def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
- get_list([task], interval: interval, max: max)
- end
-
- def add_task(task)
- if task.is_a? Array
- need_down_list = filter_list(task)
- @down_list += need_down_list
- elsif task.is_a?TaskStruct
- need_down_list = filter_list([task])
- @down_list += need_down_list
- else
- puts "error task type:#{task.class}"
- end
- end
  end

  Signal.trap('INT') do
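The rewritten dispatch proc above hands each task straight to em-http-request via `public_send` and routes results through the task's own hooks. A minimal sketch of that flow, assuming the API in this diff (URL and path are placeholders):

```ruby
require 'list_spider'

task = TaskStruct.new(
  'https://example.com/',  # hypothetical URL
  'pages/example.html',
  parse_method: proc { |t| puts "parsed #{t.local_path}" },
  callback: proc { |t, http| puts "#{t.href} status #{http.response_header.status}" },
  errback: proc { |t, http| puts "#{t.href} failed: #{http.error}" }
)

ListSpider.get_one(task)
```

Note that `parse_method` now always receives the `TaskStruct` itself, replacing the four arity-dependent forms removed above.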
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'

  module SpiderHelper
  class << self
- def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
- href = string_to_uri(href) if href.class == ''.class
+ def direct_http_get(href, local_path, params: nil,
+ header: nil, convert_to_utf8: false)
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
  href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
  false
  end

- def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
- href = string_to_uri(href) if href.class == ''.class
+ def direct_http_post(href, local_path, params,
+ header: nil, convert_to_utf8: false)
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
  req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper

  def string_to_uri(href)
  l = href
- l.sub!('http:///', 'http://') if l.start_with?('http:///')
+ l.sub!('http:///', 'http://')
  l = Addressable::URI.parse(l)
  l.normalize!
  end
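A quick sketch of what the simplified `string_to_uri` does, based on the code above (the input URL is illustrative; the exact normalized form comes from Addressable):

```ruby
require 'list_spider'

# sub! repairs a malformed triple-slash scheme in place; Addressable then
# parses and normalizes the result (e.g. lowercasing the host).
uri = SpiderHelper.string_to_uri('http:///Example.COM/index.html')
puts uri.to_s # => "http://example.com/index.html"
```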
data/list_spider.gemspec CHANGED
@@ -1,5 +1,5 @@

- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'list_spider/version'

@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'rake', '~> 10.0'

  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
- spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
+ spec.add_dependency 'nokogiri', '~> 1.11'
  spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
  end
data/spider_example.rb CHANGED
@@ -3,8 +3,8 @@ require 'list_spider'

  DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
  list_group = doc.css('h2.entry-title')
  link_list = list_group.css('a')
@@ -16,8 +16,6 @@ def parse_index_item(file_name)
  end
  end

- # ListSpider.convert_to_utf8 = true
-
  # get_one is a simple function for one taskstruct situation
  ListSpider.get_one(
  TaskStruct.new(
data/spider_example_2.rb CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze

  @next_list = []

- def parse_index_item(file_name)
- content = File.read(file_name)
+ def parse_index_item(e)
+ content = File.read(e.local_path)
  doc = Nokogiri::HTML(content)
  list_group = doc.css('h2.entry-title')
  link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: list_spider
  version: !ruby/object:Gem::Version
- version: 1.0.0
+ version: 2.2.0
  platform: ruby
  authors:
  - Charles Zhang
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-01-29 00:00:00.000000000 Z
+ date: 2019-09-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -64,20 +64,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.6'
- - - ">="
- - !ruby/object:Gem::Version
- version: 1.6.7
+ version: '1.11'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.6'
- - - ">="
- - !ruby/object:Gem::Version
- version: 1.6.7
+ version: '1.11'
  - !ruby/object:Gem::Dependency
  name: rchardet
  requirement: !ruby/object:Gem::Requirement
@@ -106,8 +100,11 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".rdoc_options"
  - ".rubocop.yml"
+ - English_README.md
  - Gemfile
+ - Gemfile.lock
  - README.md
  - Rakefile
  - bin/console
@@ -139,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.3
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: List Spider