list_spider 1.0.0 → 2.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
- data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+ metadata.gz: 39600b837bb18841d083c7b50dbaadf82e72c3013f690129af6786efec193a39
+ data.tar.gz: 4128e673c551e3fcc2c1f9d4a3302407bcf7bc26829a4957d04ebc0505d5ce07
  SHA512:
- metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
- data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+ metadata.gz: f900e8f76086f37239872d9b4452f5d735799100879ac16570d29c9570837adca52c3c9e37c725913920a68add7784bc2f94e2cef42663c54930ae5b3e37ec50
+ data.tar.gz: 90495a4dae2552c3f41e55f0efa61fef0511581eb2e13d90256e0a585c48f7fdb2af167cd8c6daa98ca80c2229d970187fbcec3db4a8edd43738f76f79c18951
data/.rdoc_options ADDED
@@ -0,0 +1,23 @@
+ --- !ruby/object:RDoc::Options
+ encoding: UTF-8
+ static_path: []
+ rdoc_include:
+ - "."
+ - "/Users/zhangchao/github/list_spider"
+ charset: UTF-8
+ exclude:
+ hyperlink_all: false
+ line_numbers: false
+ locale:
+ locale_dir: locale
+ locale_name:
+ main_page:
+ markup: markdown
+ output_decoration: true
+ page_dir:
+ show_hash: false
+ tab_width: 8
+ template_stylesheets: []
+ title:
+ visibility: :protected
+ webcvs:
data/.rubocop.yml CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
  Enabled: false
  Lint/AmbiguousRegexpLiteral:
  Enabled: false
- Lint/DefEndAlignment:
+ Layout/DefEndAlignment:
  AutoCorrect: true
- Lint/EndAlignment:
+ Layout/EndAlignment:
  AutoCorrect: true
  Style/BracesAroundHashParameters:
  Enabled: false
data/English_README.md ADDED
@@ -0,0 +1,169 @@
+ # list_spider
+
+ A URL list spider based on em-http-request.
+
+ Often we only need to crawl a list of URLs, parse the results, and then crawl again. list_spider is built for that purpose.
+
+ ## Features
+ * Duplicate URL filtering (based on the local path, so you can customize the behavior).
+
+ * Optional conversion of responses to UTF-8.
+
+ * Incremental crawling (files that already exist are not fetched again).
+
+ * Configurable concurrency limit and interval between tasks.
+
+ * Full HTTP option support.
+
+ ## Getting started
+
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```
+
+ ## Use like this
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ @next_list = []
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # or save them to a database for later use
+     @next_list << TaskStruct.new(href, local_path)
+   end
+ end
+
+ task_list = []
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )
+
+ ListSpider.get_list(task_list)
+ ListSpider.get_list(@next_list, max: 60)
+ ```
+
+ ## Or in one step
+ ```ruby
+ require 'list_spider'
+
+ DOWNLOAD_DIR = 'coolshell/'.freeze
+
+ def parse_index_item(e)
+   content = File.read(e.local_path)
+   doc = Nokogiri::HTML(content)
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')
+
+   link_list.each do |link|
+     href = link['href']
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     ListSpider.add_task(TaskStruct.new(href, local_path))
+   end
+ end
+
+ # get_one is a convenience wrapper around get_list for a single TaskStruct
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
+ ```
+
+ ## And there are many options you can use
+
+ ```ruby
+ def initialize(href, # request URL
+                local_path, # local path to save the data (used as the deduplication key)
+                # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                http_method: :get,
+                custom_data: nil, # custom user data
+                parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+                # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                # http.response_header.status: status code
+                # http.response_header: response headers
+                # http.response: response body
+                callback: nil,
+                # callback after a failed request
+                # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                errback: nil,
+                stream_callback: nil, # callback for streamed response data
+                convert_to_utf8: false, # whether to convert the response to UTF-8
+                overwrite_exist: false, # whether to overwrite an existing file
+                # request options
+                redirects: 3, # number of redirects to follow
+                keepalive: nil, # (connection reuse is not supported yet)
+                file: nil, # path of a file to upload
+                path: nil, # request path, useful for pipelined requests (not supported yet)
+                query: nil, # query string, a String or a Hash
+                body: nil, # request body, a String or a Hash
+                head: nil, # request headers
+                # connection options
+                connect_timeout: 60, # connection timeout
+                inactivity_timeout: nil, # timeout after the connection is established
+                # SSL options
+                # ssl: {
+                #   :private_key_file => '/tmp/server.key',
+                #   :cert_chain_file => '/tmp/server.crt',
+                #   :verify_peer => false
+                # }
+                ssl: nil,
+                # bind: {
+                #   :host => '123.123.123.123', # use a specific interface for outbound request
+                #   :port => '123'
+                # }
+                bind: nil,
+                # proxy options
+                # proxy: {
+                #   :host => '127.0.0.1', # proxy address
+                #   :port => 9000, # proxy port
+                #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+                #   :authorization => ['user', 'pass'] # proxy authorization header
+                # }
+                proxy: nil)
+ ```
+
+ ## Callback method signatures
+
+ ```ruby
+ # called when the file is saved successfully
+ def parse_eresponse(task_struct)
+   # ...
+ end
+
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end
+
+ def err_back(task_struct, http_req)
+   # ...
+ end
+ ```
+
+ ### License
+
+ (MIT License) - Copyright (c) 2016 Charles Zhang
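
Taken together, the options and callbacks documented in this new README compose as in the following sketch; the URL, paths, handler names, and option values are illustrative, not part of the gem:

```ruby
require 'list_spider'

# Hypothetical status logger wired through the documented callback: option.
log_status = proc do |task, http_req|
  puts "#{task.href} -> #{http_req.response_header.status}"
end

task = TaskStruct.new(
  'https://coolshell.cn/',
  'coolshell/index.html',
  redirects: 5,            # follow up to five redirects
  convert_to_utf8: true,   # re-encode the saved file as UTF-8
  overwrite_exist: true,   # re-fetch even if the file already exists
  callback: log_status,
  errback: proc { |t, _req| puts "failed: #{t.href}" }
)

ListSpider.get_one(task)
```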
data/Gemfile.lock ADDED
@@ -0,0 +1,41 @@
+ PATH
+   remote: .
+   specs:
+     list_spider (2.0.2)
+       em-http-request (~> 1.1, >= 1.1.3)
+       nokogiri (~> 1.6, >= 1.6.7)
+       rchardet (~> 1.6, >= 1.6.1)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     addressable (2.5.2)
+       public_suffix (>= 2.0.2, < 4.0)
+     cookiejar (0.3.3)
+     em-http-request (1.1.5)
+       addressable (>= 2.3.4)
+       cookiejar (!= 0.3.1)
+       em-socksify (>= 0.3)
+       eventmachine (>= 1.0.3)
+       http_parser.rb (>= 0.6.0)
+     em-socksify (0.3.2)
+       eventmachine (>= 1.0.0.beta.4)
+     eventmachine (1.2.5)
+     http_parser.rb (0.6.0)
+     mini_portile2 (2.3.0)
+     nokogiri (1.8.2)
+       mini_portile2 (~> 2.3.0)
+     public_suffix (3.0.2)
+     rake (10.5.0)
+     rchardet (1.7.0)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.16)
+   list_spider!
+   rake (~> 10.0)
+
+ BUNDLED WITH
+   1.16.1
data/README.md CHANGED
@@ -1,186 +1,181 @@
- # list_spider
+ # About list_spider

- A url list spider based on em-http-request.
+ list_spider is a crawler tool based on [em-http-request](https://github.com/igrigorik/em-http-request).

- Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
+ In many cases a crawler's job is to fetch URLs, parse the returned data, extract new links, and keep crawling; list_spider is a crawler built for exactly that scenario.

- ## Features
- * Duplicate url filtering (based on local path, so you can custom your behavior).
+ ## Features
+ * Duplicate filtering (the local file path is used as the uniqueness check).

- * Convert to UTF-8 support.
+ * UTF-8 conversion support.

- * Increased spider support (don't spider exist).
+ * Incremental crawling by default; anything already fetched is not fetched again (an option can force re-fetching).

- * Customize concurrent number and interval between task.
+ * Configurable maximum concurrency and interval between crawl tasks.

- * Http options support.
+ * All HTTP options can be set.

- ## Getting started
+ ## Getting started

- gem install list_spider
+ ```ruby
+ gem install list_spider
+ ```
+
+ Or add it to your Gemfile
+
+ ```ruby
+ gem 'list_spider'
+ ```

- ## Use like this
+ ## Usage
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- $next_list = []
+ @next_list = []

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')

    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
-     #or you can save them to database for later use
-     $next_list<< TaskStruct.new(href, local_path)
+     local_path = DOWNLOAD_DIR + link.content + '.html'
+     # or save them to a database for later processing
+     @next_list << TaskStruct.new(href, local_path)
    end
  end

  task_list = []
- task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
+ task_list << TaskStruct.new(
+   'https://coolshell.cn/',
+   DOWNLOAD_DIR + 'index.html',
+   parse_method: method(:parse_index_item)
+ )

  ListSpider.get_list(task_list)
- ListSpider.get_list($next_list, max: 60)
-
+ ListSpider.get_list(@next_list, max: 60)
  ```

- ## Or in one step
+ ## Or, more simply, in one step
  ```ruby
  require 'list_spider'

- DOWNLOAD_DIR = 'coolshell/'
+ DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
-   list_group = doc.css("h2.entry-title")
-   link_list = list_group.css("a")
+   list_group = doc.css('h2.entry-title')
+   link_list = list_group.css('a')

    link_list.each do |link|
      href = link['href']
-     local_path = DOWNLOAD_DIR + link.content + ".html"
+     local_path = DOWNLOAD_DIR + link.content + '.html'
      ListSpider.add_task(TaskStruct.new(href, local_path))
    end
  end

- #get_one is a simple function for one taskstruct situation
- ListSpider.get_one(TaskStruct.new(
-   'https://coolshell.cn/',
-   DOWNLOAD_DIR + 'index.html',
-   parse_method: method(:parse_index_item)),
-   max: 60)
-
+ # get_one is a simplified wrapper around get_list, convenient for a single task
+ ListSpider.get_one(
+   TaskStruct.new(
+     'https://coolshell.cn/',
+     DOWNLOAD_DIR + 'index.html',
+     parse_method: method(:parse_index_item)
+   ),
+   max: 60
+ )
  ```

- ## You can define parse method in four forms
-
- ```ruby
- def parse_response(file_name)
-   #...
- end
-
-
- # extra_data is passed by TaskStruct's extra_data param
-
- def parse_response(file_name, extra_data)
-   #...
- end
-
-
- # response_header is a EventMachine::HttpResponseHeader object
- # you can use it like this:
- # response_header.status
- # response_header.cookie
- # response_header['Last-Modified']
-
- def parse_response(file_name, extra_data, response_header)
-   response_header.status
-   response_header['Last-Modified']
-
-   #...
- end
-
- # req is a EventMachine::HttpClientOptions object
- # you can use it like this:
- # req.body
- # req.headers
- # req.uri
- # req.host
- # req.port
- def parse_response(file_name, extra_data, response_header, req)
-   puts req.body
-   puts req.headers
-   puts req.uri
-   puts req.host
-   puts req.port
-
-   #...
- end
-
- ```
-
- ## And there are many options you can use
-
- ```ruby
- TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+ ## get_list/get_one parameters
  ```
+ # down_list: the array of TaskStructs to request
+ # interval: interval between tasks, 0 by default. If a Range is given, the spider sleeps a random number of seconds within that Range; RANDOM_TIME means a random 3 to 10 seconds.
+ # max: maximum concurrency, 50 by default. NO_LIMIT_CONCURRENT runs all request tasks concurrently at once.

- ```ruby
- #no concurrent limit (note: only use when list size is small)
- ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
-
- #sleep random time, often used in site which limit spider
- ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
-
- #set random time range
- ListSpider.get_list(down_list, interval: (1..10), max: 1)
-
+ get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+ get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  ```

- ###Options below will take effect in the whole program (set them before call get_list)
+ ## Options supported by TaskStruct, essentially the same as in [em-http-request](https://github.com/igrigorik/em-http-request)

  ```ruby
- #set proxy
- ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-
- #set http header (if TaskStruct has header it will be used priority)
- ListSpider.set_header_option(header_option)
-
- #convert the file encoding to utf-8
- ListSpider.convert_to_utf8 = true
-
- #set connect timeout
- ListSpider.connect_timeout = 2*60
-
- #over write exist file
- ListSpider.overwrite_exist = false
-
- #set redirect depth
- ListSpider.max_redirects = 10
-
+ new(href, # request URL
+     local_path, # local path to save the data (used as the deduplication key)
+     # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+     http_method: :get,
+     custom_data: nil, # custom user data
+     parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+     # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     # http_req.response_header.status: status code
+     # http_req.response_header: response headers
+     # http_req.response: response body
+     callback: nil,
+     # callback after a failed request
+     # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+     errback: nil,
+     stream_callback: nil, # callback for streamed response data
+     convert_to_utf8: false, # whether to convert the response to UTF-8
+     overwrite_exist: false, # whether to overwrite an existing file
+     # request options
+     redirects: 3, # number of redirects to follow
+     keepalive: nil, # (connection reuse is not supported yet)
+     file: nil, # path of a file to upload
+     path: nil, # request path, useful for pipelined requests (not supported yet)
+     query: nil, # query string, a String or a Hash
+     body: nil, # request body, a String or a Hash
+     head: nil, # request headers
+     # connection options
+     connect_timeout: 60, # connection timeout
+     inactivity_timeout: nil, # timeout after the connection is established
+     # SSL options
+     # ssl: {
+     #   :private_key_file => '/tmp/server.key',
+     #   :cert_chain_file => '/tmp/server.crt',
+     #   :verify_peer => false
+     # }
+     ssl: nil,
+     # bind: {
+     #   :host => '123.123.123.123', # use a specific interface for outbound request
+     #   :port => '123'
+     # }
+     bind: nil,
+     # proxy options
+     # proxy: {
+     #   :host => '127.0.0.1', # proxy address
+     #   :port => 9000, # proxy port
+     #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+     #   :authorization => ['user', 'pass'] # proxy authorization header
+     # }
+     proxy: nil)
  ```

- ## There is a util class to help check or delete unvalid file
+ ## Callback method signatures

  ```ruby
- FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
-
- FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
+ # called after the file is saved successfully; passed via the parse_method parameter
+ def parse_eresponse(task_struct)
+   # ...
+ end

- #params
- FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request succeeds; passed via the callback parameter
+ def call_back(task_struct, http_req)
+   # http_req is an EventMachine::HttpRequest object
+   # http_req.response_header.status
+   # ...
+ end

- FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+ # called after the HTTP request fails; passed via the errback parameter
+ def err_back(task_struct, http_req)
+   # ...
+ end
  ```

- ### License
+ ## License

  (MIT License) - Copyright (c) 2016 Charles Zhang
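
As a quick illustration of the interval and max parameters documented above, this sketch crawls politely, one request at a time with a random pause; the URLs and paths are placeholders:

```ruby
require 'list_spider'

tasks = (1..5).map do |i|
  TaskStruct.new("https://example.com/page/#{i}", "pages/#{i}.html")
end

# A Range interval sleeps a random number of seconds from that Range
# between tasks; max: 1 serializes the requests.
ListSpider.get_list(tasks, interval: (1..10), max: 1)
```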
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
  class FileFilter
    # 4033
    # 920
-   def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+   def initialize(dir_pattern, size_threshold: 1000,
+                  cust_judge: nil, process_block: nil)
      @dir_pattern = dir_pattern
      @size_threshold = size_threshold
      @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
      ).start
    end

-   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+                              size_threshold: 1000, cust_judge: nil)
      result_file = File.open(save_file_name, 'wt')
      FileFilter.new(
        dir_pattern,
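
The re-wrapped signatures keep the same keywords, so existing calls are unaffected. A short usage sketch of the entry points above; the glob pattern and threshold are illustrative:

```ruby
# Report files under 300 bytes matching the glob, writing the offending
# paths to filtered_file.txt instead of deleting them.
FileFilter.check_save_result('coolshell/*', save_file_name: 'filtered_file.txt',
                             size_threshold: 300)

# Or delete them outright once the report looks right.
FileFilter.delete('coolshell/*', size_threshold: 300)
```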
data/lib/list_spider/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ListSpider
-   VERSION = '1.0.0'.freeze
+   VERSION = '2.2.0'.freeze
  end
data/lib/list_spider.rb CHANGED
@@ -4,26 +4,108 @@ require 'nokogiri'
  require 'fileutils'
  require 'set'
  require 'addressable/uri'
- require File.expand_path('../spider_helper', __FILE__)
- require File.expand_path('../file_filter', __FILE__)
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)

+ # Crawl task class
  class TaskStruct
-   def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+   # * href: request URL
+   # * local_path: local path to save the data (used as the deduplication key)
+   # * http_method: HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+   # * custom_data: custom user data
+   # * parse_method: callback for parsing the saved file; receives the TaskStruct itself
+   def initialize(href, # request URL
+                  local_path, # local path to save the data (used as the deduplication key)
+                  # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                  http_method: :get,
+                  custom_data: nil, # custom user data
+                  parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+                  # callback after a successful request; the file may not have been saved (e.g. 301, 404)
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  # http_req.response_header.status: status code
+                  # http_req.response_header: response headers
+                  # http_req.response: response body
+                  callback: nil,
+                  # callback after a failed request
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  errback: nil,
+                  stream_callback: nil, # callback for streamed response data
+                  convert_to_utf8: false, # whether to convert the response to UTF-8
+                  overwrite_exist: false, # whether to overwrite an existing file
+                  # request options
+                  redirects: 3, # number of redirects to follow
+                  keepalive: nil, # (connection reuse is not supported yet)
+                  file: nil, # path of a file to upload
+                  path: nil, # request path, useful for pipelined requests (not supported yet)
+                  query: nil, # query string, a String or a Hash
+                  body: nil, # request body, a String or a Hash
+                  head: nil, # request headers
+                  # connection options
+                  connect_timeout: 60, # connection timeout
+                  inactivity_timeout: nil, # timeout after the connection is established
+                  # SSL options
+                  # ssl: {
+                  #   :private_key_file => '/tmp/server.key',
+                  #   :cert_chain_file => '/tmp/server.crt',
+                  #   :verify_peer => false
+                  # }
+                  ssl: nil,
+                  # bind: {
+                  #   :host => '123.123.123.123', # use a specific interface for outbound request
+                  #   :port => '123'
+                  # }
+                  bind: nil,
+                  # proxy options
+                  # proxy: {
+                  #   :host => '127.0.0.1', # proxy address
+                  #   :port => 9000, # proxy port
+                  #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+                  #   :authorization => ['user', 'pass'] # proxy authorization header
+                  # }
+                  proxy: nil)
      @href = href
-     @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
      @local_path = local_path
      @http_method = http_method
-     @params = params
-     @extra_data = extra_data
+     @custom_data = custom_data
      @parse_method = parse_method
-     @header = header
+     @callback = callback
+     @errback = errback
+     @stream_callback = stream_callback
+     @convert_to_utf8 = convert_to_utf8
+     @overwrite_exist = overwrite_exist
+
+     @request_options = {
+       redirects: redirects,
+       keepalive: keepalive,
+       file: file,
+       path: path,
+       query: query,
+       body: body,
+       head: head
+     }.compact
+
+     @connection_options = {
+       connect_timeout: connect_timeout,
+       inactivity_timeout: inactivity_timeout,
+       ssl: ssl,
+       bind: bind,
+       proxy: proxy
+     }.compact
    end

-   def ==(other)
-     other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
-   end
-
-   attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+   attr_accessor :href, :local_path,
+                 :http_method,
+                 :custom_data,
+                 :request_object,
+                 :parse_method,
+                 :callback,
+                 :errback,
+                 :stream_callback,
+                 :convert_to_utf8,
+                 :overwrite_exist,
+                 :request_options,
+                 :connection_options
  end

  module ListSpider
@@ -33,33 +115,44 @@ module ListSpider
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
- @convert_to_utf8 = false
- @connection_opts = { connect_timeout: 60 }
- @overwrite_exist = false
- @max_redirects = 10
  @local_path_set = Set.new

  class << self
-   attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
+   def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+     if interval.is_a? Range
+       @random_time_range = interval
+       interval = RANDOM_TIME
+     end
+
+     @down_list = filter_list(down_list)
+     @interval = interval
+     @max = max
+     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+     @succeed_size = 0
+     @failed_size = 0

-   def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-     @connection_opts = {
-       proxy: {
-         host: proxy_addr,
-         port: proxy_port
-       }
-     }
-     @connection_opts[:proxy][:authorization] = [username, password] if username && password
+     puts "total size:#{@down_list.size}"
+     event_machine_start_list(next_task, method(:complete))
    end

-   def connect_timeout(max_connect_time)
-     @connection_opts[:connect_timeout] = max_connect_time
+   def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+     get_list([task], interval: interval, max: max)
    end

-   def set_header_option(header_option)
-     @header_option = header_option
+   def add_task(task)
+     if task.is_a? Array
+       need_down_list = filter_list(task)
+       @down_list += need_down_list
+     elsif task.is_a?TaskStruct
+       need_down_list = filter_list([task])
+       @down_list += need_down_list
+     else
+       puts "error task type:#{task.class}"
+     end
    end

+   private
+
    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
@@ -67,78 +160,47 @@ module ListSpider
      begin_time = Time.now

      for_each_proc =
-       proc do |e|
-         opt = { redirects: @max_redirects }
-         if e.header
-           opt[:head] = e.header
-         elsif defined? @header_option
-           opt[:head] = @header_option
-         end
+       proc do |task_struct|
+         http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+         http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+         task_struct.request_object = http_req

-         if e.http_method == :post
-           opt[:body] = e.params unless e.params.empty?
-           w =
-             if @connection_opts
-               EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
-             else
-               EventMachine::HttpRequest.new(e.href).post opt
-             end
-         else
-           if @connection_opts
-             opt[:query] = e.params unless e.params.empty?
-             w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
-           else
-             w = EventMachine::HttpRequest.new(e.href).get opt
-           end
-         end
-
-         e.request_object = w
+         http_req.callback do
+           s = http_req.response_header.status
+           puts "#{Time.now}, http status code: #{s}"

-         w.callback do
-           s = w.response_header.status
-           puts s
-           if s != 404
-             local_dir = File.dirname(e.local_path)
+           if s == 200
+             local_dir = File.dirname(task_struct.local_path)
              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
              begin
-               File.open(e.local_path, 'wb') do |f|
+               File.open(task_struct.local_path, 'wb') do |f|
                  f << if @convert_to_utf8 == true
-                        SpiderHelper.to_utf8(w.response)
+                        SpiderHelper.to_utf8(http_req.response)
                       else
-                        w.response
+                        http_req.response
                       end
                end
-               succeed_list << e
-             rescue StandardError => e
-               puts e
+               call_parse_method(task_struct)
+               succeed_list << task_struct
+             rescue StandardError => exception
+               puts exception
              end
            end
+           task_struct.callback.call(task_struct, http_req) if task_struct.callback
          end
-         w.errback do
-           puts "errback:#{w.response_header},retry..."
-           puts e.href
-           puts w.response_header.status

-           ret = false
-           if e.http_method == :get
-             ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
-           elsif e.http_method == :post
-             ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
-           end
+         http_req.errback do
+           puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

-           if ret
-             succeed_list << e
-           else
-             failed_list << e
-           end
+           task_struct.errback.call(task_struct, http_req) if task_struct.errback
          end

          begin
-           multi.add e.local_path, w
+           multi.add task_struct.local_path, http_req
          rescue StandardError => exception
            puts exception
-           puts e.href
-           puts e.local_path
+           puts task_struct.href
+           puts task_struct.local_path
            stop_machine
          end
        end
@@ -170,38 +232,15 @@ module ListSpider
      @down_list.shift(@max)
    end

-   def call_parse_method(e)
-     pm = e.parse_method
-     if pm
-       case pm.arity
-       when 1
-         pm.call(e.local_path)
-       when 2
-         pm.call(e.local_path, e.extra_data)
-       when 3
-         res_header = nil
-         res_header = e.request_object.response_header if e.request_object
-         pm.call(e.local_path, e.extra_data, res_header)
-       when 4
-         res_header = nil
-         res_header = e.request_object.response_header if e.request_object
-
-         req = nil
-         req = e.request_object.req if e.request_object
-
-         pm.call(e.local_path, e.extra_data, res_header, req)
-       else
-         puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
-       end
-     end
+   def call_parse_method(task_struct)
+     task_struct.parse_method.call(task_struct) if task_struct.parse_method
    end

    def complete(_multi, success_list, failed_list)
      @succeed_size += success_list.size
      @failed_size += failed_list.size
-     success_list.each do |e|
-       call_parse_method(e)
-     end
+     @succeed_list.concat(success_list)
+     @failed_list.concat(failed_list)

      todo = next_task
@@ -223,6 +262,8 @@ module ListSpider

    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run do
+       @succeed_list = []
+       @failed_list = []
        @begin_time = Time.now
        if down_list.empty?
          if callback
@@ -239,7 +280,7 @@ module ListSpider
    def filter_list(down_list)
      need_down_list = []
      down_list.each do |ts|
-       if !@overwrite_exist && File.exist?(ts.local_path)
+       if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
@@ -247,43 +288,6 @@ module ListSpider
      end
      need_down_list
    end
-
-   def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-     if interval.is_a? Range
-       @random_time_range = interval
-       interval = RANDOM_TIME
-     end
-
-     @down_list = []
-
-     need_down_list = filter_list(down_list)
-
-     @down_list += need_down_list
-     @interval = interval
-     @max = max
-     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-     @succeed_size = 0
-     @failed_size = 0
-
-     puts "total size:#{@down_list.size}"
-     event_machine_start_list(next_task, method(:complete))
-   end
-
-   def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-     get_list([task], interval: interval, max: max)
-   end
-
-   def add_task(task)
-     if task.is_a? Array
-       need_down_list = filter_list(task)
-       @down_list += need_down_list
-     elsif task.is_a?TaskStruct
-       need_down_list = filter_list([task])
-       @down_list += need_down_list
-     else
-       puts "error task type:#{task.class}"
-     end
-   end
  end

  Signal.trap('INT') do
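
The rewritten for_each_proc hands each task's two option hashes straight to em-http-request via public_send; stripped of the surrounding bookkeeping, each task reduces to roughly this sketch (URL and values are illustrative):

```ruby
require 'em-http-request'

EventMachine.run do
  # connection_options and request_options are the hashes TaskStruct
  # builds with .compact, so nil-valued keys never reach em-http-request.
  conn = EventMachine::HttpRequest.new('https://coolshell.cn/',
                                       connect_timeout: 60)
  http = conn.public_send(:get, redirects: 3)

  http.callback { puts http.response_header.status; EventMachine.stop }
  http.errback  { puts http.error; EventMachine.stop }
end
```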
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'

  module SpiderHelper
    class << self
-     def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
-       href = string_to_uri(href) if href.class == ''.class
+     def direct_http_get(href, local_path, params: nil,
+                         header: nil, convert_to_utf8: false)
+       href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

        begin
          href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
        false
      end

-     def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
-       href = string_to_uri(href) if href.class == ''.class
+     def direct_http_post(href, local_path, params,
+                          header: nil, convert_to_utf8: false)
+       href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

        begin
          req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper

      def string_to_uri(href)
        l = href
-       l.sub!('http:///', 'http://') if l.start_with?('http:///')
+       l.sub!('http:///', 'http://')
        l = Addressable::URI.parse(l)
        l.normalize!
      end
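
Dropping the start_with? guard is harmless here because String#sub! only rewrites the first match and returns nil when nothing matches, while l itself is what gets parsed; the normalization path looks roughly like this:

```ruby
require 'addressable/uri'

# Mirrors string_to_uri: collapse a malformed triple slash in place,
# then let Addressable normalize host case and percent-encoding.
l = 'http:///Example.com/a b'
l.sub!('http:///', 'http://')
uri = Addressable::URI.parse(l)
uri.normalize!
puts uri.to_s # => "http://example.com/a%20b"
```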
data/list_spider.gemspec CHANGED
@@ -1,5 +1,5 @@

- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'list_spider/version'

@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'rake', '~> 10.0'

  spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
- spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
+ spec.add_dependency 'nokogiri', '~> 1.11'
  spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
  end
data/spider_example.rb CHANGED
@@ -3,8 +3,8 @@ require 'list_spider'

  DOWNLOAD_DIR = 'coolshell/'.freeze

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
    list_group = doc.css('h2.entry-title')
    link_list = list_group.css('a')
@@ -16,8 +16,6 @@ def parse_index_item(file_name)
    end
  end

- # ListSpider.convert_to_utf8 = true
-
  # get_one is a simple function for one taskstruct situation
  ListSpider.get_one(
    TaskStruct.new(
data/spider_example_2.rb CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze

  @next_list = []

- def parse_index_item(file_name)
-   content = File.read(file_name)
+ def parse_index_item(e)
+   content = File.read(e.local_path)
    doc = Nokogiri::HTML(content)
    list_group = doc.css('h2.entry-title')
    link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: list_spider
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 2.2.0
  platform: ruby
  authors:
  - Charles Zhang
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-01-29 00:00:00.000000000 Z
+ date: 2019-09-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -64,20 +64,14 @@ dependencies:
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-       version: '1.6'
-   - - ">="
-     - !ruby/object:Gem::Version
-       version: 1.6.7
+       version: '1.11'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.6'
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: 1.6.7
+         version: '1.11'
  - !ruby/object:Gem::Dependency
    name: rchardet
    requirement: !ruby/object:Gem::Requirement
@@ -106,8 +100,11 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".rdoc_options"
  - ".rubocop.yml"
+ - English_README.md
  - Gemfile
+ - Gemfile.lock
  - README.md
  - Rakefile
  - bin/console
@@ -139,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.3
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: List Spider