list_spider 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rdoc_options +23 -0
- data/.rubocop.yml +2 -2
- data/English_README.md +169 -0
- data/Gemfile.lock +1 -1
- data/README.md +124 -129
- data/lib/list_spider.rb +48 -44
- data/lib/list_spider/version.rb +1 -1
- data/list_spider.gemspec +2 -2
- data/spider_example.rb +2 -4
- data/spider_example_2.rb +2 -2
- metadata +7 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2af55a6c3577dc734aa7ee545cef217059abfc7be4724eaac9cf94126b869b0e
|
4
|
+
data.tar.gz: 48e8f116b91e36613b05958f173a1bc168c0c6daa163fd137266515c3a19c2b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 778ae0918059fd2edea3a02081cf479d054521f216afa789f4d9131708b8339486f39b9f7603e303de91f4c03b1bb7ebf30e6b45ac0921fe0c29640743df9e5d
|
7
|
+
data.tar.gz: bcfc6df857085630faf802f3cff9d21653c2d8ced9b2595a3bc92a8093d8883cd470132b770801bc2e4977ad17e5f82aea726a2ad600ab5d1560150dede7c20f
|
data/.rdoc_options
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
|
|
18
18
|
Enabled: false
|
19
19
|
Lint/AmbiguousRegexpLiteral:
|
20
20
|
Enabled: false
|
21
|
-
|
21
|
+
Layout/DefEndAlignment:
|
22
22
|
AutoCorrect: true
|
23
|
-
|
23
|
+
Layout/EndAlignment:
|
24
24
|
AutoCorrect: true
|
25
25
|
Style/BracesAroundHashParameters:
|
26
26
|
Enabled: false
|
data/English_README.md
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A URL list spider based on em-http-request.
|
4
|
+
|
5
|
+
Often a crawl consists of fetching a list of URLs, parsing the results to extract more links, and crawling again. This gem is built for exactly that workflow.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate URL filtering (based on local path, so you can customize the deduplication behavior).
|
9
|
+
|
10
|
+
* Convert to UTF-8 support.
|
11
|
+
|
12
|
+
* Incremental crawling support (files that already exist locally are not fetched again).
|
13
|
+
|
14
|
+
* Customizable concurrency limit and interval between tasks.
|
15
|
+
|
16
|
+
* Http options support.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a simple function for one taskstruct situation
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback methods form
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is a EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
### License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,186 +1,181 @@
|
|
1
|
-
# list_spider
|
1
|
+
# 关于list_spider
|
2
2
|
|
3
|
-
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
4
|
|
5
|
-
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
6
|
|
7
|
-
##
|
8
|
-
*
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
9
|
|
10
|
-
*
|
10
|
+
* 支持UTF-8编码转换。
|
11
11
|
|
12
|
-
*
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
13
|
|
14
|
-
*
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
15
|
|
16
|
-
*
|
16
|
+
* 支持http所有选项设置。
|
17
17
|
|
18
|
-
##
|
18
|
+
## 开始
|
19
19
|
|
20
|
-
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
21
29
|
|
22
|
-
##
|
30
|
+
## 使用方法
|
23
31
|
```ruby
|
24
32
|
require 'list_spider'
|
25
33
|
|
26
|
-
DOWNLOAD_DIR = 'coolshell/'
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
27
35
|
|
28
|
-
|
36
|
+
@next_list = []
|
29
37
|
|
30
|
-
def parse_index_item(
|
31
|
-
content = File.read(
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
32
40
|
doc = Nokogiri::HTML(content)
|
33
|
-
list_group = doc.css(
|
34
|
-
link_list = list_group.css(
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
35
43
|
|
36
44
|
link_list.each do |link|
|
37
45
|
href = link['href']
|
38
|
-
local_path = DOWNLOAD_DIR + link.content +
|
39
|
-
#
|
40
|
-
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
41
49
|
end
|
42
50
|
end
|
43
51
|
|
44
52
|
task_list = []
|
45
|
-
task_list << TaskStruct.new(
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
46
58
|
|
47
59
|
ListSpider.get_list(task_list)
|
48
|
-
ListSpider.get_list(
|
49
|
-
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
50
61
|
```
|
51
62
|
|
52
|
-
##
|
63
|
+
## 或者使用更简单的一步完成
|
53
64
|
```ruby
|
54
65
|
require 'list_spider'
|
55
66
|
|
56
|
-
DOWNLOAD_DIR = 'coolshell/'
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
57
68
|
|
58
|
-
def parse_index_item(
|
59
|
-
|
60
|
-
content = File.read(file_name)
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
61
71
|
doc = Nokogiri::HTML(content)
|
62
|
-
list_group = doc.css(
|
63
|
-
link_list = list_group.css(
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
64
74
|
|
65
75
|
link_list.each do |link|
|
66
76
|
href = link['href']
|
67
|
-
local_path = DOWNLOAD_DIR + link.content +
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
68
78
|
ListSpider.add_task(TaskStruct.new(href, local_path))
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
72
|
-
#get_one
|
73
|
-
ListSpider.get_one(
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
79
91
|
```
|
80
92
|
|
81
|
-
##
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
def parse_response(file_name)
|
85
|
-
#...
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# custom_data is passed by TaskStruct's custom_data param
|
90
|
-
|
91
|
-
def parse_response(file_name, custom_data)
|
92
|
-
#...
|
93
|
-
end
|
94
|
-
|
95
|
-
|
96
|
-
# response_header is a EventMachine::HttpResponseHeader object
|
97
|
-
# you can use it like this:
|
98
|
-
# response_header.status
|
99
|
-
# response_header.cookie
|
100
|
-
# response_header['Last-Modified']
|
101
|
-
|
102
|
-
def parse_response(file_name, custom_data, response_header)
|
103
|
-
response_header.status
|
104
|
-
response_header['Last-Modified']
|
105
|
-
|
106
|
-
#...
|
107
|
-
end
|
108
|
-
|
109
|
-
# req is a EventMachine::HttpClientOptions object
|
110
|
-
# you can use it like this:
|
111
|
-
# req.body
|
112
|
-
# req.headers
|
113
|
-
# req.uri
|
114
|
-
# req.host
|
115
|
-
# req.port
|
116
|
-
def parse_response(file_name, custom_data, response_header, req)
|
117
|
-
puts req.body
|
118
|
-
puts req.headers
|
119
|
-
puts req.uri
|
120
|
-
puts req.host
|
121
|
-
puts req.port
|
122
|
-
|
123
|
-
#...
|
124
|
-
end
|
125
|
-
|
126
|
-
```
|
127
|
-
|
128
|
-
## And there are many options you can use
|
129
|
-
|
130
|
-
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
|
93
|
+
## get_list/get_one参数
|
132
94
|
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
133
98
|
|
134
|
-
|
135
|
-
|
136
|
-
ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
|
137
|
-
|
138
|
-
#sleep random time, often used in site which limit spider
|
139
|
-
ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
|
140
|
-
|
141
|
-
#set random time range
|
142
|
-
ListSpider.get_list(down_list, interval: (1..10), max: 1)
|
143
|
-
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
144
101
|
```
|
145
102
|
|
146
|
-
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
147
104
|
|
148
105
|
```ruby
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
#
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
161
|
-
#
|
162
|
-
|
163
|
-
|
164
|
-
#
|
165
|
-
|
166
|
-
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
167
156
|
```
|
168
157
|
|
169
|
-
##
|
158
|
+
## 回调函数形式
|
170
159
|
|
171
160
|
```ruby
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
177
165
|
|
178
|
-
#
|
179
|
-
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
180
172
|
|
181
|
-
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
182
177
|
```
|
183
178
|
|
184
|
-
|
179
|
+
## License
|
185
180
|
|
186
181
|
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/lib/list_spider.rb
CHANGED
@@ -4,10 +4,16 @@ require 'nokogiri'
|
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
6
|
require 'addressable/uri'
|
7
|
-
require File.expand_path('
|
8
|
-
require File.expand_path('
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
9
|
|
10
|
+
# 爬取任务类
|
10
11
|
class TaskStruct
|
12
|
+
# * href 请求链接
|
13
|
+
# * local_path 保存数据的本地路径(此路径作为去重标准)
|
14
|
+
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
|
+
# * custom_data 自定义数据
|
16
|
+
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
11
17
|
def initialize(href, # 请求链接
|
12
18
|
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
13
19
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
@@ -16,9 +22,9 @@ class TaskStruct
|
|
16
22
|
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
17
23
|
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
18
24
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
25
|
+
# http_req.response_header.status 状态码
|
26
|
+
# http_req.response_header 返回头
|
27
|
+
# http_req.response 返回体
|
22
28
|
callback: nil,
|
23
29
|
# 请求失败后的回调
|
24
30
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
@@ -26,7 +32,7 @@ class TaskStruct
|
|
26
32
|
stream_callback: nil, # 流数据处理回调
|
27
33
|
convert_to_utf8: false, # 是否转换为utf8编码
|
28
34
|
overwrite_exist: false, # 是否覆盖现有文件
|
29
|
-
#
|
35
|
+
# 请求设置
|
30
36
|
redirects: 3, # 重定向次数
|
31
37
|
keepalive: nil, # (暂不支持复用)
|
32
38
|
file: nil, # 要上传的文件路径
|
@@ -34,7 +40,7 @@ class TaskStruct
|
|
34
40
|
query: nil, # 查询字符串,可以是string或hash类型
|
35
41
|
body: nil, # 请求体,可以是string或hash类型
|
36
42
|
head: nil, # 请求头
|
37
|
-
#
|
43
|
+
# 连接设置
|
38
44
|
connect_timeout: 60, # 连接超时时间
|
39
45
|
inactivity_timeout: nil, # 连接后超时时间
|
40
46
|
# ssl设置
|
@@ -112,6 +118,41 @@ module ListSpider
|
|
112
118
|
@local_path_set = Set.new
|
113
119
|
|
114
120
|
class << self
|
121
|
+
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
122
|
+
if interval.is_a? Range
|
123
|
+
@random_time_range = interval
|
124
|
+
interval = RANDOM_TIME
|
125
|
+
end
|
126
|
+
|
127
|
+
@down_list = filter_list(down_list)
|
128
|
+
@interval = interval
|
129
|
+
@max = max
|
130
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
131
|
+
@succeed_size = 0
|
132
|
+
@failed_size = 0
|
133
|
+
|
134
|
+
puts "total size:#{@down_list.size}"
|
135
|
+
event_machine_start_list(next_task, method(:complete))
|
136
|
+
end
|
137
|
+
|
138
|
+
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
139
|
+
get_list([task], interval: interval, max: max)
|
140
|
+
end
|
141
|
+
|
142
|
+
def add_task(task)
|
143
|
+
if task.is_a? Array
|
144
|
+
need_down_list = filter_list(task)
|
145
|
+
@down_list += need_down_list
|
146
|
+
elsif task.is_a?TaskStruct
|
147
|
+
need_down_list = filter_list([task])
|
148
|
+
@down_list += need_down_list
|
149
|
+
else
|
150
|
+
puts "error task type:#{task.class}"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
|
+
|
115
156
|
def event_machine_down(link_struct_list, callback = nil)
|
116
157
|
failed_list = []
|
117
158
|
succeed_list = []
|
@@ -247,43 +288,6 @@ module ListSpider
|
|
247
288
|
end
|
248
289
|
need_down_list
|
249
290
|
end
|
250
|
-
|
251
|
-
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
252
|
-
if interval.is_a? Range
|
253
|
-
@random_time_range = interval
|
254
|
-
interval = RANDOM_TIME
|
255
|
-
end
|
256
|
-
|
257
|
-
@down_list = []
|
258
|
-
|
259
|
-
need_down_list = filter_list(down_list)
|
260
|
-
|
261
|
-
@down_list += need_down_list
|
262
|
-
@interval = interval
|
263
|
-
@max = max
|
264
|
-
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
265
|
-
@succeed_size = 0
|
266
|
-
@failed_size = 0
|
267
|
-
|
268
|
-
puts "total size:#{@down_list.size}"
|
269
|
-
event_machine_start_list(next_task, method(:complete))
|
270
|
-
end
|
271
|
-
|
272
|
-
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
273
|
-
get_list([task], interval: interval, max: max)
|
274
|
-
end
|
275
|
-
|
276
|
-
def add_task(task)
|
277
|
-
if task.is_a? Array
|
278
|
-
need_down_list = filter_list(task)
|
279
|
-
@down_list += need_down_list
|
280
|
-
elsif task.is_a?TaskStruct
|
281
|
-
need_down_list = filter_list([task])
|
282
|
-
@down_list += need_down_list
|
283
|
-
else
|
284
|
-
puts "error task type:#{task.class}"
|
285
|
-
end
|
286
|
-
end
|
287
291
|
end
|
288
292
|
|
289
293
|
Signal.trap('INT') do
|
data/lib/list_spider/version.rb
CHANGED
data/list_spider.gemspec
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
lib = File.expand_path('
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'list_spider/version'
|
5
5
|
|
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.0'
|
27
27
|
|
28
28
|
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '
|
29
|
+
spec.add_dependency 'nokogiri', '>= 1.8.5'
|
30
30
|
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
31
|
end
|
data/spider_example.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
require File.expand_path('../lib/list_spider', __FILE__)
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
@@ -16,8 +16,6 @@ def parse_index_item(e)
|
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
-
# ListSpider.convert_to_utf8 = true
|
20
|
-
|
21
19
|
# get_one is a simple function for one taskstruct situation
|
22
20
|
ListSpider.get_one(
|
23
21
|
TaskStruct.new(
|
data/spider_example_2.rb
CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
|
|
4
4
|
|
5
5
|
@next_list = []
|
6
6
|
|
7
|
-
def parse_index_item(
|
8
|
-
content = File.read(
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
9
|
doc = Nokogiri::HTML(content)
|
10
10
|
list_group = doc.css('h2.entry-title')
|
11
11
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,22 +62,16 @@ dependencies:
|
|
62
62
|
name: nokogiri
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
64
64
|
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '1.6'
|
68
65
|
- - ">="
|
69
66
|
- !ruby/object:Gem::Version
|
70
|
-
version: 1.
|
67
|
+
version: 1.8.5
|
71
68
|
type: :runtime
|
72
69
|
prerelease: false
|
73
70
|
version_requirements: !ruby/object:Gem::Requirement
|
74
71
|
requirements:
|
75
|
-
- - "~>"
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '1.6'
|
78
72
|
- - ">="
|
79
73
|
- !ruby/object:Gem::Version
|
80
|
-
version: 1.
|
74
|
+
version: 1.8.5
|
81
75
|
- !ruby/object:Gem::Dependency
|
82
76
|
name: rchardet
|
83
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,7 +100,9 @@ extensions: []
|
|
106
100
|
extra_rdoc_files: []
|
107
101
|
files:
|
108
102
|
- ".gitignore"
|
103
|
+
- ".rdoc_options"
|
109
104
|
- ".rubocop.yml"
|
105
|
+
- English_README.md
|
110
106
|
- Gemfile
|
111
107
|
- Gemfile.lock
|
112
108
|
- README.md
|
@@ -140,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
136
|
- !ruby/object:Gem::Version
|
141
137
|
version: '0'
|
142
138
|
requirements: []
|
143
|
-
|
144
|
-
rubygems_version: 2.7.3
|
139
|
+
rubygems_version: 3.0.1
|
145
140
|
signing_key:
|
146
141
|
specification_version: 4
|
147
142
|
summary: List Spider
|