list_spider 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +298 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +3 -4
    
        data/lib/list_spider/version.rb
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
            -
            module ListSpider
         | 
| 2 | 
            -
              VERSION = '2.3.0'.freeze
         | 
| 3 | 
            -
            end
         | 
| 1 | 
            +
            module ListSpider
         | 
| 2 | 
            +
              VERSION = '2.4.0'.freeze
         | 
| 3 | 
            +
            end
         | 
    
        data/lib/spider_helper.rb
    CHANGED
    
    | @@ -1,110 +1,110 @@ | |
| 1 | 
            -
            require 'rchardet'
         | 
| 2 | 
            -
            require 'net/http'
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            module SpiderHelper
         | 
| 5 | 
            -
              class << self
         | 
| 6 | 
            -
                def direct_http_get(href, local_path, params: nil,
         | 
| 7 | 
            -
                                    header: nil, convert_to_utf8: false)
         | 
| 8 | 
            -
                  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
         | 
| 9 | 
            -
             | 
| 10 | 
            -
                  begin
         | 
| 11 | 
            -
                    href.query = URI.encode_www_form(params) if params
         | 
| 12 | 
            -
                    req = Net::HTTP::Get.new(href)
         | 
| 13 | 
            -
                    header.each { |k, v| req[k] = v } if header
         | 
| 14 | 
            -
             | 
| 15 | 
            -
                    res =
         | 
| 16 | 
            -
                      Net::HTTP.start(href.hostname, href.port) do |http|
         | 
| 17 | 
            -
                        http.request(req)
         | 
| 18 | 
            -
                      end
         | 
| 19 | 
            -
             | 
| 20 | 
            -
                    if res.is_a?(Net::HTTPSuccess)
         | 
| 21 | 
            -
                      local_dir = File.dirname(local_path)
         | 
| 22 | 
            -
                      FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
         | 
| 23 | 
            -
                      content = res.body
         | 
| 24 | 
            -
                      content = to_utf8(content) if convert_to_utf8
         | 
| 25 | 
            -
                      File.write(local_path, content)
         | 
| 26 | 
            -
                      puts 'succeed'
         | 
| 27 | 
            -
                      return true
         | 
| 28 | 
            -
                    else
         | 
| 29 | 
            -
                      puts res
         | 
| 30 | 
            -
                    end
         | 
| 31 | 
            -
                  rescue StandardError => e
         | 
| 32 | 
            -
                    puts e.backtrace
         | 
| 33 | 
            -
                    puts e
         | 
| 34 | 
            -
                    false
         | 
| 35 | 
            -
                  end
         | 
| 36 | 
            -
                  false
         | 
| 37 | 
            -
                end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                def direct_http_post(href, local_path, params,
         | 
| 40 | 
            -
                                     header: nil, convert_to_utf8: false)
         | 
| 41 | 
            -
                  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
         | 
| 42 | 
            -
             | 
| 43 | 
            -
                  begin
         | 
| 44 | 
            -
                    req = Net::HTTP::Post.new(href)
         | 
| 45 | 
            -
                    req.set_form_data(params)
         | 
| 46 | 
            -
                    header.each { |k, v| req[k] = v } if header
         | 
| 47 | 
            -
             | 
| 48 | 
            -
                    res =
         | 
| 49 | 
            -
                      Net::HTTP.start(href.hostname, href.port) do |http|
         | 
| 50 | 
            -
                        http.request(req)
         | 
| 51 | 
            -
                      end
         | 
| 52 | 
            -
             | 
| 53 | 
            -
                    if res.is_a?(Net::HTTPSuccess)
         | 
| 54 | 
            -
                      local_dir = File.dirname(local_path)
         | 
| 55 | 
            -
                      FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
         | 
| 56 | 
            -
                      content = res.body
         | 
| 57 | 
            -
                      content = to_utf8(content) if convert_to_utf8
         | 
| 58 | 
            -
                      File.write(local_path, content)
         | 
| 59 | 
            -
                      puts 'succeed'
         | 
| 60 | 
            -
                      return true
         | 
| 61 | 
            -
                    else
         | 
| 62 | 
            -
                      puts res
         | 
| 63 | 
            -
                    end
         | 
| 64 | 
            -
                  rescue StandardError => e
         | 
| 65 | 
            -
                    puts e
         | 
| 66 | 
            -
                    false
         | 
| 67 | 
            -
                  end
         | 
| 68 | 
            -
                  false
         | 
| 69 | 
            -
                end
         | 
| 70 | 
            -
             | 
| 71 | 
            -
                def extract_href_last(origin_href)
         | 
| 72 | 
            -
                  origin_href.split('/')[-1]
         | 
| 73 | 
            -
                end
         | 
| 74 | 
            -
             | 
| 75 | 
            -
                def string_to_uri(href)
         | 
| 76 | 
            -
                  l = href
         | 
| 77 | 
            -
                  l.sub!('http:///', 'http://')
         | 
| 78 | 
            -
                  l = Addressable::URI.parse(l)
         | 
| 79 | 
            -
                  l.normalize!
         | 
| 80 | 
            -
                end
         | 
| 81 | 
            -
             | 
| 82 | 
            -
                BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
         | 
| 83 | 
            -
                                 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
         | 
| 84 | 
            -
                                 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
         | 
| 85 | 
            -
                                 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
         | 
| 86 | 
            -
                                 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
         | 
| 87 | 
            -
             | 
| 88 | 
            -
                # 此函数有时此判断有误,使用to_utf8函数直接转换
         | 
| 89 | 
            -
                def smart_to_utf8(str)
         | 
| 90 | 
            -
                  return str if str.encoding == Encoding::UTF_8
         | 
| 91 | 
            -
                  to_utf8(str)
         | 
| 92 | 
            -
                end
         | 
| 93 | 
            -
             | 
| 94 | 
            -
                def to_utf8(str)
         | 
| 95 | 
            -
                  # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
         | 
| 96 | 
            -
                  str.force_encoding(Encoding::ASCII_8BIT)
         | 
| 97 | 
            -
                  cd = CharDet.detect(str)
         | 
| 98 | 
            -
                  if cd['confidence'] > 0.6
         | 
| 99 | 
            -
                    puts cd['encoding']
         | 
| 100 | 
            -
                    str.force_encoding(cd['encoding'])
         | 
| 101 | 
            -
                    # 移除BOM头
         | 
| 102 | 
            -
                    bom_header = BomHeaderMap[cd['encoding']]
         | 
| 103 | 
            -
                    str.sub!(bom_header, '') if bom_header
         | 
| 104 | 
            -
                  end
         | 
| 105 | 
            -
                  str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
         | 
| 106 | 
            -
             | 
| 107 | 
            -
                  str
         | 
| 108 | 
            -
                end
         | 
| 109 | 
            -
              end
         | 
| 110 | 
            -
            end
         | 
| 1 | 
            +
            require 'rchardet'
         | 
| 2 | 
            +
            require 'net/http'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module SpiderHelper
         | 
| 5 | 
            +
              class << self
         | 
| 6 | 
            +
                def direct_http_get(href, local_path, params: nil,
         | 
| 7 | 
            +
                                    header: nil, convert_to_utf8: false)
         | 
| 8 | 
            +
                  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  begin
         | 
| 11 | 
            +
                    href.query = URI.encode_www_form(params) if params
         | 
| 12 | 
            +
                    req = Net::HTTP::Get.new(href)
         | 
| 13 | 
            +
                    header.each { |k, v| req[k] = v } if header
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                    res =
         | 
| 16 | 
            +
                      Net::HTTP.start(href.hostname, href.port) do |http|
         | 
| 17 | 
            +
                        http.request(req)
         | 
| 18 | 
            +
                      end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    if res.is_a?(Net::HTTPSuccess)
         | 
| 21 | 
            +
                      local_dir = File.dirname(local_path)
         | 
| 22 | 
            +
                      FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
         | 
| 23 | 
            +
                      content = res.body
         | 
| 24 | 
            +
                      content = to_utf8(content) if convert_to_utf8
         | 
| 25 | 
            +
                      File.write(local_path, content)
         | 
| 26 | 
            +
                      puts 'succeed'
         | 
| 27 | 
            +
                      return true
         | 
| 28 | 
            +
                    else
         | 
| 29 | 
            +
                      puts res
         | 
| 30 | 
            +
                    end
         | 
| 31 | 
            +
                  rescue StandardError => e
         | 
| 32 | 
            +
                    puts e.backtrace
         | 
| 33 | 
            +
                    puts e
         | 
| 34 | 
            +
                    false
         | 
| 35 | 
            +
                  end
         | 
| 36 | 
            +
                  false
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                def direct_http_post(href, local_path, params,
         | 
| 40 | 
            +
                                     header: nil, convert_to_utf8: false)
         | 
| 41 | 
            +
                  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                  begin
         | 
| 44 | 
            +
                    req = Net::HTTP::Post.new(href)
         | 
| 45 | 
            +
                    req.set_form_data(params)
         | 
| 46 | 
            +
                    header.each { |k, v| req[k] = v } if header
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                    res =
         | 
| 49 | 
            +
                      Net::HTTP.start(href.hostname, href.port) do |http|
         | 
| 50 | 
            +
                        http.request(req)
         | 
| 51 | 
            +
                      end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    if res.is_a?(Net::HTTPSuccess)
         | 
| 54 | 
            +
                      local_dir = File.dirname(local_path)
         | 
| 55 | 
            +
                      FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
         | 
| 56 | 
            +
                      content = res.body
         | 
| 57 | 
            +
                      content = to_utf8(content) if convert_to_utf8
         | 
| 58 | 
            +
                      File.write(local_path, content)
         | 
| 59 | 
            +
                      puts 'succeed'
         | 
| 60 | 
            +
                      return true
         | 
| 61 | 
            +
                    else
         | 
| 62 | 
            +
                      puts res
         | 
| 63 | 
            +
                    end
         | 
| 64 | 
            +
                  rescue StandardError => e
         | 
| 65 | 
            +
                    puts e
         | 
| 66 | 
            +
                    false
         | 
| 67 | 
            +
                  end
         | 
| 68 | 
            +
                  false
         | 
| 69 | 
            +
                end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                def extract_href_last(origin_href)
         | 
| 72 | 
            +
                  origin_href.split('/')[-1]
         | 
| 73 | 
            +
                end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                def string_to_uri(href)
         | 
| 76 | 
            +
                  l = href
         | 
| 77 | 
            +
                  l.sub!('http:///', 'http://')
         | 
| 78 | 
            +
                  l = Addressable::URI.parse(l)
         | 
| 79 | 
            +
                  l.normalize!
         | 
| 80 | 
            +
                end
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
         | 
| 83 | 
            +
                                 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
         | 
| 84 | 
            +
                                 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
         | 
| 85 | 
            +
                                 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
         | 
| 86 | 
            +
                                 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                # 此函数有时此判断有误,使用to_utf8函数直接转换
         | 
| 89 | 
            +
                def smart_to_utf8(str)
         | 
| 90 | 
            +
                  return str if str.encoding == Encoding::UTF_8
         | 
| 91 | 
            +
                  to_utf8(str)
         | 
| 92 | 
            +
                end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                def to_utf8(str)
         | 
| 95 | 
            +
                  # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
         | 
| 96 | 
            +
                  str.force_encoding(Encoding::ASCII_8BIT)
         | 
| 97 | 
            +
                  cd = CharDet.detect(str)
         | 
| 98 | 
            +
                  if cd['confidence'] > 0.6
         | 
| 99 | 
            +
                    puts cd['encoding']
         | 
| 100 | 
            +
                    str.force_encoding(cd['encoding'])
         | 
| 101 | 
            +
                    # 移除BOM头
         | 
| 102 | 
            +
                    bom_header = BomHeaderMap[cd['encoding']]
         | 
| 103 | 
            +
                    str.sub!(bom_header, '') if bom_header
         | 
| 104 | 
            +
                  end
         | 
| 105 | 
            +
                  str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                  str
         | 
| 108 | 
            +
                end
         | 
| 109 | 
            +
              end
         | 
| 110 | 
            +
            end
         | 
    
        data/list_spider.gemspec
    CHANGED
    
    | @@ -1,31 +1,31 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
            lib = File.expand_path('lib', __dir__)
         | 
| 3 | 
            -
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         | 
| 4 | 
            -
            require 'list_spider/version'
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            Gem::Specification.new do |spec|
         | 
| 7 | 
            -
              spec.name          = 'list_spider'
         | 
| 8 | 
            -
              spec.version       = ListSpider::VERSION
         | 
| 9 | 
            -
              spec.authors       = ['Charles Zhang']
         | 
| 10 | 
            -
              spec.email         = ['gis05zc@163.com']
         | 
| 11 | 
            -
             | 
| 12 | 
            -
              spec.summary       = 'List Spider'
         | 
| 13 | 
            -
              spec.description   = 'A url list spider based on em-http-request.'
         | 
| 14 | 
            -
              spec.homepage      = 'https://github.com/chinazhangchao/list_spider'
         | 
| 15 | 
            -
              spec.license = 'MIT'
         | 
| 16 | 
            -
             | 
| 17 | 
            -
              spec.files =
         | 
| 18 | 
            -
                `git ls-files -z`.split("\x0").reject do |f|
         | 
| 19 | 
            -
                  f.match(%r{^(test|spec|features)/})
         | 
| 20 | 
            -
                end
         | 
| 21 | 
            -
              spec.bindir        = 'exe'
         | 
| 22 | 
            -
              spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
         | 
| 23 | 
            -
              spec.require_paths = ['lib']
         | 
| 24 | 
            -
             | 
| 25 | 
            -
              spec.add_development_dependency 'bundler', '~> 1.16'
         | 
| 26 | 
            -
              spec.add_development_dependency 'rake', '~> 10.0'
         | 
| 27 | 
            -
             | 
| 28 | 
            -
              spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
         | 
| 29 | 
            -
              spec.add_dependency 'nokogiri', '~> 1.10'
         | 
| 30 | 
            -
              spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
         | 
| 31 | 
            -
            end
         | 
| 1 | 
            +
             | 
| 2 | 
            +
            lib = File.expand_path('lib', __dir__)
         | 
| 3 | 
            +
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         | 
| 4 | 
            +
            require 'list_spider/version'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            Gem::Specification.new do |spec|
         | 
| 7 | 
            +
              spec.name          = 'list_spider'
         | 
| 8 | 
            +
              spec.version       = ListSpider::VERSION
         | 
| 9 | 
            +
              spec.authors       = ['Charles Zhang']
         | 
| 10 | 
            +
              spec.email         = ['gis05zc@163.com']
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              spec.summary       = 'List Spider'
         | 
| 13 | 
            +
              spec.description   = 'A url list spider based on em-http-request.'
         | 
| 14 | 
            +
              spec.homepage      = 'https://github.com/chinazhangchao/list_spider'
         | 
| 15 | 
            +
              spec.license = 'MIT'
         | 
| 16 | 
            +
             | 
| 17 | 
            +
              spec.files =
         | 
| 18 | 
            +
                `git ls-files -z`.split("\x0").reject do |f|
         | 
| 19 | 
            +
                  f.match(%r{^(test|spec|features)/})
         | 
| 20 | 
            +
                end
         | 
| 21 | 
            +
              spec.bindir        = 'exe'
         | 
| 22 | 
            +
              spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
         | 
| 23 | 
            +
              spec.require_paths = ['lib']
         | 
| 24 | 
            +
             | 
| 25 | 
            +
              spec.add_development_dependency 'bundler', '~> 1.16'
         | 
| 26 | 
            +
              spec.add_development_dependency 'rake', '~> 10.0'
         | 
| 27 | 
            +
             | 
| 28 | 
            +
              spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
         | 
| 29 | 
            +
              spec.add_dependency 'nokogiri', '~> 1.10'
         | 
| 30 | 
            +
              spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
         | 
| 31 | 
            +
            end
         | 
    
        data/spider_example.rb
    CHANGED
    
    | @@ -1,27 +1,27 @@ | |
| 1 | 
            -
            require 'list_spider'
         | 
| 2 | 
            -
            # require File.expand_path('../lib/list_spider', __FILE__)
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            DOWNLOAD_DIR = 'coolshell/'.freeze
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            def parse_index_item(e)
         | 
| 7 | 
            -
              content = File.read(e.local_path)
         | 
| 8 | 
            -
              doc = Nokogiri::HTML(content)
         | 
| 9 | 
            -
              list_group = doc.css('h2.entry-title')
         | 
| 10 | 
            -
              link_list = list_group.css('a')
         | 
| 11 | 
            -
             | 
| 12 | 
            -
              link_list.each do |link|
         | 
| 13 | 
            -
                href = link['href']
         | 
| 14 | 
            -
                local_path = DOWNLOAD_DIR + link.content + '.html'
         | 
| 15 | 
            -
                ListSpider.add_task(TaskStruct.new(href, local_path))
         | 
| 16 | 
            -
              end
         | 
| 17 | 
            -
            end
         | 
| 18 | 
            -
             | 
| 19 | 
            -
            # get_one is a simple function for one taskstruct situation
         | 
| 20 | 
            -
            ListSpider.get_one(
         | 
| 21 | 
            -
              TaskStruct.new(
         | 
| 22 | 
            -
                'https://coolshell.cn/',
         | 
| 23 | 
            -
                DOWNLOAD_DIR + 'index.html',
         | 
| 24 | 
            -
                parse_method: method(:parse_index_item)
         | 
| 25 | 
            -
              ),
         | 
| 26 | 
            -
              max: 60
         | 
| 27 | 
            -
            )
         | 
| 1 | 
            +
            require 'list_spider'
         | 
| 2 | 
            +
            # require File.expand_path('../lib/list_spider', __FILE__)
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            DOWNLOAD_DIR = 'coolshell/'.freeze
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            def parse_index_item(e)
         | 
| 7 | 
            +
              content = File.read(e.local_path)
         | 
| 8 | 
            +
              doc = Nokogiri::HTML(content)
         | 
| 9 | 
            +
              list_group = doc.css('h2.entry-title')
         | 
| 10 | 
            +
              link_list = list_group.css('a')
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              link_list.each do |link|
         | 
| 13 | 
            +
                href = link['href']
         | 
| 14 | 
            +
                local_path = DOWNLOAD_DIR + link.content + '.html'
         | 
| 15 | 
            +
                ListSpider.add_task(TaskStruct.new(href, local_path))
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
            end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            # get_one is a simple function for one taskstruct situation
         | 
| 20 | 
            +
            ListSpider.get_one(
         | 
| 21 | 
            +
              TaskStruct.new(
         | 
| 22 | 
            +
                'https://coolshell.cn/',
         | 
| 23 | 
            +
                DOWNLOAD_DIR + 'index.html',
         | 
| 24 | 
            +
                parse_method: method(:parse_index_item)
         | 
| 25 | 
            +
              ),
         | 
| 26 | 
            +
              max: 60
         | 
| 27 | 
            +
            )
         | 
    
        data/spider_example_2.rb
    CHANGED
    
    | @@ -1,29 +1,29 @@ | |
| 1 | 
            -
            require 'list_spider'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            DOWNLOAD_DIR = 'coolshell/'.freeze
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            @next_list = []
         | 
| 6 | 
            -
             | 
| 7 | 
            -
            def parse_index_item(e)
         | 
| 8 | 
            -
              content = File.read(e.local_path)
         | 
| 9 | 
            -
              doc = Nokogiri::HTML(content)
         | 
| 10 | 
            -
              list_group = doc.css('h2.entry-title')
         | 
| 11 | 
            -
              link_list = list_group.css('a')
         | 
| 12 | 
            -
             | 
| 13 | 
            -
              link_list.each do |link|
         | 
| 14 | 
            -
                href = link['href']
         | 
| 15 | 
            -
                local_path = DOWNLOAD_DIR + link.content + '.html'
         | 
| 16 | 
            -
                # or you can save them to database for later use
         | 
| 17 | 
            -
                @next_list << TaskStruct.new(href, local_path)
         | 
| 18 | 
            -
              end
         | 
| 19 | 
            -
            end
         | 
| 20 | 
            -
             | 
| 21 | 
            -
            task_list = []
         | 
| 22 | 
            -
            task_list << TaskStruct.new(
         | 
| 23 | 
            -
              'https://coolshell.cn/',
         | 
| 24 | 
            -
              DOWNLOAD_DIR + 'index.html',
         | 
| 25 | 
            -
              parse_method: method(:parse_index_item)
         | 
| 26 | 
            -
            )
         | 
| 27 | 
            -
             | 
| 28 | 
            -
            ListSpider.get_list(task_list)
         | 
| 29 | 
            -
            ListSpider.get_list(@next_list, max: 60)
         | 
| 1 | 
            +
            require 'list_spider'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            DOWNLOAD_DIR = 'coolshell/'.freeze
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            @next_list = []
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            def parse_index_item(e)
         | 
| 8 | 
            +
              content = File.read(e.local_path)
         | 
| 9 | 
            +
              doc = Nokogiri::HTML(content)
         | 
| 10 | 
            +
              list_group = doc.css('h2.entry-title')
         | 
| 11 | 
            +
              link_list = list_group.css('a')
         | 
| 12 | 
            +
             | 
| 13 | 
            +
              link_list.each do |link|
         | 
| 14 | 
            +
                href = link['href']
         | 
| 15 | 
            +
                local_path = DOWNLOAD_DIR + link.content + '.html'
         | 
| 16 | 
            +
                # or you can save them to database for later use
         | 
| 17 | 
            +
                @next_list << TaskStruct.new(href, local_path)
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
            end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            task_list = []
         | 
| 22 | 
            +
            task_list << TaskStruct.new(
         | 
| 23 | 
            +
              'https://coolshell.cn/',
         | 
| 24 | 
            +
              DOWNLOAD_DIR + 'index.html',
         | 
| 25 | 
            +
              parse_method: method(:parse_index_item)
         | 
| 26 | 
            +
            )
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            ListSpider.get_list(task_list)
         | 
| 29 | 
            +
            ListSpider.get_list(@next_list, max: 60)
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: list_spider
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 2.3.0
         | 
| 4 | 
            +
              version: 2.4.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Charles Zhang
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020- | 
| 11 | 
            +
            date: 2020-03-06 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: bundler
         | 
| @@ -136,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 136 136 | 
             
                - !ruby/object:Gem::Version
         | 
| 137 137 | 
             
                  version: '0'
         | 
| 138 138 | 
             
            requirements: []
         | 
| 139 | 
            -
             | 
| 140 | 
            -
            rubygems_version: 2.7.6
         | 
| 139 | 
            +
            rubygems_version: 3.0.3
         | 
| 141 140 | 
             
            signing_key: 
         | 
| 142 141 | 
             
            specification_version: 4
         | 
| 143 142 | 
             
            summary: List Spider
         |