list_spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0f897ec31adc2d2a6c9713729030e8decc85cb9d
+   data.tar.gz: c81ed9cab5fc1bbd4395aeb9692924887042c27f
+ SHA512:
+   metadata.gz: 907529b23336256e2c72232ec2b413c50ebee99df49695bac368ae4dc029afaa81424fe542c63d6c7247714964382e2eaeaae0ea34ab51658e34ce67e5c7b9e7
+   data.tar.gz: 22dff7b012b1c12f8cefb50606e0a9b0874c5b013a305c268ae2a950374760ba77ed24af075f6ca5217e128d905b31bae754107e0cf063e0d90171dd51b94f20
lib/delete_unvalid.rb ADDED
@@ -0,0 +1,40 @@
+
+ class DeleteUnvalid
+   # 4033
+   # 920
+   def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     @dir_pattern = dir_pattern
+     @size_threshold = size_threshold
+     if cust_judge
+       @cust_judge = cust_judge
+     else
+       @cust_judge = method(:default_judge)
+     end
+     @total = 0
+   end
+
+   # default judge: treat files at or below the size threshold as invalid
+   def default_judge(f)
+     File.size(f) <= @size_threshold
+   end
+
+   def delete_unvalid(f)
+     if @cust_judge.call(f)
+       @total += 1
+       puts "deleted file: #{f}"
+       File.delete(f)
+     end
+   end
+
+   def start
+     Dir.glob(@dir_pattern) do |f|
+       delete_unvalid(f)
+     end
+     puts "delete total:#{@total}"
+   end
+
+   def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
+   end
+
+ end
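
For orientation, a minimal usage sketch of the class above; the glob pattern is hypothetical, and cust_judge may be any callable that returns true for files that should be deleted:

    require 'list_spider'

    # remove downloaded pages at or below the default 1000-byte threshold
    DeleteUnvalid.delete('download/**/*.html')

    # or judge by content instead of size: delete only empty files
    DeleteUnvalid.delete('download/**/*.html', cust_judge: ->(f) { File.zero?(f) })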
lib/list_spider.rb ADDED
@@ -0,0 +1,73 @@
+ require File.expand_path('../spider_base', __FILE__)
+ require File.expand_path('../delete_unvalid', __FILE__)
+
+ class ListSpider
+
+   RANDOM_TIME = -1
+   NO_LIMIT_CONCURRENT = -1
+
+   # class-level instance variable, so that the accessor defined in the
+   # singleton class below reads and writes this same value
+   @random_time_range = 3..10
+
+   include SpiderBase
+
+   def initialize(down_list, inter_val: 0, max: 30)
+     @down_list = down_list
+     @inter_val = inter_val
+     @max = max
+     @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+     @succeed_size = 0
+     @failed_size = 0
+   end
+
+   attr_reader :succeed_size, :failed_size
+
+   class << self
+     attr_accessor :random_time_range
+   end
+
+   def add_task(task)
+     if task.is_a?(Array)
+       @down_list += task
+     elsif task.is_a?(TaskStruct)
+       @down_list << task
+     else
+       puts "error task type:#{task.class}"
+     end
+   end
+
+   def complete(multi, success_list, failed_list)
+     @succeed_size += success_list.size
+     @failed_size += failed_list.size
+     success_list.each do |e|
+       e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+     end
+
+     todo = @down_list.slice!(0, @max)
+     if todo.empty?
+       puts "success size:#{@succeed_size}"
+       puts "failed size:#{@failed_size}"
+       EventMachine.stop
+     else
+       if @inter_val != 0 && (success_list.size != 0 || failed_list.size != 0)
+         if @inter_val == RANDOM_TIME
+           sleep(rand(self.class.random_time_range))
+         else
+           sleep(@inter_val)
+         end
+       end
+       batch_down_list(todo, method(:complete))
+     end
+   end
+
+   def start
+     puts "total size:#{@down_list.size}"
+     event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
+   end
+
+   def self.get_list(down_list, inter_val: 0, max: 30)
+     ListSpider.new(down_list, inter_val: inter_val, max: max).start
+   end
+
+ end
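
A minimal usage sketch of ListSpider, assuming the hypothetical URLs and paths below; TaskStruct comes from spider_base.rb, and the parse callback receives the saved path, the task's extra_data, and the spider itself:

    require 'list_spider'

    parse = lambda do |local_path, extra_data, spider|
      puts "saved #{local_path} for item #{extra_data}"
    end

    list = (1..5).map do |i|
      TaskStruct.new("http://example.com/page#{i}.html",
                     "download/page#{i}.html",
                     extra_data: i, parse_method: parse)
    end

    # at most 30 concurrent downloads, sleeping 1 second between batches
    ListSpider.get_list(list, inter_val: 1, max: 30)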
lib/spider_base.rb ADDED
@@ -0,0 +1,272 @@
+ require 'em-http-request'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'set'
+ require File.expand_path('../spider_helper', __FILE__)
+ require "addressable/uri"
+
+ class TaskStruct
+   def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+     @href = href
+     @href = SpiderHelper.string_to_uri(@href) if @href.is_a?(String)
+     @local_path = local_path
+     @http_method = http_method
+     @params = params
+     @extra_data = extra_data
+     @parse_method = parse_method
+   end
+
+   def ==(o)
+     o.class == self.class && o.href == href && o.local_path == local_path &&
+       o.http_method == http_method && o.params == params && o.extra_data == extra_data
+   end
+
+   attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method
+
+ end
+
+ module SpiderBase
+
+   # class-level instance variables, so that the attr_accessor in the
+   # singleton class below reads and writes this same state
+   @conver_to_utf8 = false
+   @connection_opts = { connect_timeout: 2 * 60 }
+   @overwrite_exist = false
+   @max_redirects = 10
+
+   class << self
+
+     attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
+
+     def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+       @connection_opts = {
+         proxy: {
+           host: proxy_addr,
+           port: proxy_port
+         }
+       }
+       @connection_opts[:proxy][:authorization] = [username, password] if username && password
+     end
+
+     def connect_timeout(max_connect_time)
+       @connection_opts[:connect_timeout] = max_connect_time
+     end
+
+     def set_header_option(header_option)
+       @header_option = header_option
+     end
+
+     def event_machine_down(link_struct_list, callback = nil)
+       failed_list = []
+       succeed_list = []
+       multi = EventMachine::MultiRequest.new
+       no_job = true
+       begin_time = Time.now
+
+       for_each_proc = proc do |e|
+         if !@overwrite_exist && File.exist?(e.local_path)
+           succeed_list << e
+         else
+           no_job = false
+           opt = { redirects: @max_redirects }
+           opt[:head] = @header_option if @header_option
+           if e.http_method == :post
+             opt[:body] = e.params unless e.params.empty?
+             w = EventMachine::HttpRequest.new(e.href, @connection_opts).post(opt)
+           else
+             opt[:query] = e.params unless e.params.empty?
+             w = EventMachine::HttpRequest.new(e.href, @connection_opts).get(opt)
+           end
+
+           w.callback {
+             s = w.response_header.status
+             puts s
+             if s == 403 || s == 502 # Forbidden
+               # EventMachine.stop
+             elsif s != 404
+               local_dir = File.dirname(e.local_path)
+               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+               begin
+                 File.open(e.local_path, "w") do |f|
+                   if @conver_to_utf8 == true
+                     f << SpiderHelper.to_utf8(w.response)
+                   else
+                     f << w.response
+                   end
+                 end
+                 succeed_list << e
+               rescue Exception => ex # use ex so the task variable e is not shadowed
+                 puts ex
+               end
+             end
+           }
+           w.errback {
+             puts "errback:#{w.response_header}"
+             puts e.href
+             puts w.response_header.status
+             failed_list << e
+             # fall back to a plain Net::HTTP download
+             if e.http_method == :get
+               SpiderHelper.direct_http_get(e.href, e.local_path)
+             elsif e.http_method == :post
+               SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+             end
+           }
+           multi.add(e.local_path, w)
+         end
+       end
+
+       cb = Proc.new do
+         end_time = Time.now
+         puts "use time:#{end_time - begin_time} seconds"
+         if callback.nil?
+           puts "success size:#{succeed_list.size}"
+           puts "failed size:#{failed_list.size}"
+           EventMachine.stop
+         else
+           callback.call(multi, succeed_list, failed_list)
+         end
+       end
+
+       after_proc = proc {
+         if no_job # nothing was queued, so invoke the callback directly
+           cb.call
+         else
+           multi.callback(&cb)
+         end
+       }
+
+       link_struct_list.each(&for_each_proc)
+       after_proc.call
+     end
+
+     def event_machine_start(url, down_dir, file_name, callback = nil)
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+       down_list = []
+       down_list << TaskStruct.new(url, down_dir + file_name)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+     def event_machine_start_list(down_list, callback = nil)
+       EventMachine.run {
+         event_machine_down(down_list, callback)
+       }
+     end
+
+   end # self end
+ end # SpiderBase end
+
+ def batch_down_list(down_list, callback = nil)
+   SpiderBase.event_machine_down(down_list, callback)
+ end
+
+ def event_machine_start_list(down_list, callback = nil)
+   SpiderBase.event_machine_start_list(down_list, callback)
+ end
+
+ def parse_down_load_url(url, down_dir, file_name, callback = nil)
+   SpiderBase.event_machine_start(url, down_dir, file_name, callback)
+ end
+
+ class GetRelative
+
+   def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
+     @get_depth = get_depth
+     @base_url = base_url
+     @down_dir = down_dir
+     @suffix = suffix
+   end
+
+   def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     puts "get index complete"
+     if succeed_list.size > 0
+       link_list = []
+       succeed_list.each do |e|
+         doc = Nokogiri::HTML(File.open(e.local_path))
+         link_list.concat(doc.css("a"))
+       end
+       puts "extract href complete"
+
+       down_dir << "/" unless down_dir.end_with?("/")
+       FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
+
+       down_list = []
+       set_list = Set.new
+       link_list.each do |link|
+         href = link['href']
+         next if href.nil? || !href.include?(@suffix)
+         # keep everything up to the suffix, e.g. "scheme_2.html#SEC15"
+         href = href[0, href.index(@suffix) + @suffix.length]
+         # strip a leading "./", e.g. "./preface.html"
+         href = href[2..-1] if href.start_with?("./")
+
+         next if !set_list.add?(href)
+         unless base_url.end_with?("/")
+           i = base_url.rindex("/")
+           base_url = base_url[0..i]
+         end
+
+         # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
+         next if href.start_with?("http:") || href.start_with?("https:")
+
+         local_path = down_dir + href
+
+         down_list.push(TaskStruct.new(base_url + href, local_path))
+       end
+       puts "down list complete,size:#{down_list.size}"
+       batch_down_list(down_list, callback)
+     end
+   end
+
+   def down_other_node(multi, succeed_list, failed_list)
+     puts "down_other_node"
+     @get_depth = @get_depth - 1
+     puts "depth:#{@get_depth}"
+     if @get_depth <= 0
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
+     else
+       down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
+     end
+   end
+
+   def event_all_complete(multi, succeed_list, failed_list)
+     puts "all complete"
+     puts "success"
+     puts succeed_list.size
+     puts "error"
+     puts failed_list.size
+     puts failed_list
+     EventMachine.stop
+   end
+
+   attr_writer :get_depth, :base_url, :down_dir
+
+   def start
+     index_file_name = "index.html"
+     # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
+     unless @base_url.end_with?("/")
+       i = @base_url.rindex("/")
+       index_file_name = @base_url[i + 1..-1]
+     end
+
+     @get_depth = @get_depth - 1
+     puts @get_depth
+     if @get_depth <= 0
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
+     else
+       parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
+     end
+   end
+
+   def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
+     GetRelative.new(base_url, down_dir, get_depth, suffix).start
+   end
+ end # GetRelative
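
A minimal end-to-end sketch of the pieces above, with a hypothetical URL and download directory; GetRelative at depth 2 fetches the index page and then every relative link on it ending in .html:

    require 'list_spider'

    SpiderBase.overwrite_exist = false        # skip files already on disk
    SpiderBase.connect_timeout(60)            # seconds
    # SpiderBase.set_proxy('127.0.0.1', 8080) # optional HTTP proxy

    GetRelative.Get('http://example.com/book/index.html', 'download/book', 2)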
lib/spider_helper.rb ADDED
@@ -0,0 +1,103 @@
+ require 'rchardet'
+ require 'net/http'
+ require 'fileutils'
+ require 'addressable/uri'
+
+ module SpiderHelper
+
+   class << self
+
+     def direct_http_get(href, local_path, params: nil, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         href.query = URI.encode_www_form(params) if params
+         req = Net::HTTP::Get.new(href)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+           puts "succeed"
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e.backtrace
+         puts e
+       end
+     end
+
+     def direct_http_post(href, local_path, params, header: nil)
+       href = string_to_uri(href) if href.is_a?(String)
+
+       begin
+         req = Net::HTTP::Post.new(href)
+         req.set_form_data(params)
+         header.each { |k, v| req[k] = v } if header
+
+         res = Net::HTTP.start(href.hostname, href.port) do |http|
+           http.request(req)
+         end
+
+         if res.is_a?(Net::HTTPSuccess)
+           local_dir = File.dirname(local_path)
+           FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+           File.write(local_path, res.body)
+         else
+           puts res
+         end
+       rescue Exception => e
+         puts e
+       end
+     end
+
+     def extract_href_last(origin_href)
+       origin_href.split('/')[-1]
+     end
+
+     def string_to_uri(href)
+       l = href.dup
+       l.sub!('http:///', 'http://') if l.start_with?('http:///')
+       l = Addressable::URI.parse(l)
+       l.normalize!
+     end
+
+     BomHeaderMap = { "UTF-8"    => "\xEF\xBB\xBF".force_encoding("UTF-8"),
+                      "UTF-16BE" => "\xFE\xFF".force_encoding("UTF-16BE"),
+                      "UTF-16LE" => "\xFF\xFE".force_encoding("UTF-16LE"),
+                      "UTF-32BE" => "\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
+                      "UTF-32LE" => "\xFF\xFE\x00\x00".force_encoding("UTF-32LE") }
+
+     # this encoding check is sometimes wrong; call to_utf8 directly
+     # to force the conversion
+     def smart_to_utf8(str)
+       return str if str.encoding == Encoding::UTF_8
+       to_utf8(str)
+     end
+
+     def to_utf8(str)
+       # on Windows the CharDet library returns ASCII-8BIT strings that are
+       # not UTF-8 compatible, so detect the encoding from the raw bytes
+       str.force_encoding(Encoding::ASCII_8BIT)
+       cd = CharDet.detect(str)
+       if cd["confidence"] > 0.6
+         puts cd["encoding"]
+         str.force_encoding(cd["encoding"])
+         # strip the BOM if one is present
+         bom_header = BomHeaderMap[cd["encoding"]]
+         str.sub!(bom_header, "") if bom_header
+       end
+       str.encode!(Encoding::UTF_8, undef: :replace, replace: "?", invalid: :replace)
+
+       str
+     end
+
+   end
+
+ end
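
The helpers above can also be used standalone; a small sketch with a hypothetical URL and paths (direct_http_get is the same synchronous fallback that the errback in spider_base.rb relies on):

    require 'list_spider'

    uri = SpiderHelper.string_to_uri('http://example.com/some page.html')
    SpiderHelper.direct_http_get(uri, 'download/some_page.html',
                                 header: { 'User-Agent' => 'list_spider' })

    # force the saved bytes to UTF-8, detecting the source encoding
    text = SpiderHelper.to_utf8(File.binread('download/some_page.html'))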
metadata ADDED
@@ -0,0 +1,88 @@
+ --- !ruby/object:Gem::Specification
+ name: list_spider
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Charles Zhang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-04-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: em-http-request
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.3
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.7
+ description: A url list spider based on em-http-request.
+ email: gis05zc@163.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/delete_unvalid.rb
+ - lib/list_spider.rb
+ - lib/spider_base.rb
+ - lib/spider_helper.rb
+ homepage: https://github.com/chinazhangchao/list_spider
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.6.4
+ signing_key:
+ specification_version: 4
+ summary: List Spider
+ test_files: []
+ has_rdoc: